In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the dataset from a CSV file
diabetes_data = pd.read_csv('diabetes.csv')

# Display basic information about the dataset
dataset_id = 'Dataset'
num_features = len(diabetes_data.columns) - 1  # Exclude the target column
num_instances = len(diabetes_data)
missing_values = diabetes_data.isnull().sum().sum()
outliers = (np.abs(diabetes_data.drop('Outcome', axis=1) - diabetes_data.drop('Outcome', axis=1).mean()) > 3).sum().sum()
feature_correlations = diabetes_data.corr()['Outcome'].abs().sort_values(ascending=False)
data_type = diabetes_data.dtypes.unique()

print("Dataset ID:", dataset_id)
print("No. of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Feature Correlations:")
print(feature_correlations)
print("Data Type:", data_type)

# Drop instances with missing values
diabetes_data.dropna(inplace=True)

# Split the dataset into features and target
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data['Outcome']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the decision tree classifier
decision_tree = DecisionTreeClassifier(max_depth=5, min_samples_split=2)
decision_tree.fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt) * 100

# Train and evaluate the random forest classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf) * 100

# Train and evaluate the logistic regression classifier
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr) * 100

# Train and evaluate the support vector machine classifier
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm) * 100

# Train and evaluate the naive bayes classifier
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
y_pred_nb = naive_bayes.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb) * 100

# Train and evaluate the k-nearest neighbors classifier
knn = KNeighborsClassifier(n_neighbors=3)  # Set n_neighbors to a lower value
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn) * 100

# Compare the accuracies of different algorithms
accuracies = {
    'Decision Tree': accuracy_dt,
    'Random Forest': accuracy_rf,
    'Logistic Regression': accuracy_lr,
    'Support Vector Machine': accuracy_svm,
    'Naive Bayes': accuracy_nb,
    'K-Nearest Neighbors': accuracy_knn
}

best_algorithm = max(accuracies, key=accuracies.get)

# Display the results
print("\nAccuracy Scores:")
for algorithm, accuracy in accuracies.items():
    print(algorithm, ":", "{:.2f}%".format(accuracy))

print("\nBest Algorithm:", best_algorithm)

# Display the head of the diabetes dataset
print("\nDiabetes Dataset:")
print(diabetes_data.head())


Dataset ID: Dataset
No. of Features: 8
Number of Instances: 768
Missing Values: 0
Outliers: 4221
Feature Correlations:
Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64
Data Type: [dtype('int64') dtype('float64')]

Accuracy Scores:
Decision Tree : 79.87%
Random Forest : 74.03%
Logistic Regression : 74.68%
Support Vector Machine : 76.62%
Naive Bayes : 76.62%
K-Nearest Neighbors : 64.94%

Best Algorithm: Decision Tree

Diabetes Dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI   
0            6      148             72             35        0  33.6  \
1            1       85             66             29        0  26.6   
2            8      183         

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import warnings

# Load the dataset from a CSV file
diabetes_data = pd.read_csv('diabetes.csv')

# Display basic information about the dataset
dataset_id = 'Dataset'
num_features = len(diabetes_data.columns) - 1  # Exclude the target column
num_instances = len(diabetes_data)
missing_values = diabetes_data.isnull().sum().sum()
outliers = (np.abs(diabetes_data.drop('Outcome', axis=1) - diabetes_data.drop('Outcome', axis=1).mean()) > 3).sum().sum()
feature_correlations = diabetes_data.corr()['Outcome'].abs().sort_values(ascending=False)
data_type = diabetes_data.dtypes.unique()

print("Dataset ID:", dataset_id)
print("No. of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Feature Correlations:")
print(feature_correlations)
print("Data Type:", data_type)

# Drop instances with missing values
diabetes_data.dropna(inplace=True)

# Split the dataset into features and target
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data['Outcome']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Suppress the convergence warning for logistic regression
warnings.filterwarnings("ignore", category=UserWarning)

# Train and evaluate the decision tree classifier
decision_tree = DecisionTreeClassifier(max_depth=5, min_samples_split=2)
decision_tree.fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt) * 100

# Train and evaluate the random forest classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf) * 100

# Train and evaluate the logistic regression classifier
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr) * 100

# Train and evaluate the support vector machine classifier
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm) * 100

# Train and evaluate the naive bayes classifier
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
y_pred_nb = naive_bayes.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb) * 100

# Train and evaluate the k-nearest neighbors classifier
knn = KNeighborsClassifier(n_neighbors=3)  # Set n_neighbors to a lower value
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn) * 100

# Compare the accuracies of different algorithms
accuracies = {
    'Decision Tree': accuracy_dt,
    'Random Forest': accuracy_rf,
    'Logistic Regression': accuracy_lr,
    'Support Vector Machine': accuracy_svm,
    'Naive Bayes': accuracy_nb,
    'K-Nearest Neighbors': accuracy_knn
}

best_algorithm = max(accuracies, key=accuracies.get)

# Display the results
print("\nAccuracy Scores:")
for algorithm, accuracy in accuracies.items():
    print(algorithm, ":", "{:.2f}%".format(accuracy))

print("\nBest Algorithm:", best_algorithm)




Dataset ID: Dataset
No. of Features: 8
Number of Instances: 768
Missing Values: 0
Outliers: 4221
Feature Correlations:
Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64
Data Type: [dtype('int64') dtype('float64')]

Accuracy Scores:
Decision Tree : 79.87%
Random Forest : 74.68%
Logistic Regression : 74.68%
Support Vector Machine : 76.62%
Naive Bayes : 76.62%
K-Nearest Neighbors : 64.94%

Best Algorithm: Decision Tree
