In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the dataset from a CSV file
diabetes_data = pd.read_csv('diabetes_dataset.csv')

# Display basic information about the dataset
dataset_id = 'Dataset'
num_features = len(diabetes_data.columns) - 1  # Exclude the target column
num_instances = len(diabetes_data)
missing_values = diabetes_data.isnull().sum().sum()
outliers = (np.abs(diabetes_data.drop('Outcome', axis=1) - diabetes_data.drop('Outcome', axis=1).mean()) > 3).sum().sum()
feature_correlations = diabetes_data.corr()['Outcome'].abs().sort_values(ascending=False)
data_type = diabetes_data.dtypes.unique()

print("Dataset ID:", dataset_id)
print("No. of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Feature Correlations:")
print(feature_correlations)
print("Data Type:", data_type)

# Drop instances with missing values
diabetes_data.dropna(inplace=True)

# Split the dataset into features and target
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data['Outcome']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the decision tree classifier
decision_tree = DecisionTreeClassifier(max_depth=5, min_samples_split=2)
decision_tree.fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt) * 100

# Train and evaluate the random forest classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf) * 100

# Train and evaluate the logistic regression classifier
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr) * 100

# Train and evaluate the support vector machine classifier
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm) * 100

# Train and evaluate the naive bayes classifier
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
y_pred_nb = naive_bayes.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb) * 100

# Train and evaluate the k-nearest neighbors classifier
knn = KNeighborsClassifier(n_neighbors=3)  # Set n_neighbors to a lower value
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn) * 100

# Compare the accuracies of different algorithms
accuracies = {
    'Decision Tree': accuracy_dt,
    'Random Forest': accuracy_rf,
    'Logistic Regression': accuracy_lr,
    'Support Vector Machine': accuracy_svm,
    'Naive Bayes': accuracy_nb,
    'K-Nearest Neighbors': accuracy_knn
}

best_algorithm = max(accuracies, key=accuracies.get)

# Display the results
print("\nAccuracy Scores:")
for algorithm, accuracy in accuracies.items():
    print(algorithm, ":", "{:.2f}%".format(accuracy))

print("\nBest Algorithm:", best_algorithm)

# Display the head of the diabetes dataset
print("\nDiabetes Dataset:")
print(diabetes_data.head())


Dataset ID: Dataset
No. of Features: 9
Number of Instances: 5
Missing Values: 0
Outliers: 25
Feature Correlations:
Outcome                     1.000000
Glucose                     0.911585
Age                         0.646147
DiabetesPedigreeFunction    0.606669
Pregnancies                 0.563547
BMI                         0.422899
BloodPressure               0.322832
SkinThickness               0.100599
Insulin                     0.064545
Unnamed: 0                  0.000000
Name: Outcome, dtype: float64
Data Type: [dtype('int64') dtype('float64')]

Accuracy Scores:
Decision Tree : 100.00%
Random Forest : 100.00%
Logistic Regression : 100.00%
Support Vector Machine : 0.00%
Naive Bayes : 0.00%
K-Nearest Neighbors : 0.00%

Best Algorithm: Decision Tree

Diabetes Dataset:
   Unnamed: 0  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   
0           0            6      148             72             35        0  \
1           1            1       85             66         

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Load the dataset from a CSV file
diabetes_data = pd.read_csv('diabetes_dataset.csv')

# Display basic information about the dataset
dataset_id = 'Dataset'
num_features = len(diabetes_data.columns) - 1  # Exclude the target column
num_instances = len(diabetes_data)
missing_values = diabetes_data.isnull().sum().sum()
outliers = 'N/A'  # You can add outlier detection techniques here
feature_correlations = 'N/A'  # You can calculate feature correlation here
data_type = diabetes_data.dtypes.unique()

print("Dataset ID:", dataset_id)
print("No. of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Feature Correlations:", feature_correlations)
print("Data Type:", data_type)

# Drop instances with missing values
diabetes_data.dropna(inplace=True)

# Split the dataset into features and target
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data['Outcome']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the decision tree classifier
decision_tree = DecisionTreeClassifier(max_depth=5, min_samples_split=2)
decision_tree.fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt) * 100

# Train and evaluate the random forest classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf) * 100

# Train and evaluate the logistic regression classifier
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr) * 100

# Train and evaluate the support vector machine classifier
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm) * 100

# Train and evaluate the naive bayes classifier
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
y_pred_nb = naive_bayes.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb) * 100

# Train and evaluate the k-nearest neighbors classifier
knn = KNeighborsClassifier(n_neighbors=3)  # Set n_neighbors to a lower value
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn) * 100

# Compare the accuracies of different algorithms
accuracies = {
    'Decision Tree': accuracy_dt,
    'Random Forest': accuracy_rf,
    'Logistic Regression': accuracy_lr,
    'Support Vector Machine': accuracy_svm,
    'Naive Bayes': accuracy_nb,
    'K-Nearest Neighbors': accuracy_knn
}

best_algorithm = max(accuracies, key=accuracies.get)

# Display the results
print("\nAccuracy Scores:")
for algorithm, accuracy in accuracies.items():
    print(algorithm, ":", "{:.2f}%".format(accuracy))

print("\nBest Algorithm:", best_algorithm)

# Display the head of the diabetes dataset
print("\nDiabetes Dataset:")
print(diabetes_data.head())


Dataset ID: Dataset
No. of Features: 9
Number of Instances: 5
Missing Values: 0
Outliers: N/A
Feature Correlations: N/A
Data Type: [dtype('int64') dtype('float64')]

Accuracy Scores:
Decision Tree : 0.00%
Random Forest : 0.00%
Logistic Regression : 100.00%
Support Vector Machine : 0.00%
Naive Bayes : 0.00%
K-Nearest Neighbors : 0.00%

Best Algorithm: Logistic Regression

Diabetes Dataset:
   Unnamed: 0  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   
0           0            6      148             72             35        0  \
1           1            1       85             66             29        0   
2           2            8      183             64              0        0   
3           3            1       89             66             23       94   
4           4            0      137             40             35      168   

    BMI  DiabetesPedigreeFunction  Age  Outcome  
0  33.6                     0.627   50        1  
1  26.6                     0.351   31 