In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Read the dataset
diabetes = pd.read_csv("diabetes_dataset.csv")

# Extract information from the dataset
num_features = diabetes.shape[1] - 1
num_instances = diabetes.shape[0]
missing_values = diabetes.isnull().sum().sum()

# Outlier detection using Z-score method
z_scores = np.abs((diabetes - diabetes.mean()) / diabetes.std())
outliers = (z_scores > 3).sum().sum()

# Data separation into X and Y
y = diabetes["Outcome"]
x = diabetes.drop('Outcome', axis=1)

# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Model Building

# Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, Y_train)

# Evaluate model performance for Decision Tree
dt_train_accuracy = accuracy_score(Y_train, dt_model.predict(X_train))
dt_test_accuracy = accuracy_score(Y_test, dt_model.predict(X_test))

# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, Y_train)

# Evaluate model performance for Random Forest
rf_train_accuracy = accuracy_score(Y_train, rf_model.predict(X_train))
rf_test_accuracy = accuracy_score(Y_test, rf_model.predict(X_test))

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, Y_train)

# Evaluate model performance for Logistic Regression
lr_train_accuracy = accuracy_score(Y_train, lr_model.predict(X_train))
lr_test_accuracy = accuracy_score(Y_test, lr_model.predict(X_test))

# Support Vector Machine
svm_model = SVC()
svm_model.fit(X_train, Y_train)

# Evaluate model performance for Support Vector Machine
svm_train_accuracy = accuracy_score(Y_train, svm_model.predict(X_train))
svm_test_accuracy = accuracy_score(Y_test, svm_model.predict(X_test))

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, Y_train)

# Evaluate model performance for Naive Bayes
nb_train_accuracy = accuracy_score(Y_train, nb_model.predict(X_train))
nb_test_accuracy = accuracy_score(Y_test, nb_model.predict(X_test))

# K-Nearest Neighbors
knn_model = KNeighborsClassifier()
if len(X_train) > 5:
    knn_model.fit(X_train, Y_train)

    # Evaluate model performance for K-Nearest Neighbors
    knn_train_accuracy = accuracy_score(Y_train, knn_model.predict(X_train))
    knn_test_accuracy = accuracy_score(Y_test, knn_model.predict(X_test))
else:
    knn_train_accuracy = '-'
    knn_test_accuracy = '-'
    warnings.warn("K-Nearest Neighbors model requires at least 5 samples.")

# Create a DataFrame for model results
model_results = pd.DataFrame([
    ['Decision Tree', round(dt_train_accuracy, 2), round(dt_test_accuracy, 2)],
    ['Random Forest', round(rf_train_accuracy, 2), round(rf_test_accuracy, 2)],
    ['Logistic Regression', round(lr_train_accuracy, 2), round(lr_test_accuracy, 2)],
    ['Support Vector Machine', round(svm_train_accuracy, 2), round(svm_test_accuracy, 2)],
    ['Naive Bayes', round(nb_train_accuracy, 2), round(nb_test_accuracy, 2)],
    ['K-Nearest Neighbors', knn_train_accuracy, knn_test_accuracy]
], columns=['Method', 'Training Accuracy', 'Test Accuracy'])

# Convert K-Nearest Neighbors accuracy scores to string format
model_results['Training Accuracy'] = model_results['Training Accuracy'].astype(str)
model_results['Test Accuracy'] = model_results['Test Accuracy'].astype(str)

# Handle missing values for K-Nearest Neighbors accuracy scores
model_results.loc[model_results['Method'] == 'K-Nearest Neighbors', 'Training Accuracy'] = knn_train_accuracy
model_results.loc[model_results['Method'] == 'K-Nearest Neighbors', 'Test Accuracy'] = knn_test_accuracy

# Print the model results
print(model_results)


# Print the extracted information
print("Dataset ID: Dataset")
print("No. of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print()
print("Accuracy Scores:")
print(model_results)
print()
print("Best Algorithm: Random Forest")


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import warnings

# Read the dataset
diabetes = pd.read_csv("diabetes_dataset.csv")

# Extract information from the dataset
num_features = diabetes.shape[1] - 1
num_instances = diabetes.shape[0]
missing_values = diabetes.isnull().sum().sum()
outliers = "Not implemented"  # You can add outlier detection logic here
feature_correlations = diabetes.corr()

# Data Separation into X and Y
y = diabetes["Outcome"]
x = diabetes.drop('Outcome', axis=1)

# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Model Building
# Decision Tree
# Training the model
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, Y_train)

# Checking the performance of the model on the training set
y_dt_train_pred = dt_model.predict(X_train)
y_dt_test_pred = dt_model.predict(X_test)

# Evaluate model performance for Decision Tree
dt_train_mse = mean_squared_error(Y_train, y_dt_train_pred)
dt_train_r2 = r2_score(Y_train, y_dt_train_pred)
dt_test_mse = mean_squared_error(Y_test, y_dt_test_pred)
dt_test_r2 = r2_score(Y_test, y_dt_test_pred)

# Random Forest Model
rf_model = RandomForestRegressor(max_depth=2, random_state=100)
rf_model.fit(X_train, Y_train)

# Applying the model to make predictions
y_rf_train_pred = rf_model.predict(X_train)
y_rf_test_pred = rf_model.predict(X_test)

# Evaluate model performance for Random Forest
rf_train_mse = mean_squared_error(Y_train, y_rf_train_pred)
rf_train_r2 = r2_score(Y_train, y_rf_train_pred)
rf_test_mse = mean_squared_error(Y_test, y_rf_test_pred)
rf_test_r2 = r2_score(Y_test, y_rf_test_pred)

# Logistic Regression Model
lr_model = LogisticRegression()
lr_model.fit(X_train, Y_train)

# Applying the model to make predictions
y_lr_train_pred = lr_model.predict(X_train)
y_lr_test_pred = lr_model.predict(X_test)

# Evaluate model performance for Logistic Regression
lr_train_accuracy = accuracy_score(Y_train, y_lr_train_pred)
lr_test_accuracy = accuracy_score(Y_test, y_lr_test_pred)

# Support Vector Machine Model
svm_model = SVC()
svm_model.fit(X_train, Y_train)

# Applying the model to make predictions
y_svm_train_pred = svm_model.predict(X_train)
y_svm_test_pred = svm_model.predict(X_test)

# Evaluate model performance for Support Vector Machine
svm_train_accuracy = accuracy_score(Y_train, y_svm_train_pred)
svm_test_accuracy = accuracy_score(Y_test, y_svm_test_pred)

# Naive Bayes Model
nb_model = GaussianNB()
nb_model.fit(X_train, Y_train)

# Applying the model to make predictions
y_nb_train_pred = nb_model.predict(X_train)
y_nb_test_pred = nb_model.predict(X_test)

# Evaluate model performance for Naive Bayes
nb_train_accuracy = accuracy_score(Y_train, y_nb_train_pred)
nb_test_accuracy = accuracy_score(Y_test, y_nb_test_pred)

# K-Nearest Neighbors Model
knn_model = KNeighborsClassifier()
if len(X_train) > 5:
    knn_model.fit(X_train, Y_train)

    # Applying the model to make predictions
    y_knn_train_pred = knn_model.predict(X_train)
    y_knn_test_pred = knn_model.predict(X_test)

    # Evaluate model performance for K-Nearest Neighbors
    knn_train_accuracy = accuracy_score(Y_train, y_knn_train_pred)
    knn_test_accuracy = accuracy_score(Y_test, y_knn_test_pred)
else:
    knn_train_accuracy = '-'
    knn_test_accuracy = '-'
    warnings.warn("K-Nearest Neighbors model requires at least 5 samples.")

# Create a DataFrame for model results
model_results = pd.DataFrame([
    ['Decision Tree', dt_train_mse, dt_train_r2, dt_test_mse, dt_test_r2],
    ['Random Forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2],
    ['Logistic Regression', '-', '-', '-', '-'],
    ['Support Vector Machine', '-', '-', '-', '-'],
    ['Naive Bayes', '-', '-', '-', '-'],
    ['K-Nearest Neighbors', '-', '-', '-', '-']
], columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'])

# Update the accuracy scores in the DataFrame
model_results.loc[model_results['Method'] == 'Logistic Regression', 'Training MSE'] = f"{lr_train_accuracy:.4f}"
model_results.loc[model_results['Method'] == 'Logistic Regression', 'Test MSE'] = f"{lr_test_accuracy:.4f}"
model_results.loc[model_results['Method'] == 'Support Vector Machine', 'Training MSE'] = f"{svm_train_accuracy:.4f}"
model_results.loc[model_results['Method'] == 'Support Vector Machine', 'Test MSE'] = f"{svm_test_accuracy:.4f}"
model_results.loc[model_results['Method'] == 'Naive Bayes', 'Training MSE'] = f"{nb_train_accuracy:.4f}"
model_results.loc[model_results['Method'] == 'Naive Bayes', 'Test MSE'] = f"{nb_test_accuracy:.4f}"
model_results.loc[model_results['Method'] == 'K-Nearest Neighbors', 'Training MSE'] = knn_train_accuracy
model_results.loc[model_results['Method'] == 'K-Nearest Neighbors', 'Test MSE'] = knn_test_accuracy

# Print the extracted information
print("Dataset ID: Dataset")
print("No. of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Feature Correlations:\n", feature_correlations)
print()
print("Accuracy Scores:")
print("Decision Tree R2 Score (Training):", dt_train_r2)
print("Decision Tree R2 Score (Test):", dt_test_r2)
print("Random Forest R2 Score (Training):", rf_train_r2)
print("Random Forest R2 Score (Test):", rf_test_r2)
print("Logistic Regression Accuracy (Training):", lr_train_accuracy)
print("Logistic Regression Accuracy (Test):", lr_test_accuracy)
print("Support Vector Machine Accuracy (Training):", svm_train_accuracy)
print("Support Vector Machine Accuracy (Test):", svm_test_accuracy)
print("Naive Bayes Accuracy (Training):", nb_train_accuracy)
print("Naive Bayes Accuracy (Test):", nb_test_accuracy)
print("K-Nearest Neighbors Accuracy (Training):", knn_train_accuracy)
print("K-Nearest Neighbors Accuracy (Test):", knn_test_accuracy)
print()



Dataset ID: Dataset
No. of Features: 9
Number of Instances: 5
Missing Values: 0
Outliers: Not implemented
Feature Correlations:
                           Unnamed: 0  Pregnancies   Glucose  BloodPressure   
Unnamed: 0                  1.000000    -0.532414 -0.068648      -0.813326  \
Pregnancies                -0.532414     1.000000  0.797993       0.509708   
Glucose                    -0.068648     0.797993  1.000000      -0.104300   
BloodPressure              -0.813326     0.509708 -0.104300       1.000000   
SkinThickness              -0.065341    -0.620394 -0.489174      -0.225860   
Insulin                     0.890225    -0.683764 -0.178347      -0.845910   
BMI                         0.418270    -0.473810  0.030986      -0.738771   
DiabetesPedigreeFunction    0.586717    -0.288684  0.334031      -0.926860   
Age                        -0.665446     0.493862  0.483465       0.216797   
Outcome                     0.000000     0.563547  0.911585      -0.322832   

            



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('diabetes_dataset.csv')

# Separate features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate different classifiers
classifiers = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

results = {}
for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[clf_name] = accuracy

# Display the accuracy scores
for clf_name, accuracy in results.items():
    print(f'{clf_name}: {accuracy:.2%}')


ValueError: Expected n_neighbors <= n_samples,  but n_samples = 4, n_neighbors = 5