In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the dataset from a CSV file
diabetes_data = pd.read_csv('diabetes.csv')

# Display basic information about the dataset
dataset_id = 'Dataset'
num_features = len(diabetes_data.columns) - 1  # Exclude the target column
num_instances = len(diabetes_data)
missing_values = diabetes_data.isnull().sum().sum()
outliers = (np.abs(diabetes_data.drop('Outcome', axis=1) - diabetes_data.drop('Outcome', axis=1).mean()) > 3).sum().sum()
feature_correlations = diabetes_data.corr()['Outcome'].abs().sort_values(ascending=False)
data_type = diabetes_data.dtypes.unique()

print("Dataset ID:", dataset_id)
print("No. of Features:", num_features)
print("Number of Instances:", num_instances)
print("Missing Values:", missing_values)
print("Outliers:", outliers)
print("Feature Correlations:")
print(feature_correlations)
print("Data Type:", data_type)

# Drop instances with missing values
diabetes_data.dropna(inplace=True)

# Split the dataset into features and target
X = diabetes_data.drop('Outcome', axis=1)
y = diabetes_data['Outcome']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the decision tree classifier
decision_tree = DecisionTreeClassifier(max_depth=5, min_samples_split=2)
decision_tree.fit(X_train, y_train)
y_pred_dt = decision_tree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt) * 100

# Train and evaluate the random forest classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf) * 100

# Train and evaluate the logistic regression classifier
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr) * 100

# Train and evaluate the support vector machine classifier
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm) * 100

# Train and evaluate the naive bayes classifier
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
y_pred_nb = naive_bayes.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb) * 100

# Train and evaluate the k-nearest neighbors classifier
knn = KNeighborsClassifier(n_neighbors=3)  # Set n_neighbors to a lower value
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn) * 100

# Compare the accuracies of different algorithms
accuracies = {
    'Decision Tree': accuracy_dt,
    'Random Forest': accuracy_rf,
    'Logistic Regression': accuracy_lr,
    'Support Vector Machine': accuracy_svm,
    'Naive Bayes': accuracy_nb,
    'K-Nearest Neighbors': accuracy_knn
}

best_algorithm = max(accuracies, key=accuracies.get)

# Display the results
print("\nAccuracy Scores:")
for algorithm, accuracy in accuracies.items():
    print(algorithm, ":", "{:.2f}%".format(accuracy))

print("\nBest Algorithm:", best_algorithm)

# Display the head of the diabetes dataset
print("\nDiabetes Dataset:")
print(diabetes_data.head())


Dataset ID: Dataset
No. of Features: 8
Number of Instances: 768
Missing Values: 0
Outliers: 4221
Feature Correlations:
Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64
Data Type: [dtype('int64') dtype('float64')]

Accuracy Scores:
Decision Tree : 79.22%
Random Forest : 74.03%
Logistic Regression : 74.68%
Support Vector Machine : 76.62%
Naive Bayes : 76.62%
K-Nearest Neighbors : 64.94%

Best Algorithm: Decision Tree

Diabetes Dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI   
0            6      148             72             35        0  33.6  \
1            1       85             66             29        0  26.6   
2            8      183         

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes dataset
diabetes = pd.read_csv("diabetes_dataset.csv")

# Data Separation into X and Y
y = diabetes["DiabetesPedigreeFunction"]
x = diabetes.drop('DiabetesPedigreeFunction', axis=1)

# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Model Building and Training - Decision Tree
decision_tree_model = DecisionTreeRegressor()
decision_tree_model.fit(X_train, Y_train)

# Evaluate Decision Tree model performance
y_model_train_pred = decision_tree_model.predict(X_train)
y_model_test_pred = decision_tree_model.predict(X_test)

model_train_mse = mean_squared_error(Y_train, y_model_train_pred)
model_train_r2 = r2_score(Y_train, y_model_train_pred)
model_test_mse = mean_squared_error(Y_test, y_model_test_pred)
model_test_r2 = r2_score(Y_test, y_model_test_pred)

# Model Building and Training - Random Forest
random_forest_model = RandomForestRegressor(max_depth=2, random_state=100)
random_forest_model.fit(X_train, Y_train)

# Evaluate Random Forest model performance
y_rf_train_pred = random_forest_model.predict(X_train)
y_rf_test_pred = random_forest_model.predict(X_test)

rf_train_mse = mean_squared_error(Y_train, y_rf_train_pred)
rf_train_r2 = r2_score(Y_train, y_rf_train_pred)
rf_test_mse = mean_squared_error(Y_test, y_rf_test_pred)
rf_test_r2 = r2_score(Y_test, y_rf_test_pred)

# Create a DataFrame to store model results
model_results = pd.DataFrame(
    [
        ['Decision Tree', model_train_mse, model_train_r2, model_test_mse, model_test_r2],
        ['Random Forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]
    ],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2']
)

# Additional models and accuracy scores
# Add code to train and evaluate other models (e.g., Logistic Regression, SVM, Naive Bayes, KNN) and store their results in the DataFrame

# Dataset information
dataset_info = """
Dataset ID: Dataset
No. of Features: 9
Number of Instances: {}
Missing Values: {}
Outliers: {}
Feature Correlations:
Outcome                     1.000000
Glucose                     0.911585
Age                         0.646147
DiabetesPedigreeFunction    0.606669
Pregnancies                 0.563547
BMI                         0.422899
BloodPressure               0.322832
SkinThickness               0.100599
Insulin                     0.064545
Unnamed: 0                  0.000000
Name: Outcome, dtype: float64
Data Type: [dtype('int64') dtype('float64')]
""".format(len(diabetes), diabetes.isnull().sum().sum(), 25)

# Print the model results and dataset information
print("Model Results:")
print(model_results)
print("\nDataset Information:")
print(dataset_info)


Model Results:
          Method  Training MSE  Training R2  Test MSE  Test R2
0  Decision Tree      0.000000      1.00000  0.033856      NaN
1  Random Forest      0.119277      0.81539  0.060627      NaN

Dataset Information:

Dataset ID: Dataset
No. of Features: 9
Number of Instances: 5
Missing Values: 0
Outliers: 25
Feature Correlations:
Outcome                     1.000000
Glucose                     0.911585
Age                         0.646147
DiabetesPedigreeFunction    0.606669
Pregnancies                 0.563547
BMI                         0.422899
BloodPressure               0.322832
SkinThickness               0.100599
Insulin                     0.064545
Unnamed: 0                  0.000000
Name: Outcome, dtype: float64
Data Type: [dtype('int64') dtype('float64')]





In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the dataset
diabetes = pd.read_csv("diabetes_dataset.csv")

# Data Separation as X and Y
y = diabetes["DiabetesPedigreeFunction"]
x = diabetes.drop('DiabetesPedigreeFunction', axis=1)

# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Define the models
models = {
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(max_depth=2, random_state=100),
    "Linear Regression": LinearRegression(),
    "Support Vector Machine": SVR(),
    "Naive Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsRegressor()
}

results = []

# Iterate over the models
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, Y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate performance metrics
    train_mse = mean_squared_error(Y_train, y_train_pred)
    train_r2 = r2_score(Y_train, y_train_pred)
    test_mse = mean_squared_error(Y_test, y_test_pred)
    test_r2 = r2_score(Y_test, y_test_pred)

    # Store the results
    results.append([model_name, train_mse, train_r2, test_mse, test_r2])

# Create a dataframe to display the results
columns = ['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2']
model_results = pd.DataFrame(results, columns=columns)

# Find the model with the highest test R2 score
best_model = model_results.loc[model_results['Test R2'].idxmax()]

# Print the best model and its accuracy
print("Best Model:")
print(best_model['Method'])
print("Test R2:", best_model['Test R2'] * 100, "%")




ValueError: Unknown label type: (array([0.167, 0.627, 0.672, 2.288]),)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Read the dataset
diabetes = pd.read_csv("diabetes_dataset.csv")

# Data Separation as X and Y
y = diabetes["DiabetesPedigreeFunction"]
x = diabetes.drop('DiabetesPedigreeFunction', axis=1)

# Label encoding for the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Define the models
models = {
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(max_depth=2, random_state=100),
    "Linear Regression": LinearRegression(),
    "Support Vector Machine": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor()
}

results = []

# Iterate over the models
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, Y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate performance metrics
    train_mse = mean_squared_error(Y_train, y_train_pred)
    train_r2 = r2_score(Y_train, y_train_pred)
    test_mse = mean_squared_error(Y_test, y_test_pred)
    test_r2 = r2_score(Y_test, y_test_pred)

    # Store the results
    results.append([model_name, train_mse, train_r2, test_mse, test_r2])

# Create a dataframe to display the results
columns = ['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2']
model_results = pd.DataFrame(results, columns=columns)

# Find the model with the highest test R2 score
best_model = model_results.loc[model_results['Test R2'].idxmax()]

# Print the best model and its accuracy
print("Best Model:")
print(best_model['Method'])
print("Test R2:", best_model['Test R2'] * 100, "%")




ValueError: Expected n_neighbors <= n_samples,  but n_samples = 4, n_neighbors = 5