Import Libraries

In [18]:
# Install statsmodels
!pip install statsmodels

#Installation of required libraries
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
import warnings
warnings.simplefilter(action = "ignore") 




[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: C:\Users\Connor\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Import Dataset

In [11]:
# Read the dataset from the project-relative data directory.
df = pd.read_csv("data/diabetes.csv")

# A notebook cell only renders its *last* bare expression, so the preview and
# the shape are shown explicitly instead of being silently discarded.
# (`display` is provided by the IPython kernel; no import is needed.)
display(df.head())  # first 5 observation units
print(df.shape)     # (rows, columns); the recorded run had 768 rows and 9 columns

# Feature information: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Data Preprocessing

In [12]:
# Target vector: the "Outcome" column of the loaded frame.
y = df.loc[:, "Outcome"]
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [13]:
# Feature matrix: every column except the target.
X = df.drop(columns=["Outcome"])
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


Base Models

In [39]:
# Hold out 20% of the rows for testing (X and y come from the preprocessing
# cells); the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12345
)

# Baseline classifiers keyed by the label used in printed output and in the
# result tables further down.
# NOTE(review): the last entry is scikit-learn's GradientBoostingClassifier,
# not the XGBoost library, despite the "(XGB)" label.
# NOTE(review): features are passed unscaled here; distance/kernel based
# models (KNN, SVC) are usually sensitive to that - confirm it is intended.
models = {
    "Logistic Regression": LogisticRegression(random_state=12345),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=12345),
    "Random Forest": RandomForestClassifier(random_state=12345),
    "Support Vector Machine": SVC(gamma="auto", random_state=12345),
    "Gradient Boosting (XGB)": GradientBoostingClassifier(random_state=12345),
}

# Fit every classifier on the training split and keep it under its label.
trained_models = {}
for name, model in models.items():
    # scikit-learn's fit() returns the fitted estimator itself.
    trained_models[name] = model.fit(X_train, y_train)
    print(f"{name} trained successfully!")


Logistic Regression trained successfully!
K-Nearest Neighbors trained successfully!
Decision Tree trained successfully!
Random Forest trained successfully!
Support Vector Machine trained successfully!
Gradient Boosting (XGB) trained successfully!


In [40]:
# Take the first held-out row as a one-row DataFrame, together with its label.
single_entry = X_test.head(1)
actual_label = y_test.iloc[0]

# Ask each fitted model for its prediction on that single row.
print("\nSingle Entry Predictions:")
for name, model in trained_models.items():
    predicted_label = model.predict(single_entry)[0]  # predict() returns a length-1 array
    print(f"{name}: Predicted={predicted_label}, Actual={actual_label}")


Single Entry Predictions:
Logistic Regression: Predicted=0, Actual=0
K-Nearest Neighbors: Predicted=0, Actual=0
Decision Tree: Predicted=0, Actual=0
Random Forest: Predicted=0, Actual=0
Support Vector Machine: Predicted=0, Actual=0
Gradient Boosting (XGB): Predicted=0, Actual=0


In [43]:
# Compare every trained model on the first few held-out rows.
num_samples = 10  # Change this to however many samples you want to test
X_samples = X_test.iloc[:num_samples]  # first `num_samples` feature rows, kept as a DataFrame
y_samples = y_test.iloc[:num_samples]  # corresponding actual labels

# One batched predict() call per model. Passing the DataFrame slice keeps the
# feature names the models were fitted with, and avoids a call per row.
# An empty slice is skipped so the cell still yields an empty table.
predictions = {
    name: model.predict(X_samples) if len(X_samples) else []
    for name, model in trained_models.items()
}

# Build the table in one go. `DataFrame.append` was removed in pandas 2.0 and
# growing a frame row by row is quadratic anyway.
# Columns: original row label, true outcome, then one column per model
# (in `trained_models` order).
results_df = pd.DataFrame({
    'Sample Index': y_samples.index.to_numpy(),
    'Actual': y_samples.to_numpy(),
    **predictions,
})

# Print results
print("\nModel Predictions on Multiple Test Entries:")
print(results_df.to_string(index=False))  # Print without DataFrame index

# Persist the comparison table next to the notebook.
results_df.to_csv("model_predictions.csv", index=False)


Model Predictions on Multiple Test Entries:
Sample Index Actual Logistic Regression K-Nearest Neighbors Decision Tree Random Forest Support Vector Machine Gradient Boosting (XGB)
          68      0                   0                   0             0             0                      0                       0
         308      1                   0                   0             0             0                      0                       1
         375      1                   1                   0             1             1                      0                       1
         498      1                   1                   1             1             1                      0                       1
         188      1                   0                   0             1             0                      0                       1
         150      0                   0                   0             0             0                      0                       0
         3

In [None]:
# NOTE(review): these imports would normally live in the import cell at the
# top of the notebook; they are kept here so this cell runs on its own.
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler

# Split dataset (assumes X and y are defined). `stratify=y` keeps the class
# proportions of `y` in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=12345)

# Scale features for better performance. The scaler is fitted on the training
# rows only and then applied to the test rows.
# NOTE: this rebinds X_train / X_test to the scaler's output (NumPy arrays by
# default), so earlier cells that call `X_test.iloc[...]` must not be re-run
# after this cell without re-running the split first.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define individual models as (name, estimator) pairs for the ensemble.
# NOTE: this rebinds `models` from the dict used by the base-model cells to a list.
# NOTE(review): 'XGB' is scikit-learn's GradientBoostingClassifier, not XGBoost.
models = [
    ('LR', LogisticRegression(random_state=12345)),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=12345)),
    ('RF', RandomForestClassifier(n_estimators=200, random_state=12345)),
    ('SVM', SVC(gamma='auto', random_state=12345)),
    ('XGB', GradientBoostingClassifier(n_estimators=200, random_state=12345))
]

# Create a Voting Classifier with majority voting
voting_clf = VotingClassifier(estimators=models, voting='hard')  # 'hard' for majority vote
voting_clf.fit(X_train, y_train)

# Get predictions for multiple test samples
num_samples = 10  # Change this to however many test samples you want
X_samples = X_test[:num_samples]       # array: plain positional slice
y_samples = y_test.iloc[:num_samples]  # Series: explicit positional slice, independent of its labels

# One batched predict() call for the whole slice; an empty slice is skipped so
# the cell still yields an empty table.
final_predictions = voting_clf.predict(X_samples) if len(X_samples) else []

# Store results in a DataFrame built in one go. `DataFrame.append` was removed
# in pandas 2.0, so the table is no longer grown row by row.
results_df = pd.DataFrame({
    'Sample Index': y_samples.index.to_numpy(),
    'Actual': y_samples.to_numpy(),
    'Final Prediction': final_predictions,
})

# Print results
print("\nMajority Vote Predictions:")
print(results_df.to_string(index=False))  # Print nicely formatted table
