In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


In [5]:
# Load the preprocessed dataset.
# The CSV location can be overridden through the DIABETES_DATA_PATH environment
# variable so the notebook can run on machines other than the author's; the
# original absolute path remains the default, so existing runs are unaffected.
DATA_PATH = Path(
    os.environ.get(
        'DIABETES_DATA_PATH',
        '/Users/sathvikchava/my files/big data/Project files/Diabetes_Health_Indicators_cleanedData1.0.csv',
    )
)
data = pd.read_csv(DATA_PATH)

# Split the data into the feature matrix (X) and the target column (y).
# The target name is written once so the two statements below cannot drift apart.
TARGET_COLUMN = 'Diabetes_binary'
X = data.drop(columns=TARGET_COLUMN)
y = data[TARGET_COLUMN]

In [7]:
# 'BMI' is the only column listed as numerical; every other column of X is
# put in the categorical list.
numerical_Features = ['BMI']
categorical_Features = X.columns.drop(numerical_Features)

In [14]:
# Hold out 20% of the rows as the test set. The split is stratified on y and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Column-wise preprocessing shared by every model pipeline in this notebook:
#   'num' -> StandardScaler on the numerical columns
#   'cat' -> OneHotEncoder(drop='first') on the categorical columns
scaled_numeric = ('num', StandardScaler(), numerical_Features)
encoded_categorical = ('cat', OneHotEncoder(drop='first'), categorical_Features)
preprocessor = ColumnTransformer(transformers=[scaled_numeric, encoded_categorical])

In [18]:
# Train one classifier on the training split and report hold-out metrics
def evaluate(model,model_name):
    """Fit `model` behind the shared preprocessor and print test-set metrics.

    The pipeline is assembled from the module-level `preprocessor` and the
    `model` instance passed in, fitted on `X_train` / `y_train`, and scored on
    `X_test` / `y_test` (all module-level names). Pipeline.fit fits the step
    objects themselves rather than copies, so both `preprocessor` and `model`
    are left fitted on the training split after this call.

    Parameters
    ----------
    model : estimator
        Classifier used as the final 'classifier' step of the pipeline.
    model_name : str
        Label printed in the report header.

    Returns
    -------
    None
        The accuracy and the classification report are printed, not returned.
    """
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', model),
    ]
    # Pipeline.fit returns the pipeline itself, so build and fit in one go
    fitted_pipeline = Pipeline(steps).fit(X_train, y_train)
    predictions = fitted_pipeline.predict(X_test)

    # Report: header, overall accuracy, per-class report, footer
    print(f"---------{model_name}----------")
    accuracy = accuracy_score(y_test, predictions)
    print(f" Accuracy:{accuracy:.4f}")
    report = classification_report(y_test, predictions)
    print(report)
    print("------------------")
    

In [22]:
# Logistic Regression: fit on the training split and print its hold-out report
logistic_reg_model = LogisticRegression(random_state=42)
evaluate(model=logistic_reg_model, model_name="Logistic Regression")

---------Logistic Regression----------
 Accuracy:0.8636
              precision    recall  f1-score   support

           0       0.88      0.98      0.93     43667
           1       0.54      0.16      0.24      7069

    accuracy                           0.86     50736
   macro avg       0.71      0.57      0.58     50736
weighted avg       0.83      0.86      0.83     50736

------------------


In [24]:
# Random Forest: fit on the training split and print its hold-out report
random_forest_model = RandomForestClassifier(random_state=42)
evaluate(model=random_forest_model, model_name="Random Forest")

---------Random Forest----------
 Accuracy:0.8583
              precision    recall  f1-score   support

           0       0.88      0.97      0.92     43667
           1       0.47      0.16      0.24      7069

    accuracy                           0.86     50736
   macro avg       0.68      0.56      0.58     50736
weighted avg       0.82      0.86      0.83     50736

------------------


In [26]:
from sklearn.ensemble import AdaBoostClassifier

In [30]:
# AdaBoost: fit on the training split and print its hold-out report
adaboost_model = AdaBoostClassifier(random_state=42)
evaluate(model=adaboost_model, model_name="AdaBoost")



---------AdaBoost----------
 Accuracy:0.8621
              precision    recall  f1-score   support

           0       0.88      0.97      0.92     43667
           1       0.51      0.20      0.28      7069

    accuracy                           0.86     50736
   macro avg       0.70      0.58      0.60     50736
weighted avg       0.83      0.86      0.83     50736

------------------


In [42]:
# Cross-validated counterpart of `evaluate`.
# StratifiedKFold and cross_val_score are imported in the notebook's import cell.
def evaluate_cv(model, model_name, X, y, n_splits=5, scoring='accuracy', random_state=42):
    """Cross-validate `model` behind the shared preprocessor and print a summary.

    A pipeline of the module-level `preprocessor` followed by `model` is scored
    with shuffled, stratified k-fold cross-validation. Because preprocessing is
    part of the pipeline, it is fitted inside each fold rather than once on
    all rows. cross_val_score works on clones of the pipeline, so this call
    does not fit `model` or `preprocessor` themselves.

    Note: the notebook calls this with the full `X` and `y`, so the folds also
    contain the rows that `evaluate` holds out as its test split; the two sets
    of numbers are therefore not measured on the same data.

    Parameters
    ----------
    model : estimator
        Classifier used as the final 'classifier' step of the pipeline.
    model_name : str
        Label printed in the summary header.
    X, y
        Features and target handed to cross_val_score.
    n_splits : int, default 5
        Number of stratified folds.
    scoring : str or callable, default 'accuracy'
        Metric forwarded to cross_val_score. With the default, the printed
        label is "Mean Accuracy" exactly as before; any other value is shown
        by its own name so the label never mislabels the metric.
    random_state : int, default 42
        Seed for the fold shuffling.

    Returns
    -------
    None
        The mean and standard deviation of the fold scores are printed.
    """
    pipeline = Pipeline(
        [
            ('preprocessor', preprocessor),
            ('classifier', model),
        ]
    )

    # K-fold cross-validation; shuffled and stratified on y
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_score = cross_val_score(pipeline, X, y, cv=kf, scoring=scoring)

    # Keep the default output byte-identical; otherwise name the metric actually used
    metric_label = "Accuracy" if scoring == 'accuracy' else str(scoring)

    # Output results
    print(f"---------- {model_name} ------------")
    print(f"Mean {metric_label}: {cv_score.mean():.4f}")
    print(f"Standard Deviation: {cv_score.std():.4f}")
    print("---------------------------------------")
    

In [44]:
# Logistic Regression model with cross-validation
evaluate_cv(model=logistic_reg_model, model_name="Logistic Regression", X=X, y=y)

---------- Logistic Regression ------------
Mean Accuracy: 0.8651
Standard Deviation: 0.0006
---------------------------------------


In [46]:
# Random Forest model with cross-validation
evaluate_cv(model=random_forest_model, model_name="Random Forest", X=X, y=y)

---------- Random Forest ------------
Mean Accuracy: 0.8594
Standard Deviation: 0.0007
---------------------------------------


In [50]:
# AdaBoost model with cross-validation
evaluate_cv(model=adaboost_model, model_name="AdaBoost", X=X, y=y)



---------- AdaBoost ------------
Mean Accuracy: 0.8644
Standard Deviation: 0.0002
---------------------------------------
