In [18]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler # Handling categorical data
from sklearn.impute import SimpleImputer # handling missing values
from sklearn.pipeline import Pipeline # Pipelines
from sklearn.compose import ColumnTransformer
# Model 
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [19]:
df = pd.read_csv("data/cleaned_data.csv")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,no,yes,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,no,yes,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,no,no,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,yes,no,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,no,no,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [20]:
## Independent and Dependent column

X  = df.drop(labels=['stroke'], axis=1)
Y  = df['stroke']

In [21]:
# Define which column should be ordinal encoded and which should be one hot encoded
numerical_columns = X.select_dtypes(exclude="object").columns
categorical_columns = X.select_dtypes(include="object").columns

In [22]:
# Numerical pipeline

num_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='median')),
          ('scalar',StandardScaler())
      ]

)


# Categorical pipeline

cat_pipeline = Pipeline(
      steps=[
          ('imputer',SimpleImputer(strategy='most_frequent')),
          ("OneHotEncoder",OneHotEncoder(sparse=False,drop='first')),
          ('scalar',StandardScaler())
      ]

)

proccessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns)
])

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=30)

In [24]:
X_train = pd.DataFrame(proccessor.fit_transform(X_train), columns=proccessor.get_feature_names_out())
X_test = pd.DataFrame(proccessor.transform(X_test), columns= proccessor.get_feature_names_out())



In [25]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)
X_test, y_test = smote.fit_resample(X_test, y_test)

In [26]:
## Evaluating the Model
def evaluate_classification_model(true, predicted, prob_predictions=None):
    # Accuracy
    accuracy = accuracy_score(true, predicted)
    
    # Precision
    precision = precision_score(true, predicted)
    
    # Recall
    recall = recall_score(true, predicted)
    
    # F1-Score
    f1 = f1_score(true, predicted)
    
    # Confusion Matrix
    cm = confusion_matrix(true, predicted)
    
    return accuracy, precision, recall, f1,cm


In [27]:
## Train multiple models
## Model Evaluation

models = {
                   "LogisticRegression": LogisticRegression(),
                   "LogisticRegressionCV": LogisticRegressionCV(cv=3),
                   "DecisionTreeClassifier": DecisionTreeClassifier(),
                   "RandomForestClassifier": RandomForestClassifier(),
                   "ExtratreesClassifier": ExtraTreesClassifier(),
                   "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=5),
                   "AdaBoostClassifier": AdaBoostClassifier(),
                   "GradientBoostingClassifier": GradientBoostingClassifier(),
                   "SVC": SVC(),
                   "XGBClassifier": XGBClassifier(),
                   "LGBMClassifier": LGBMClassifier(),
                   "CatBoostClassifier": CatBoostClassifier(silent=True)
}


train_model_list = []
model_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_resampled_smote,y_resampled_smote)

    # Make Prediction

    y_pred = model.predict(X_test)

    accuracy, precision, recall, f1,cm =  evaluate_classification_model(y_test,y_pred)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training Performance")
    print("Accuracy", accuracy)
    print("Precision", precision)
    print("Recall score", recall)
    print("f1_score", f1)
    print("Confusion Matrix", cm)


    accuracy_list.append(accuracy)

    print("="*35)

    print("\n")


LogisticRegression
Model Training Performance
Accuracy 0.6758669497523001
Precision 0.7093513058129739
Recall score 0.5958952583156405
f1_score 0.6476923076923077
Confusion Matrix [[1068  345]
 [ 571  842]]


LogisticRegressionCV
Model Training Performance
Accuracy 0.7052370842179759
Precision 0.7255054432348367
Recall score 0.6602972399150743
f1_score 0.6913671730270471
Confusion Matrix [[1060  353]
 [ 480  933]]


DecisionTreeClassifier
Model Training Performance
Accuracy 0.7310686482661005
Precision 0.8765859284890427
Recall score 0.537862703467799
f1_score 0.6666666666666666
Confusion Matrix [[1306  107]
 [ 653  760]]


RandomForestClassifier
Model Training Performance
Accuracy 0.7749469214437368
Precision 0.9480968858131488
Recall score 0.5817409766454352
f1_score 0.7210526315789474
Confusion Matrix [[1368   45]
 [ 591  822]]


ExtratreesClassifier
Model Training Performance
Accuracy 0.7250530785562632
Precision 0.9344262295081968
Recall score 0.4840764331210191
f1_score 0.6377622

In [14]:
categorical_columns

Index(['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type',
       'Residence_type', 'smoking_status'],
      dtype='object')

In [16]:
pre_data.shape

(1, 15)