In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 

# Silence every warning for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. convergence or
# failed-fit notices from the estimators used below) — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw stroke dataset from the relative input directory.
DATA_PATH = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows, transposed so each column name reads as a row.
df.head().transpose()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Clean in a single chain: drop rows with missing values, remove the
# identifier column, and exclude rows whose gender is 'Other'.
df = (
    df.dropna()
      .drop(columns='id')
      .loc[lambda frame: frame['gender'] != 'Other']
)

In [None]:
# Integer codes for the two-valued categorical columns.
gnd_val = {"Male": 0, "Female": 1}
yn_val = {"No": 0, "Yes": 1}
area_val = {"Rural": 0, "Urban": 1}

# Apply each mapping to its column and cast the result to int64.
binary_encodings = {
    'gender': gnd_val,
    'ever_married': yn_val,
    'Residence_type': area_val,
}
for column_name, codes in binary_encodings.items():
    df[column_name] = df[column_name].map(codes).astype('int64')

In [None]:
# One-hot encode the multi-valued categoricals: each source column is removed
# and its integer indicator columns are appended at the end of the frame.
categorical_cols = ['work_type', 'smoking_status']
for col in categorical_cols:
    dummies = pd.get_dummies(df[col], prefix=col).astype("int")
    df = pd.concat([df.drop(columns=[col]), dummies], axis=1)

In [None]:
# Move the target column to the end of the frame.
# The insert position is derived from the current column count, so 'stroke'
# always lands last regardless of how many dummy columns were created
# (a fixed position only works for one exact column count).
stke = df['stroke']
df = df.drop(columns=['stroke'])
df.insert(loc=len(df.columns), column='stroke', value=stke)

In [None]:
# Class frequencies of the target column.
df.loc[:, 'stroke'].value_counts()

The target variable (`stroke`) is imbalanced.

In [None]:
## ML
# Separate features from the target by column name rather than by a fixed
# positional slice. This keeps column 0 (`gender`; `id` is already dropped)
# in the feature matrix and does not depend on the number of columns.
array = df.values
is_target = df.columns == 'stroke'
X = array[:, ~is_target]
y = array[:, is_target].ravel()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, recall_score, precision_score

In [None]:
from sklearn.preprocessing import StandardScaler

# Split first, then scale: the scaler statistics must come from the training
# rows only, otherwise validation information leaks into the features the
# models are trained on. `stratify=y` keeps the ratio of the imbalanced
# target identical in the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=45, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # learn mean/std on train only
X_val = scaler.transform(X_val)          # reuse the train statistics

In [None]:
## Managing ImBalances
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN, SMOTE

# Each resampler is fitted on the training split only and is given a fixed
# seed, so the synthetic samples (and every score derived from them) are
# reproducible from run to run.

## SMOTE
smt = SMOTE(random_state = 42)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

## ADASYN
ada = ADASYN(random_state = 130)
X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

## SMOTE + Tomek Links
smtom = SMOTETomek(random_state = 139)
X_train_smtom, y_train_smtom = smtom.fit_resample(X_train, y_train)

## SMOTE + ENN
smenn = SMOTEENN(random_state = 42)
X_train_smenn, y_train_smenn = smenn.fit_resample(X_train, y_train)

In [None]:
## Measuring Model Accuracy
def evaluate_model(clf, X_test, y_test, model_name, oversample_type, average='weighted'):
    """Print a classification report for a fitted classifier and return its scores.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_test : true labels matching ``X_test``.
    model_name : label of the model, printed and returned as-is.
    oversample_type : label of the training-data variant, printed and
        returned as-is.
    average : averaging mode forwarded to ``f1_score``, ``recall_score`` and
        ``precision_score``. Defaults to ``'weighted'``. With that mode the
        scores are weighted by class support, so on an imbalanced target they
        mostly reflect the majority class (weighted recall equals accuracy);
        pass e.g. ``'macro'`` to weight both classes equally.

    Returns
    -------
    list
        ``[model_name, oversample_type, f1, recall, precision]``.
    """
    print('--------------------------------------------')
    print('Model ', model_name)
    print('Data Type ', oversample_type)

    y_pred = clf.predict(X_test)

    # zero_division=0 states explicitly what an undefined score (a class that
    # is never predicted) evaluates to, instead of relying on a warning.
    score_kwargs = {'average': average, 'zero_division': 0}
    f1 = f1_score(y_test, y_pred, **score_kwargs)
    recall = recall_score(y_test, y_pred, **score_kwargs)
    precision = precision_score(y_test, y_pred, **score_kwargs)

    print(classification_report(y_test, y_pred, zero_division=0))
    print("F1 Score ", f1)
    print("Recall ", recall)
    print("Precision ", precision)
    return [model_name, oversample_type, f1, recall, precision]

### Models
# Candidate classifiers, keyed by the label shown in the results table.
# Each distinct estimator appears once: a second default
# DecisionTreeClassifier(random_state=42) entry would only repeat the
# 'DecisionTrees' fits and duplicate its rows in the comparison.
models = {
    'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
    'DecisionTrees': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'LinearSVC': LinearSVC(random_state=0),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'SGD': SGDClassifier(random_state=42),
}

### Data sample
# Training-set variants to compare: the untouched split plus one entry per
# resampling strategy. Each value is a [features, labels] pair.
oversampled_data = dict(
    ACTUAL=[X_train, y_train],
    SMOTE=[X_train_sm, y_train_sm],
    ADASYN=[X_train_ada, y_train_ada],
    SMOTE_TOMEK=[X_train_smtom, y_train_smtom],
    SMOTE_ENN=[X_train_smenn, y_train_smenn],
)

### Final Output
# Fit every model on every training-set variant and score each fit on the
# same validation split; one result row is collected per combination.
final_output = []
for model_k, model_clf in models.items():
    for data_type, (X_fit, y_fit) in oversampled_data.items():
        model_clf.fit(X_fit, y_fit)
        result_row = evaluate_model(model_clf, X_val, y_val, model_k, data_type)
        final_output.append(result_row)

In [None]:
# Tabulate the collected scores and rank the model/data combinations by F1.
result_columns = ['Model', 'DataType', 'F1', 'Recall', 'Precision']
final_df = pd.DataFrame(final_output, columns=result_columns)

final_df.sort_values("F1", ascending=False)

In [None]:
# Hyper-parameter grid for LinearDiscriminantAnalysis, split by solver because
# the options are not interchangeable:
#   * 'svd' does not support shrinkage, so it is only paired with None;
#     `tol` and `store_covariance` are the options that apply to it.
#   * 'lsqr' / 'eigen' accept shrinkage; `tol` and `store_covariance` do not
#     change their fit, so a single value avoids redundant candidates.
# Every sub-grid defines the same four keys, so `best_params_` always
# contains 'solver', 'shrinkage', 'store_covariance' and 'tol'.
param_grid = [
    {
        'solver': ['svd'],
        'shrinkage': [None],
        'store_covariance': [True, False],
        'tol': [0.1, 0.001, 0.0001],
    },
    {
        'solver': ['lsqr', 'eigen'],
        'shrinkage': ['auto', 0.1, 0.001, 0.0001],
        'store_covariance': [False],
        'tol': [0.0001],
    },
]

In [None]:
# 5-fold grid search over `param_grid` for a default LDA estimator.
LDA = LinearDiscriminantAnalysis()
LDA_cv = GridSearchCV(LDA, param_grid, cv=5, verbose=0)

In [None]:
# Run the grid search on the training split as passed here (X_train, y_train),
# not on any of the resampled variants.
LDA_cv.fit(X_train, y_train)

In [None]:
# Best hyper-parameter combination found by the grid search.
params = LDA_cv.best_params_
print(params)

In [None]:
# Rebuild an LDA from the selected hyper-parameters, fit it on the training
# split, and score it on the validation split.
tuned_keys = ('shrinkage', 'solver', 'store_covariance', 'tol')
LDA_C = LinearDiscriminantAnalysis(**{key: params[key] for key in tuned_keys})

## Fitting
LDA_C.fit(X_train, y_train)

## Evaluate
evaluate_model(LDA_C, X_val, y_val, 'LinearDiscriminantAnalysis', 'Actual Data')

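We obtain good accuracy using Linear Discriminant Analysis on the actual (non-resampled) dataset. Because the target is imbalanced, accuracy and weighted averages are dominated by the majority class, so the per-class recall for stroke cases in the classification report should be checked as well.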