In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: matplotlib's dark background, then seaborn's 'whitegrid' axes style.
plt.style.use('dark_background')
sns.set_style('whitegrid')

# Load the stroke-prediction CSV from the Kaggle input directory.
dataset = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

# Bare last expression: the frame is rendered as the cell output.
dataset

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
dataset.info()

In [None]:
# Remove the 'id' column. errors='ignore' keeps this cell re-runnable: a second
# execution finds no 'id' column and becomes a no-op instead of raising KeyError.
dataset = dataset.drop(columns=['id'], errors='ignore')
dataset

In [None]:
# Re-apply the same plot styling that the first cell sets.
plt.style.use('dark_background')
sns.set_style('whitegrid')

# Pairwise relationship grid, coloured by the 'stroke' column.
sns.pairplot(dataset, hue ="stroke", palette ='Accent')

In [None]:
# describe() restricted to the object-dtype columns.
desp = dataset[dataset.columns[(dataset.dtypes == 'object')]].describe()
# Sorted names of those columns; the histogram cell below iterates over this array.
despCol = np.sort(desp.columns)
despCol

In [None]:
# 2x2 grid that receives the first four columns listed in despCol.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(25, 10))

plt.style.use('dark_background')
sns.set_style('whitegrid')

axesList = [ax1, ax2, ax3, ax4]

# zip stops after the four available axes, so at most four columns are drawn here.
for axis, column in zip(axesList, despCol):
    sns.histplot(data = dataset, x = column, hue = dataset['stroke'], palette = 'Accent', label = column, ax = axis)

# The last column of despCol gets its own full-width figure.
lastColumn = despCol[-1]
fig, ax = plt.subplots(1, 1, figsize=(25, 5))
sns.histplot(data = dataset, x = lastColumn, hue = dataset['stroke'], palette = 'Accent', label = lastColumn, ax = ax)

# Print the value counts of every column in despCol.
for column in despCol:
    print ('\n', column, ' : \n \n \n', dataset[column].value_counts())

In [None]:
# describe() restricted to the int64 columns.
# Note: this rebinds desp and despCol, replacing the object-column values computed earlier.
desp = dataset[dataset.columns[(dataset.dtypes == 'int64')]].describe()
despCol = np.sort(desp.columns)
despCol

In [None]:
# Joint KDE plot with 'stroke' on the x axis and 'age' on the y axis, split by gender.
sns.jointplot(data = dataset, y = 'age', x='stroke', hue='gender', kind='kde')

In [None]:
# Re-check dtypes and non-null counts before cleaning.
dataset.info()

In [None]:
# Display the frame again before columns are removed.
dataset

In [None]:
# Remove 'Residence_type'. Rebinding (rather than inplace=True) follows the way
# 'id' is dropped earlier, and errors='ignore' makes a second run of this cell
# a no-op instead of a KeyError.
dataset = dataset.drop(columns=['Residence_type'], errors='ignore')

In [None]:
# Descriptive statistics of the remaining columns (pandas' default describe() selection).
dataset.describe()

In [None]:
# How many bmi values are missing (True) versus present (False).
dataset['bmi'].isna().value_counts()

In [None]:
# Index labels of the rows whose bmi is missing.
bmiFilter = dataset['bmi'][dataset['bmi'].isna()].index
# bmiFilter holds labels, so select with .loc; .iloc is positional and only
# agrees with labels while the index happens to be an untouched RangeIndex.
dataset.loc[bmiFilter, : ]

In [None]:
# Index labels of every row with age below 20.
ageFilter = dataset.loc[dataset['age'] < 20].index

In [None]:
# ageFilter holds index labels, so use .loc (label-based) rather than .iloc.
# Shows whether stroke == 1 for rows 162 and 245.
# NOTE(review): 162 and 245 are hard-coded labels - presumably the under-20 rows
# with a stroke; confirm against the data.
dataset.loc[ageFilter, 'stroke'].eq(1).loc[[162, 245]]

In [None]:
# Only the last expression of a cell is rendered, so the row lookup must be
# displayed explicitly or its result is thrown away. `display` is provided by
# the IPython kernel. The rows are addressed by label (.loc), matching how
# 162 and 245 are used as labels in the previous cell.
display(dataset.loc[[162, 245], : ])
# describe() of the object-dtype columns, rendered as the cell output.
dataset[dataset.columns[(dataset.dtypes == 'object')]].describe()

In [None]:
# Number of stroke cases per level of every object-dtype column.
# 'stroke' is selected before aggregating so only that column is summed;
# summing the whole frame first would also aggregate the string columns.
for value in dataset.columns[dataset.dtypes == 'object']:
    print ('\n \n \n \n', dataset.groupby(value)['stroke'].sum())

# Random Sample Imputation

In [None]:
# Number of missing bmi values that the next cell fills in.
dataset['bmi'].isna().sum()

In [None]:
# Random-sample imputation: fill each missing bmi with a value drawn (without
# replacement) from the observed bmi values.
missingIndices = dataset[dataset['bmi'].isna()].index
# random_state is fixed (0, the seed the resamplers in this notebook use) so the
# imputed values are identical on every Restart & Run All.
randomSample = dataset['bmi'].dropna().sample(len(missingIndices), random_state = 0)
# Re-label the drawn values with the labels of the missing rows so the
# assignment below aligns row by row.
randomSample.index = missingIndices
dataset.loc[missingIndices, 'bmi'] = randomSample

In [None]:
# Inspect the frame after the bmi imputation.
dataset

In [None]:
# Correlate the numeric columns only. The categorical columns are still
# object-dtype at this point, and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0 (older versions silently dropped those columns).
sns.heatmap(dataset.select_dtypes(np.number).corr())

In [None]:
# Re-check dtypes and non-null counts after imputation.
dataset.info()

In [None]:
# Index labels of the rows whose gender is 'Other'.
# A boolean mask selects by value; Series.filter matches index labels, not
# values, so it cannot be used for this lookup.
dataset[dataset['gender'] == 'Other'].index

In [None]:
# Remove every row whose gender is 'Other'. This mutates dataset in place and
# keeps the remaining index labels as they were (no reset_index is done here).
dataset.drop(dataset[dataset['gender'] == 'Other'].index, axis = 0, inplace = True)

In [None]:
# Preview the numeric columns.
dataset.select_dtypes(np.number)

In [None]:
# Preview the object-dtype (categorical) columns. The dtype is named with the
# string 'object' because the np.object alias was removed in NumPy 1.24.
dataset.select_dtypes('object')

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

def labelEncoder(dataset, colName):
    """Return the integer label encoding of one column.

    Args:
        dataset: frame (or mapping) that holds the column.
        colName: key of the column to encode.

    Returns:
        The encoded values produced by a fresh ``LabelEncoder`` fitted on
        that same column.
    """
    column = dataset[colName]
    return LabelEncoder().fit_transform(column)

In [None]:
# Replace every object-dtype column with its integer label encoding.
# 'object' is passed as a string because the np.object alias was removed in
# NumPy 1.24 and raises AttributeError there.
for col in dataset.select_dtypes('object').columns:
    dataset[col] = labelEncoder(dataset, col)

# Functions

In [None]:
from imblearn.under_sampling import RandomUnderSampler

def underSampling():
    """Randomly under-sample the global ``dataset``.

    Every column except the last is passed as the features and the last
    column as the target.

    Returns:
        The ``(features, target)`` pair returned by
        ``RandomUnderSampler(random_state=0).fit_resample``.
    """
    features = dataset.iloc[:, :-1]
    target = dataset.iloc[:, -1]
    sampler = RandomUnderSampler(random_state = 0)
    return sampler.fit_resample(features, target)

In [None]:
from imblearn.over_sampling import SMOTE

def overSampling():
    """Over-sample the global ``dataset`` with SMOTE.

    Every column except the last is passed as the features and the last
    column as the target.

    Returns:
        The ``(features, target)`` pair returned by
        ``SMOTE(random_state=0).fit_resample``.
    """
    features = dataset.iloc[:, :-1]
    target = dataset.iloc[:, -1]
    sampler = SMOTE(random_state = 0)
    return sampler.fit_resample(features, target)

In [None]:
from sklearn.model_selection import train_test_split

def trainTestSplit(x, y, train_size = 0.80, random_state = 0):
    """Split features and target into train and test partitions.

    Args:
        x: feature matrix.
        y: target values aligned with ``x``.
        train_size: fraction of the rows placed in the training partition
            (defaults to 0.80).
        random_state: seed for the shuffle; fixed by default so the split,
            and every score derived from it, is reproducible. Pass ``None``
            for a different split on every call.

    Returns:
        ``x_train, x_test, y_train, y_test`` as returned by
        ``train_test_split``.
    """
    return train_test_split(x, y, train_size = train_size, random_state = random_state)

In [None]:
from sklearn.metrics import classification_report

def classificationReport(y_test, y_pred):
    """Build the scikit-learn classification report for a set of predictions.

    Args:
        y_test: forwarded as the first argument of ``classification_report``.
        y_pred: forwarded as the second argument.

    Returns:
        Whatever ``classification_report`` returns for the two inputs.
    """
    report = classification_report(y_test, y_pred)
    return report

In [None]:
from sklearn.metrics import confusion_matrix

def confustionMatrix(y_test, y_pred):
    """Summarise a confusion matrix as total / correct / incorrect counts.

    All three counts are derived from the confusion matrix itself, so the
    function does not depend on any global test set and also works when the
    matrix is not 2x2.

    Args:
        y_test: forwarded as the first argument of ``confusion_matrix``.
        y_pred: forwarded as the second argument.

    Returns:
        A 6-tuple alternating labels and counts:
        ``('Total : ', total, '  Truth : ', correct, '  Error : ', wrong)``.
    """
    confusion = confusion_matrix(y_test, y_pred)
    total = int(confusion.sum())
    # The diagonal holds the samples whose two labels agree.
    truth = int(confusion.trace())
    return ('Total : ', total, '  Truth : ', truth, '  Error : ', total - truth)

# Model Creation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier as xgb
from lightgbm import LGBMClassifier as lgbm
from catboost import CatBoostClassifier

import warnings
# Silence every warning for the remainder of the notebook.
warnings.filterwarnings('ignore')

# Registry of the classifiers compared below, keyed by display name.
# Insertion order is the order used for training, plotting and reporting.
models = dict(
    logisticRegression = LogisticRegression(solver = "liblinear"),
    SVM = SVC(),
    adaBoost = AdaBoostClassifier(),
    gbC = GradientBoostingClassifier(),
    xgb = xgb(),
    lgbm = lgbm(),
    catBoost = CatBoostClassifier(logging_level = "Silent"),
    randomForest = RandomForestClassifier(),
    decisionTree = DecisionTreeClassifier(),
)

# Function Model Creation

In [None]:
from sklearn.metrics import accuracy_score

def modelTraining(x_train, y_train, x_test, y_test):
    """Fit every estimator in the global ``models`` dict and score it.

    Args:
        x_train, y_train: data each estimator is fitted on.
        x_test, y_test: data each estimator is evaluated on.

    Returns:
        ``(accScore, predictedValues)``: two lists in ``models`` order.
        ``accScore`` holds the accuracy as a percentage truncated to an int;
        ``predictedValues`` holds each estimator's predictions for ``x_test``.
    """
    scores, predictions = [], []

    for estimator in models.values():
        estimator.fit(x_train, y_train)
        predicted = estimator.predict(x_test)
        predictions.append(predicted)
        # int() truncates, e.g. 0.879 -> 87.
        scores.append(int(accuracy_score(predicted, y_test) * 100))

    return scores, predictions

# Under Sampling

In [None]:
# Under-sample the whole dataset: x receives the resampled feature columns,
# y the resampled last column.
# NOTE(review): resampling happens before the train/test split below.
x, y = underSampling()

In [None]:
# Split the under-sampled data into train and test partitions.
x_train, x_test, y_train, y_test = trainTestSplit(x, y)

In [None]:
# Fit every model in `models`; collect accuracy percentages and test-set predictions.
accScore, predictedValues = modelTraining(x_train, y_train, x_test, y_test)

In [None]:
# Bar chart of the accuracy (in percent) of every model.
plt.figure(figsize = (25, 8))
ax = sns.barplot(x = list(models.keys()), y = accScore)
# Label each bar with its height. The bar coordinates use their own names so
# the loop does not overwrite the notebook-level x / y (the resampled data).
for bar in ax.patches:
    width, height = bar.get_width(), bar.get_height()
    barX, barY = bar.get_xy()
    ax.annotate(f'{round(height,2)}%', (barX + width/2, barY + height*1.02), ha='center', fontsize = 'x-large')

# Report the first model that reaches the highest accuracy.
bestScore = max(accScore)
print (list(models.keys())[accScore.index(bestScore)], " : " , bestScore, " %")

In [None]:
from sklearn.metrics import classification_report

# Per-model classification report and confusion summary.
# Both helpers take the ground truth first and the predictions second; passing
# them the other way round swaps precision and recall in the report.
for modelName, predicted in zip(models, predictedValues):
    print (' \n \n ', modelName, ' : \n \n', classificationReport(y_test, predicted), '\n \n ', 'Confusion Matrix : \n \n', confustionMatrix(y_test, predicted))

# Over Sampling

In [None]:
# Over-sample the whole dataset with SMOTE: x receives the resampled feature
# columns, y the resampled last column.
# NOTE(review): resampling happens before the train/test split below, so the
# test partition is drawn from resampled data - confirm this is intended.
x, y = overSampling()

In [None]:
# Split the over-sampled data into train and test partitions.
x_train, x_test, y_train, y_test = trainTestSplit(x, y)

In [None]:
# Refit every model in `models` on the over-sampled split; this rebinds accScore and predictedValues.
accScore, predictedValues = modelTraining(x_train, y_train, x_test, y_test)

In [None]:
# Bar chart of the accuracy (in percent) of every model.
plt.figure(figsize = (25, 8))
ax = sns.barplot(x = list(models.keys()), y = accScore)
# Label each bar with its height. The bar coordinates use their own names so
# the loop does not overwrite the notebook-level x / y (the resampled data).
for bar in ax.patches:
    width, height = bar.get_width(), bar.get_height()
    barX, barY = bar.get_xy()
    ax.annotate(f'{round(height,2)}%', (barX + width / 2, barY + height * 1.02), ha='center', fontsize = 'x-large')

# Report the first model that reaches the highest accuracy.
bestScore = max(accScore)
print (list(models.keys())[accScore.index(bestScore)], " : " , bestScore, " %")

In [None]:
from sklearn.metrics import classification_report

# Per-model classification report and confusion summary.
# Both helpers take the ground truth first and the predictions second; passing
# them the other way round swaps precision and recall in the report.
for modelName, predicted in zip(models, predictedValues):
    print (' \n \n ', modelName, ' : \n \n', classificationReport(y_test, predicted), '\n \n ', 'Confusion Matrix : \n \n', confustionMatrix(y_test, predicted))