In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# EDA

In [None]:
# Load the raw stroke-prediction dataset from the Kaggle input mount.
STROKE_CSV_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(STROKE_CSV_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Rename 'Residence_type' to lower-case 'residence_type'
# (presumably to match the casing of the other columns - confirm against df.columns).
df = df.rename(columns={'Residence_type':'residence_type'})

In [None]:
# Encode gender as a binary flag: Male -> 1, Female -> 0.
# Any label that is not in the mapping ends up as a missing value.
gender_codes = {'Male': 1, 'Female': 0}
df['gender'] = df['gender'].map(gender_codes)

In [None]:
# Encode the ever-married status as a binary flag: Yes -> 1, No -> 0.
married_codes = {'Yes': 1, 'No': 0}
df['ever_married'] = df['ever_married'].map(married_codes)

In [None]:
# Encode the residence type as a binary flag: Rural -> 1, Urban -> 0.
residence_codes = {'Rural': 1, 'Urban': 0}
df['residence_type'] = df['residence_type'].map(residence_codes)

In [None]:
# Normalise smoking_status labels: lower-case them and replace spaces with underscores.
# The .str accessor is vectorised and propagates missing values instead of raising on them.
df['smoking_status'] = (
    df['smoking_status']
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

In [None]:
# Normalise work_type labels: lower-case them and replace hyphens with underscores.
# The .str accessor is vectorised and propagates missing values instead of raising on them.
df['work_type'] = (
    df['work_type']
    .str.lower()
    .str.replace('-', '_', regex=False)
)

In [None]:
# Box plot of BMI to eyeball its spread and outliers.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=df, x="bmi", color='green', ax=ax);

In [None]:
# Cap BMI at 50, then fill the missing BMI values with 28.4.
BMI_CAP = 50
BMI_FILL = 28.4
df["bmi"] = df["bmi"].clip(upper=BMI_CAP).fillna(BMI_FILL)

In [None]:
# One-hot encode the remaining non-numeric columns (pd.get_dummies with default arguments).
df = pd.get_dummies(df)

In [None]:
# Keep only fully populated rows: any row containing a missing value is removed.
df = df.dropna(axis=0, how='any')

# **Heatmap Correlation**

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(30, 20))
sns.heatmap(df.corr(), annot=True, ax=ax);

# Scaling the Features (Standardization)

In [None]:
# Standardise the continuous features to zero mean and unit variance.
std = StandardScaler()
columns = ['avg_glucose_level', 'bmi', 'age']
scaled = std.fit_transform(df[columns])
# Carry over df's own index: rows were filtered out earlier, so a fresh 0..n-1 index
# would attach the scaled values to the wrong rows when the frames are joined by index.
scaled = pd.DataFrame(scaled, columns=columns, index=df.index)
# Remove the unscaled originals; the scaled versions are re-attached in the next cell.
df = df.drop(columns=columns)

In [None]:
# Re-attach the standardised columns, matching rows on the index (left join keeps every df row).
df = df.join(scaled, how="left")
df

In [None]:
# Discard any row that still has a missing value after the scaled columns were attached.
df = df.dropna(axis=0, how='any')

# Class

In [None]:
# Separate the label from the predictors; 'id' is dropped together with the target.
df_target = df['stroke']
df_class = df.drop(columns=['id', 'stroke'])

# Splitting

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_class,
    df_target,
    test_size=0.3,
    random_state=11,
)

In [None]:
# Class balance of the held-out target.
y_test.value_counts()

# AdaBoost Classification

In [None]:
# Create the AdaBoost classifier. The weak learner is passed positionally because its
# keyword was renamed from `base_estimator` to `estimator` in newer scikit-learn releases.
ab_clf = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=100,
                            learning_rate=0.5, random_state=100)

# Train the AdaBoost model on the training split.
ab_clf.fit(X_train, y_train)
print("training....\n")

# Make predictions on the held-out test set, the same data the score below is computed on.
ab_pred_stroke = ab_clf.predict(X_test)
print('prediction: \n', ab_pred_stroke)

# get_params has to be called; printing the bare attribute only shows the bound method.
print('\nparms: \n', ab_clf.get_params())

# Mean accuracy on the test set, computed once and reused for the report.
ab_clf_score = ab_clf.score(X_test, y_test)
print("\nmean accuracy: %.2f" % ab_clf_score)

# Gradient Boosting (scikit-learn GradientBoostingClassifier)

In [None]:
# Fit scikit-learn's gradient boosting classifier (the variable keeps its existing name,
# which later cells rely on).
xgboost = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

# Mean accuracy on the training and on the test split.
xgboost_score = xgboost.score(X_train, y_train)
xgboost_test = xgboost.score(X_test, y_test)

# Test-set predictions and the confusion matrix derived from them.
y_pred = xgboost.predict(X_test)
cm = confusion_matrix(y_test,y_pred)

print('Training Score',xgboost_score)
print('Testing Score \n',xgboost_test)

# Confusion matrix rendered as a labelled heatmap.
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0','Actual:1'],
    columns=['Predicted:0','Predicted:1'],
)
fig, ax = plt.subplots(figsize=(14, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Greens", ax=ax);
print(accuracy_score(y_test,y_pred))

# SVM

In [None]:
# Fit a support vector classifier with its default hyperparameters.
svc = SVC(random_state=0).fit(X_train, y_train)

# Mean accuracy on the training and on the test split.
svc_score = svc.score(X_train, y_train)
svc_test = svc.score(X_test, y_test)

# Test-set predictions and the confusion matrix derived from them.
y_pred = svc.predict(X_test)
cm = confusion_matrix(y_test,y_pred)

print('Training Score',svc_score)
print('Testing Score \n',svc_test)

# Confusion matrix rendered as a labelled heatmap.
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0','Actual:1'],
    columns=['Predicted:0','Predicted:1'],
)
fig, ax = plt.subplots(figsize=(14, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Greens", ax=ax);
print(accuracy_score(y_test,y_pred))

# Random Forest Classifier

In [None]:
# Random forest with 100 trees. The seed is fixed, like the other models in this
# notebook, so the reported scores are reproducible from run to run.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)

# Mean accuracy on the training and on the test split.
forest_score = forest.score(X_train, y_train)
forest_test = forest.score(X_test, y_test)

# Test-set predictions and the confusion matrix derived from them.
y_pred = forest.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print('Training Score',forest_score)
print('Testing Score \n',forest_test)

# Confusion matrix rendered as a labelled heatmap.
plt.figure(figsize=(14,5))

conf_matrix = pd.DataFrame(data=cm,columns=['Predicted:0','Predicted:1'],index=['Actual:0','Actual:1'])
sns.heatmap(conf_matrix, annot=True,fmt='d',cmap="Greens");
print(accuracy_score(y_test,y_pred))

# Logistic Regression

In [None]:
# Fit a logistic regression with its default hyperparameters.
model = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the training and on the test split.
logistic_score = model.score(X_train, y_train)
logistic_test = model.score(X_test, y_test)

# Test-set predictions and the confusion matrix derived from them.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test,y_pred)

print('Training Score',logistic_score)
print('Testing Score \n',logistic_test)

# Confusion matrix rendered as a labelled heatmap.
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0','Actual:1'],
    columns=['Predicted:0','Predicted:1'],
)
fig, ax = plt.subplots(figsize=(14, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Greens", ax=ax);
print(accuracy_score(y_test,y_pred))

In [None]:
# List the hyperparameter names exposed by the gradient boosting model.
xgboost.get_params().keys()

In [None]:
# Pipeline with a single swappable 'classifier' step; each grid entry below replaces
# the placeholder estimator with one candidate model and its hyperparameter ranges.
pipe = Pipeline([('classifier' , RandomForestClassifier())])

# max_features cannot usefully exceed the number of input columns: depending on the
# scikit-learn version, larger values either make the fit fail or add nothing.
# The candidates are therefore capped at the width of the training frame.
n_features = X_train.shape[1]
max_features_grid = sorted({min(m, n_features) for m in range(6, 32, 5)})

param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : max_features_grid},
    {'classifier' : [SVC()],
     'classifier__C': [0.1,1, 10, 100], 
     'classifier__gamma': [1,0.1,0.01,0.001],
     'classifier__kernel': ['rbf', 'poly', 'sigmoid']},
    {'classifier' : [GradientBoostingClassifier()],
    'classifier__max_depth' : [3,4,5]}
]

# Grid search object: 5-fold cross-validation, candidates ranked by recall.
clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1, scoring='recall')

# Fit on the training split; fit() returns the fitted search object itself.
best_clf = clf.fit(X_train, y_train)

In [None]:
# Show the parameter combination that achieved the best cross-validated score.
best_clf.best_params_

In [None]:
# Fit an SVC with hand-entered hyperparameters.
# NOTE: the score variables keep their 'logistic_' prefix even though the model is an SVC.
model = SVC(C=100, gamma=0.1, kernel='sigmoid', probability=True).fit(X_train, y_train)

# Mean accuracy on the training and on the test split.
logistic_score = model.score(X_train, y_train)
logistic_test = model.score(X_test, y_test)

# Test-set predictions and the confusion matrix derived from them.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test,y_pred)

print('Training Score',logistic_score)
print('Testing Score \n',logistic_test)

# Confusion matrix rendered as a labelled heatmap, followed by the text metrics.
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0','Actual:1'],
    columns=['Predicted:0','Predicted:1'],
)
fig, ax = plt.subplots(figsize=(14, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Greens", ax=ax);
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Positional indices of the rows belonging to each class.
i_class0 = np.where(df['stroke'] == 0)[0]
i_class1 = np.where(df['stroke'] == 1)[0]

# Number of observations in each class.
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# Randomly keep as many class-0 rows as there are class-1 rows (sampling without
# replacement). A seeded generator makes the balanced sample repeatable across runs.
rng = np.random.default_rng(11)
i_class0_downsampled = rng.choice(i_class0, size=n_class1, replace=False)

# Stack all class-1 rows on top of the downsampled class-0 rows.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df_sample = pd.concat([df.iloc[i_class1], df.iloc[i_class0_downsampled]])

In [None]:
# Class balance of the downsampled frame.
df_sample.stroke.value_counts()

In [None]:
# Separate the label from the predictors in the balanced sample; 'id' is dropped as well.
df_sample_target = df_sample['stroke']
df_sample_class = df_sample.drop(columns=['id', 'stroke'])

In [None]:
# Re-split using the balanced sample: 30% held out, fixed seed for repeatability.
# This overwrites the train/test variables created from the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    df_sample_class,
    df_sample_target,
    test_size=0.3,
    random_state=11,
)

In [None]:
# Class balance of the held-out target.
y_test.value_counts()

In [None]:
# Pipeline with a single swappable 'classifier' step; each grid entry below replaces
# the placeholder estimator with one candidate model and its hyperparameter ranges.
pipe = Pipeline([('classifier' , RandomForestClassifier())])

# max_features cannot usefully exceed the number of input columns: depending on the
# scikit-learn version, larger values either make the fit fail or add nothing.
# The candidates are therefore capped at the width of the training frame.
n_features = X_train.shape[1]
max_features_grid = sorted({min(m, n_features) for m in range(6, 32, 5)})

param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : max_features_grid},
    {'classifier' : [SVC()],
     'classifier__C': [0.1,1, 10, 100], 
     'classifier__gamma': [1,0.1,0.01,0.001],
     'classifier__kernel': ['rbf', 'poly', 'sigmoid']},
    {'classifier' : [GradientBoostingClassifier()],
    'classifier__max_depth' : [3,4,5]}
]

# Grid search object: 5-fold cross-validation, candidates ranked by F1 score.
clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1, scoring='f1')

# Fit on the training split; fit() returns the fitted search object itself.
best_clf = clf.fit(X_train, y_train)

In [None]:
# Show the parameter combination that achieved the best cross-validated score.
best_clf.best_params_

In [None]:
# Fit an L1-penalised logistic regression with hand-entered hyperparameters.
model = LogisticRegression(
    C=0.08858667904100823,
    penalty='l1',
    solver='liblinear',
).fit(X_train, y_train)

# Mean accuracy on the training and on the test split.
logistic_score = model.score(X_train, y_train)
logistic_test = model.score(X_test, y_test)

# Test-set predictions and the confusion matrix derived from them.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test,y_pred)

print('Training Score',logistic_score)
print('Testing Score \n',logistic_test)

# Confusion matrix rendered as a labelled heatmap, followed by the text metrics.
conf_matrix = pd.DataFrame(
    data=cm,
    index=['Actual:0','Actual:1'],
    columns=['Predicted:0','Predicted:1'],
)
fig, ax = plt.subplots(figsize=(14, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Greens", ax=ax);
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

In [None]:
import pickle

# Persist the fitted classifier. The context manager closes the file handle as soon
# as the dump completes, so the pickle is fully flushed to disk.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Persist the fitted scaler. The context manager closes the file handle as soon
# as the dump completes, so the pickle is fully flushed to disk.
filename = 'std.pkl'
with open(filename, 'wb') as scaler_file:
    pickle.dump(std, scaler_file)

In [None]:
# Standardise one example with the fitted scaler. The values are labelled with the
# column names the scaler was fitted on, so their order is explicit and matches the fit.
std.transform(
    pd.DataFrame([[30, 23, 20]], columns=['avg_glucose_level', 'bmi', 'age'])
)

In [None]:
# Show the feature names, in the order the model was trained on them.
X_train.columns

In [None]:
# Class probabilities for one hand-built example. The row is labelled with the training
# columns, so a row of the wrong width fails at construction instead of inside the model,
# and the model receives the same feature names it was fitted with.
example = pd.DataFrame(
    [[1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, -1.6815247, -0.78960167, -1.027359]],
    columns=X_train.columns,
)
model.predict_proba(example)