In [None]:
!pip install -U dataprep

In [None]:
!pip install autoviz

In [None]:
!pip install evalml

In [None]:
pip install "dask[distributed]" --upgrade

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class
# One AutoViz instance, reused by each AutoViz cell further down the notebook.
AV = AutoViz_Class()

In [None]:
import dataprep as dp
from dataprep.eda import create_report, plot, correlation, plot_correlation, plot_missing

In [None]:
from evalml.preprocessing import split_data
from evalml.automl import AutoMLSearch
from evalml.utils import infer_feature_types

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from google.colab import files
import io
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RepeatedStratifiedKFold, RepeatedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, RobustScaler, QuantileTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix

In [None]:
# Colab upload prompt; the result is indexed by uploaded file name in the next cell
# and each entry is wrapped in io.BytesIO there, i.e. it holds the raw file bytes.
file = files.upload()

In [None]:
# Parse the uploaded CSV once. df_train is modified by the cleaning / scaling /
# encoding cells below, so keep an independent (deep) copy of the raw frame for
# the EvalML section instead of parsing the same bytes a second time.
df_train = pd.read_csv(io.BytesIO(file['healthcare-dataset-stroke-data.csv']))
df_train_copy = df_train.copy()

## Train

In [None]:
df_train

## EDA before Cleaning



*   DataPrep


In [None]:
df_train.describe()

In [None]:
df_train.isna().sum()

In [None]:
df_train.info()

In [None]:
report = create_report(df_train)
report

In [None]:
plot(df_train)

In [None]:
plot_missing(df_train)

In [None]:
plot_correlation(df_train)

In [None]:
# 'bmi' is only imputed later (Cleaning section), so it may still contain NaN here.
# matplotlib's boxplot does not skip NaN (the box comes out empty), so drop the
# missing values for this plot only; df_train itself is left unchanged.
plt.boxplot(df_train['bmi'].dropna())



---





*   Autoviz



In [None]:
af = AV.AutoViz("",dfte=df_train,verbose=1,depVar='stroke', lowess=True)



---



---



## Labels

In [None]:
# pop() removes the 'stroke' column from df_train in place and returns it as the
# label Series, so re-running this cell without reloading the data raises KeyError.
labels_train = df_train.pop('stroke')
labels_train

## Cleaning

In [None]:
# 'id' is presumably just a row identifier; it is not used as a feature.
df_train = df_train.drop(columns='id')
# Assign the imputed column back instead of calling fillna(inplace=True) on the
# df_train['bmi'] selection: that form is chained in-place assignment, which newer
# pandas versions warn about and which leaves df_train unmodified under copy-on-write.
# NOTE(review): the mean is taken over all rows, i.e. before the train/test split.
df_train['bmi'] = df_train['bmi'].fillna(df_train['bmi'].mean())

In [None]:
# Age brackets are half-open on the right (right=False): [0, 2) -> 'Infant',
# [2, 4) -> 'Toddler', ..., [50, 110) -> 'Senior'.
bins = [0, 2, 4, 13, 20, 50, 110]
labels = ['Infant', 'Toddler', 'Kid', 'Teen', 'Adult', 'Senior']
# Stored with object dtype (not categorical) because the next cell recognises
# categorical columns by `dtype == object`.
df_train['AgeGroup'] = pd.cut(
    df_train['age'],
    bins=bins,
    labels=labels,
    right=False,
).astype(object)

In [None]:
# Split the feature columns by dtype:
#   category_column  - object (string) columns, label-encoded later
#   numerical_column - integer / float columns, scaled later
# The pandas dtype predicates are used instead of `dtype == int` / `dtype == float`:
# those comparisons only match the platform's default width (e.g. they miss int64
# columns where the default int is 32-bit, and never match int32 / float32).
category_column = [
    str(col) for col in df_train.columns
    if df_train[col].dtype == object
]
numerical_column = [
    str(col) for col in df_train.columns
    if pd.api.types.is_integer_dtype(df_train[col])
    or pd.api.types.is_float_dtype(df_train[col])
]



---



In [None]:
df_train

## EDA after Cleaning

In [None]:
df_train.describe()

In [None]:
df_train.isna().sum()

In [None]:
df_train.info()

In [None]:
report_after_cleaning = create_report(df_train)
report_after_cleaning

In [None]:
plot(df_train)

In [None]:
plot_missing(df_train)

In [None]:
plot_correlation(df_train)

In [None]:
plt.boxplot(df_train['bmi'])



*   Autoviz



In [None]:
af_after_cleaning = AV.AutoViz("",dfte=df_train,verbose=1, lowess=True)



---



## Encoding



*   Scaler



In [None]:
# Single-step pipelines; in the visible notebook they are only referenced by the
# commented-out ColumnTransformer sketch at the end of this cell.
standard_transformer = Pipeline(steps=[('standard', StandardScaler())])
robust_transformer = Pipeline(steps=[('robust', RobustScaler())])

RS = RobustScaler()
SS = StandardScaler()
QT = QuantileTransformer(n_quantiles=10, random_state=0)
MMS = MinMaxScaler()

# NOTE(review): every scaler is fit on the whole frame, i.e. before the train/test
# split further down, so statistics of the future test rows leak into the features.
for col in numerical_column:
  # hypertension and heart_disease are passed through unscaled.
  if col in ('hypertension', 'heart_disease'):
    continue
  # robust -> quantile -> min-max -> standard, each one refit on the output of the
  # previous step and written back into the same column.
  for scaler in (RS, QT, MMS, SS):
    df_train[[col]] = scaler.fit_transform(df_train[[col]])

  # preprocessor = ColumnTransformer(
  #         remainder='passthrough', #passthough features not listed
  #         transformers=[
  #             ('robust', robust_transformer , [col]),
  #             ('standard', standard_transformer , [col])
  #         ])



1.   Technique 1: One-hot encode the categorical columns (currently disabled — the cell below is commented out)





*   One-Hot Encoding



In [None]:
# df_train = pd.get_dummies(df_train, columns=category_column)
# for col in df_train.columns:
#   if df_train[col].dtype == np.uint8:
#     df_train[col] = df_train[col].astype(int)



2.   Technique 2: Using LabelEncoder() to encode categorical columns





*   Label Encoder



In [None]:
# Replace every object-typed column by integer codes. The single encoder is refit
# for each column, so after the loop `le` only holds the classes of the last one.
le = LabelEncoder()
for cat_col in category_column:
  codes = le.fit_transform(df_train[cat_col])
  df_train[cat_col] = codes

## EDA after Encoding

In [None]:
df_train.describe()

In [None]:
df_train.info()

In [None]:
report_after_encoding = create_report(df_train)
report_after_encoding

In [None]:
plot(df_train)

In [None]:
plot_missing(df_train)

In [None]:
plot_correlation(df_train)

In [None]:
plt.boxplot(df_train['bmi'])



*   Autoviz



In [None]:
af_after_encoding= AV.AutoViz("",dfte=df_train,verbose=1, lowess=True)



---



## Split

In [None]:
# 80/20 hold-out split.
#   stratify     - keep the stroke / no-stroke ratio identical in both parts
#   random_state - fixed seed so that Restart & Run All reproduces the same split
# Each part gets a fresh 0..n-1 index (the original row index is discarded).
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df_train,
        labels_train,
        test_size=0.2,
        stratify=labels_train,
        random_state=42,
    )
)

In [None]:
x_train

## All Classifier Testing


*   Different Classifier Testing

---



---







*   Classifier



In [None]:
# Hyper-parameters shared by the XGBoost instances created in this cell.
xgb_params = dict(objective='binary:logistic', learning_rate=0.4, n_estimators=100, max_depth=3)

# Named base learners: these instances are the members of the voting ensembles.
KNC = KNeighborsClassifier()
XGB = xgb.XGBClassifier(**xgb_params)
BNB = BernoulliNB()
GNB = GaussianNB()
DTC = DecisionTreeClassifier()
ETC = ExtraTreeClassifier()
RFC = RandomForestClassifier()
ETsC = ExtraTreesClassifier()
GBC = GradientBoostingClassifier()

# (name, estimator) pairs in the format VotingClassifier expects.
estimator = [
    ('kneighbor', KNC),
    ('xgb', XGB),
    ('bernoulli', BNB),
    ('gaussian', GNB),
    ('decisiontree', DTC),
    ('extratree', ETC),
    ('randomforest', RFC),
    ('extratrees', ETsC),
    ('gradientboosting', GBC),
]

# Stand-alone classifiers evaluated one by one; these are fresh instances,
# separate from the voting members above.
classifier = [
    KNeighborsClassifier(),
    xgb.XGBClassifier(**xgb_params),
    RidgeClassifier(),
    SGDClassifier(),
    PassiveAggressiveClassifier(),
    BernoulliNB(),
    GaussianNB(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    RandomForestClassifier(),
    LinearSVC(max_iter=5000),
    SVC(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
]

# Ensemble models: hard and soft voting over `estimator`, plus boosting and
# bagging (no base estimator passed, so each uses its library default).
ensemble = [
    VotingClassifier(estimators=estimator, voting='hard'),
    VotingClassifier(estimators=estimator, voting='soft'),
    AdaBoostClassifier(n_estimators=100),
    BaggingClassifier(n_estimators=100),
]



---





*   Ensemble


In [None]:
# One confusion-matrix panel per ensemble model, stacked vertically.
fig, ax = plt.subplots(nrows=len(ensemble), ncols=1, figsize=(16, 40))
# The splitter carries no per-model state, so build it once for all models.
kfold_ensemble = StratifiedKFold(n_splits=10)
for k, model_ensemble in enumerate(ensemble):
  model_ensemble = model_ensemble.fit(x_train, y_train)
  # Mean 10-fold cross-validated accuracy, computed on the training split only.
  results_kfold_ensemble = cross_val_score(model_ensemble, x_train, y_train, cv=kfold_ensemble)
  print("Accuracy: %.2f%%" % (results_kfold_ensemble.mean()*100.0))
  # Confusion matrix on the held-out test split.
  prediction_ensemble = model_ensemble.predict(x_test)
  confusion_matrix_ensemble = pd.crosstab(y_test, prediction_ensemble, rownames=['Actual'], colnames=['Predicted'])
  # fmt='d': the cells are integer counts; seaborn's default '.2g' would print
  # any count >= 100 in scientific notation (e.g. 9.7e+02).
  sb.heatmap(confusion_matrix_ensemble, annot=True, fmt='d', cmap='PuBu', ax=ax[k])
  ax[k].set_title(model_ensemble)
  print("{} Done".format(model_ensemble))
  print('\n')
# Lay the figure out once, after every panel has been drawn.
plt.tight_layout()



*   Individual (non-ensemble) classifier check



In [None]:
# One confusion-matrix panel per stand-alone classifier, stacked vertically.
fig, axes = plt.subplots(nrows=len(classifier), ncols=1, figsize=(16, 60))
# The splitter carries no per-model state, so build it once for all models.
kfold_normal = StratifiedKFold(n_splits=10)
for i, est in enumerate(classifier):
  print(est)
  model_normal = est.fit(x_train, y_train)
  # Mean 10-fold cross-validated accuracy, computed on the training split only.
  results_kfold_normal = cross_val_score(model_normal, x_train, y_train, cv=kfold_normal)
  print("Accuracy: %.2f%%" % (results_kfold_normal.mean()*100.0))
  # Confusion matrix on the held-out test split.
  prediction_normal = model_normal.predict(x_test)
  confusion_matrix_normal = pd.crosstab(y_test, prediction_normal, rownames=['Actual'], colnames=['Predicted'])
  # fmt='d': the cells are integer counts; seaborn's default '.2g' would print
  # any count >= 100 in scientific notation (e.g. 9.7e+02).
  sb.heatmap(confusion_matrix_normal, annot=True, fmt='d', cmap='PuBu', ax=axes[i])
  axes[i].set_title(model_normal)
  print('\n')
# Lay the figure out once, after every panel has been drawn.
plt.tight_layout()



*   Pipeline


In [None]:
# for est in classifier:
#   print(est)
  
#   pipe = Pipeline([
#                  ('robust', RobustScaler()),
#                  ('standard', StandardScaler()),
#                  ('estimator', est)
#   ])
#   model_pipe = pipe.fit(x_train, y_train)
#   kfold_pipe = StratifiedKFold(n_splits=10)
#   results_kfold_pipe =cross_val_score(model_pipe, x_train, y_train, cv=kfold_pipe)
#   print("Accuracy: %.2f%%" % (results_kfold_pipe.mean()*100.0))
#   print('\n')



---



---



## Actual Classifier

In [None]:
# Earlier variant kept for reference: the same XGBoost model behind two scalers.
# model = Pipeline([
#                  ('robust', RobustScaler()),
#                  ('standard', StandardScaler()),
#                  ('xgb', xgb.XGBClassifier(objective='binary:logistic', learning_rate=0.4, n_estimators=100, max_depth=3))
# ])
# Final model: XGBoost on its own (no scaling step inside the model).
model = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.4,
    n_estimators=100,
    max_depth=3,
)



*   Fit



In [None]:
model = model.fit(x_train, y_train)



*   K-Fold CV



In [None]:
# 10-fold stratified cross-validation of the final model on the training split
# only (x_test is not touched here); prints the mean fold accuracy in percent.
kfold1 = StratifiedKFold(n_splits=10)
results_kfold =cross_val_score(model, x_train, y_train, cv=kfold1)
print("Accuracy: %.2f%%" % (results_kfold.mean()*100.0))



*   Predict



In [None]:
f2 = model.predict(x_test)

## Result

In [None]:
# Actual vs. predicted labels for the held-out test split; reset_index() keeps
# the row position as an 'index' column, as before.
final = pd.DataFrame({'stroke_actual': y_test, 'stroke_predicted': f2}).reset_index()
# Deliberately NOT named `confusion_matrix`: that name is the function imported
# from sklearn.metrics at the top of the notebook and would be overwritten.
result_confusion = pd.crosstab(final['stroke_actual'], final['stroke_predicted'], rownames=['Actual'], colnames=['Predicted'])
# fmt='d': the cells are integer counts; seaborn's default '.2g' would print
# any count >= 100 in scientific notation.
sb.heatmap(result_confusion, annot=True, fmt='d', cmap='PuBu')



---



---



## Using EvalML library





*   Labels EvalML




In [None]:
# pop() removes 'stroke' from df_train_copy in place and returns it as the label
# Series, so re-running this cell without reloading the data raises KeyError.
labels_train_copy = df_train_copy.pop('stroke')
labels_train_copy



*   Split



In [None]:
x = infer_feature_types(df_train_copy)
x

In [None]:
# NOTE(review): this rebinds y_train, replacing the y_train produced by
# train_test_split in the Split section; re-run that cell before re-running any of
# the scikit-learn / XGBoost cells above.
# NOTE(review): only 'stroke' was popped from df_train_copy ('id' was dropped from
# df_train, not from this copy), so 'id' is still among the features here - confirm
# that this is intended.
X_train, X_holdout, y_train, y_holdout = split_data(df_train_copy, labels_train_copy, problem_type='binary', test_size=.2)

In [None]:
automl = AutoMLSearch(X_train=X_train, y_train=y_train,
                      problem_type='binary', objective='auto', max_batches=1)

In [None]:
automl.search()

In [None]:
automl.rankings

In [None]:
automl.describe_pipeline(3)

In [None]:
pipeline = automl.get_pipeline(3)
print(pipeline.parameters)

In [None]:
pipeline = automl.best_pipeline
pipeline.score(X_holdout, y_holdout, ["f1","auc",])

In [None]:
pipeline.graph()