In [None]:
#################################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import pandas_profiling as pdp
import gc
gc.enable()
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import scikitplot as skplt
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# Load the stroke dataset from a CSV located under the relative ../input directory.
data = pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

# EDA

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().transpose()

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Print the column dtypes and non-null counts of the raw frame.
data.info()

## Visualization

In [None]:
# Distribution of the numeric columns on a 3x2 grid.
# A KDE overlay is drawn only for the continuous measurements. hypertension,
# heart_disease and stroke are presumably 0/1 indicators (verify against the
# data), for which a smoothed density between the two values is meaningless,
# so they get one bar per distinct value instead.
hist_cols = ["age", "hypertension", "heart_disease", "avg_glucose_level", "bmi", "stroke"]
continuous_cols = {"age", "avg_glucose_level", "bmi"}

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18, 14))
for col, ax in zip(hist_cols, axes.ravel()):
    is_continuous = col in continuous_cols
    sns.histplot(data=data, x=col, kde=is_continuous, discrete=not is_continuous, ax=ax)
    ax.set_title(col)
fig.tight_layout()
plt.show()

In [None]:
# Category counts for the five categorical features (first five panels, filled
# row by row), followed by the stroke histogram split by gender in the last panel.
count_cols = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18, 14))
for panel, col in zip(axes.ravel(), count_cols):
    sns.countplot(data=data, palette="Set3", x=col, ax=panel)
sns.histplot(data=data, x="stroke", hue="gender", kde=True, ax=axes[2][1])

## Pandas profiling

In [None]:
# Build the profiling report in minimal mode and display it as the cell output.
report = pdp.ProfileReport(
    data,
    title="Stroke Report",
    minimal=True,
)
report

In [None]:
# Pie chart of the target distribution; the class counts are computed once
# and reused for both the wedge sizes and their labels.
stroke_counts = data['stroke'].value_counts(sort=True)
labels = stroke_counts.index
sizes = stroke_counts
colors = ["lightblue", "pink"]

plt.figure(figsize=(7, 7))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.title('Stroke pie')
plt.show()

In [None]:
# Pairwise scatter matrix coloured by the stroke label, with KDE diagonals;
# contour lines (4 levels) are then overlaid on the lower triangle.
g = sns.pairplot(data=data, hue="stroke", diag_kind="kde")
g.map_lower(sns.kdeplot, levels=4, color=".2")

In [None]:
# Re-print dtypes and non-null counts (same call as the earlier EDA cell).
data.info()

In [None]:
# Correlation heatmap of the raw frame.
# `data` still holds string columns here (gender, work_type, ...). Recent
# pandas versions raise on `DataFrame.corr()` when non-numeric columns are
# present, so restrict the computation to numeric columns explicitly.
numeric_corr = data.select_dtypes(include="number").corr()
plt.figure(figsize=(16, 8))
sns.heatmap(numeric_corr, annot=True, cmap="YlGnBu")

# Preprocessing

### Identifying columns with a single value

In [None]:
# For every column position, print the position and its number of distinct values.
for col_idx in range(data.shape[1]):
    distinct_values = data.iloc[:, col_idx].unique()
    print(col_idx, len(distinct_values))

### Identifying columns with few distinct values

In [None]:
# For every column position, print: position, number of distinct values, and
# that number as a percentage of the row count.
n_rows = data.shape[0]
for col_idx in range(data.shape[1]):
    n_unique = len(data.iloc[:, col_idx].unique())
    pct_unique = n_unique / n_rows * 100
    print(f"{col_idx:d}, {n_unique:d}, {pct_unique:.1f}%")

### Duplicate detection

In [None]:
# Boolean mask of fully duplicated rows; report whether any exist, then list them.
duplicado = data.duplicated()
has_duplicates = duplicado.any()
print(has_duplicates)
print(data.loc[duplicado])

In [None]:
# Count missing values per column before imputation.
data.isnull().sum()

### Imputation of Missing Values.

In [None]:
# Independent copy of the frame; the imputation experiments below read from data1.
data1 = data.copy()

In [None]:
# Feature matrix (bmi, avg_glucose_level) and target (stroke) used by the
# imputation experiments that follow.
X = data1.loc[:, ["bmi", "avg_glucose_level"]]
y = data1.loc[:, "stroke"]
X.head()

### Comparing statistical imputation strategies

In [None]:
# Compare the SimpleImputer strategies: impute X, fit a depth-limited
# regression tree on the imputed features, and report the RMSE of its
# predictions. The tree is scored on the same rows it was fitted on, so the
# figures describe training fit, not generalisation.
resultado = []  # R^2 of every experiment; later cells keep appending to this list
tipos = ['mean', 'median', 'most_frequent', 'constant']

for t in tipos:
    imputer = SimpleImputer(strategy=t)
    X_trans = imputer.fit_transform(X)
    tree = DecisionTreeRegressor(max_depth=10, random_state=42)
    tree.fit(X_trans, y)
    y_pred = tree.predict(X_trans)
    r2 = metrics.r2_score(y, y_pred)
    RMSE = np.sqrt(mean_squared_error(y, y_pred))
    resultado.append(r2)
    print("La estrategia utilizada--->",t)
    print("RMSE segun el tipo de estrategia:",RMSE)
    print("----------------------------------------")

### Comparing KNN imputation settings

In [None]:
# Same experiment as for the statistical strategies, but imputing with
# KNNImputer for several neighbour counts.
vecinos = [1, 3, 6, 9, 12]
for v in vecinos:
    KNN_imputer = KNNImputer(n_neighbors=v)
    X_knn = KNN_imputer.fit_transform(X)

    tree = DecisionTreeRegressor(max_depth=10, random_state=42)
    y_pred = tree.fit(X_knn, y).predict(X_knn)

    r2 = metrics.r2_score(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    RMSE = np.sqrt(mse)
    resultado.append(r2)

    print("La cantidad de vecinos utilizados--->",v)
    print("RMSE segun el tipo de estrategia:",RMSE)
    print("----------------------------------------")

### Comparing iterative imputation orders

In [None]:
# Same experiment with IterativeImputer, once per imputation order.
# The imputer is seeded so that the 'random' order (and therefore the whole
# comparison) gives the same numbers on every run.
tipos2 = ['ascending', 'descending', 'roman', 'arabic', 'random']
for t in tipos2:
    iter_imputer = IterativeImputer(imputation_order=t, random_state=42)
    iter_imputer.fit(X)
    X_iter = iter_imputer.transform(X)
    tree = DecisionTreeRegressor(max_depth=10, random_state=42)
    tree.fit(X_iter, y)
    y_pred = tree.predict(X_iter)
    r2 = metrics.r2_score(y, y_pred)
    RMSE = np.sqrt(mean_squared_error(y, y_pred))
    resultado.append(r2)
    print("El metodo utilizado es--->",t)
    print("RMSE segun el tipo de estrategia:",RMSE)
    print("----------------------------------------")

### Applying statistical imputation with `strategy="most_frequent"`

In [None]:
# Fill the missing values of the chosen column with its most frequent value.
# The column is addressed by name instead of the positional slice 9:10, so a
# change in column order cannot silently impute the wrong column.
# NOTE(review): position 9 is assumed to be "bmi" (the column used in the
# imputation experiments above) -- confirm against data.columns.
imputer = SimpleImputer(strategy="most_frequent")
data["bmi"] = imputer.fit_transform(data[["bmi"]]).ravel()

In [None]:
# Re-count missing values per column after the imputation step.
data.isnull().sum()

### Outliers

In [None]:
# Box plot of age, to eyeball outliers.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x="age", color="lightblue")

In [None]:
# Box plot of bmi, to eyeball outliers.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x="bmi", color="lightblue")

In [None]:
# Box plot of avg_glucose_level, to eyeball outliers.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x="avg_glucose_level", color="lightblue")

In [None]:
from scipy import stats

# Keep only rows whose |z-score| is below 3, one column after another; each
# z-score is computed on the rows that survived the previous filter.
# NOTE: re-running this cell filters the already-filtered frame again.
for col in ("age", "bmi", "avg_glucose_level"):
    data = data[np.abs(stats.zscore(data[col])) < 3]

# Boolean indexing returns a slice of the original frame; take an explicit
# copy so the column assignments in the encoding step operate on an
# independent frame and do not raise SettingWithCopyWarning.
data = data.copy()

In [None]:
# Box plot of age on the z-score-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x="age", color="lightblue")

In [None]:
# Box plot of bmi on the z-score-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x="bmi", color="lightblue")

In [None]:
# Box plot of avg_glucose_level on the z-score-filtered frame.
plt.figure(figsize=(8, 4))
sns.boxplot(data=data, x="avg_glucose_level", color="lightblue")

### Encoding.

In [None]:
# Binary encoding: 1 for the named category, 0 for every other value
# (including any value that is neither of the two expected ones).
data["gender"] = (data["gender"] == "Male").astype("int64")
data["ever_married"] = (data["ever_married"] == "Yes").astype("int64")
data["Residence_type"] = (data["Residence_type"] == "Urban").astype("int64")

# One-hot encode the multi-category columns; the dummy columns are appended
# after the remaining columns, smoking_status first and work_type second.
data = pd.get_dummies(data=data, columns=["smoking_status", "work_type"])

In [None]:
# Correlation heatmap of the encoded frame.
encoded_corr = data.corr()
plt.figure(figsize=(16, 8))
sns.heatmap(encoded_corr, annot=True, cmap="YlGnBu")

### Scaling.

In [None]:
# Candidate scalers compared in the next cells; standard_sc is reused later
# for the final train/test scaling.
robust_sc, standard_sc, minmax_sc = (
    preprocessing.RobustScaler(),
    preprocessing.StandardScaler(),
    preprocessing.MinMaxScaler(),
)

In [None]:
# Features: every column except the target and the row identifier.
X = data.drop(columns=["stroke", "id"])
# Target: the stroke label.
y = data.loc[:, "stroke"]

In [None]:
# Compare the three scalers: scale X, fit a depth-limited regression tree and
# report the RMSE of its predictions on the same rows it was fitted on.
# The loop variable is named `scaler` (not `x`) to avoid confusion with the
# feature matrix X, and the unused result of the separate fit call is gone.
for scaler in [robust_sc, standard_sc, minmax_sc]:
    X_new = scaler.fit_transform(X)
    tree = DecisionTreeRegressor(max_depth=10, random_state=42)
    tree.fit(X_new, y)
    y_pred = tree.predict(X_new)
    r2 = metrics.r2_score(y, y_pred)
    RMSE = np.sqrt(mean_squared_error(y, y_pred))
    resultado.append(r2)
    print("El escalado Utilizado--->",scaler)
    print("RMSE segun el tipo de estrategia:",RMSE)
    print("----------------------------------------")

### Resampling (SMOTE oversampling + random undersampling)

In [None]:
# Resampling pipeline: SMOTE first raises the minority/majority ratio to 0.1,
# then random undersampling of the majority class brings it to 0.5.
# Both steps are seeded so the resampled data set (and every metric computed
# from it) is identical on each Restart & Run All.
over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [None]:
X, y = pipeline.fit_resample(X, y)

In [None]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

In [None]:
X_train=standard_sc.fit_transform(X_train)
X_test=standard_sc.transform(X_test)

# Machine Learning

### XGB classifier

In [None]:
# XGBoost classifier and its search grid. Only n_estimators and learning_rate
# are actually searched; the remaining entries are single-value settings that
# select GPU training.
# The original dict listed the "updater" key twice; Python silently keeps
# only the last occurrence, so the duplicate is removed.
# NOTE(review): gpu_id / predictor / gpu_hist are reported as deprecated in
# recent XGBoost releases -- confirm against the installed version.
gbm = XGBClassifier(verbosity=1)
params_xgb = {
    "n_estimators": [500, 1000, 1500],
    "learning_rate": [0.1, 0.3, 0.6],
    "gpu_id": [0],
    "predictor": ["gpu_predictor"],
    "tree_method": ["gpu_hist"],
    "updater": ["grow_gpu_hist"],
    "sampling_method": ["gradient_based"],
}

In [None]:
# Exhaustive 3-fold grid search over params_xgb, using all available cores.
# NOTE(review): n_jobs=-1 starts parallel fits that presumably all target the
# same GPU -- confirm this does not exhaust GPU memory.
model_xgb = GridSearchCV(
    estimator=gbm,
    param_grid=params_xgb,
    cv=3,
    n_jobs=-1,
)
model_xgb.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
print(f"Best params: {model_xgb.best_params_}")
print(f"Best Score: {model_xgb.best_score_}\n")

In [None]:
# Full cross-validation results as a table, best-ranked candidates first.
scores = pd.DataFrame(model_xgb.cv_results_)
scores.sort_values("rank_test_score")

In [None]:
# Predict class labels with the refitted best XGB model on both splits.
y_train_pred_xgb = model_xgb.predict(X_train)
y_test_pred_xgb = model_xgb.predict(X_test)

# RMSE between true and predicted labels for each split.
mse_train_xgb = mean_squared_error(y_train, y_train_pred_xgb)
mse_test_xgb = mean_squared_error(y_test, y_test_pred_xgb)
rmse_train_xgb = np.sqrt(mse_train_xgb)
rmse_test_xgb = np.sqrt(mse_test_xgb)

print("RMSE en Train es:",rmse_train_xgb)
print("RMSE en Test es:",rmse_test_xgb)

In [None]:
# Accuracy of the XGB predictions on the train and test splits.
acc_train_xgb = accuracy_score(y_train, y_train_pred_xgb)
acc_test_xgb = accuracy_score(y_test, y_test_pred_xgb)
print("Accuracy train XGB: ", acc_train_xgb)
print("Accuracy test XGB: ", acc_test_xgb)

### MLP Classifier

In [None]:
# MLP classifier and its search grid.
# `learning_rate` (the schedule) is only used by the 'sgd' solver, so crossing
# it with 'lbfgs' and 'adam' just refits identical models three times. The
# grid is split into two sub-grids so that the schedule is searched for 'sgd'
# only: 120 candidates instead of 216, covering the same distinct models.
clf = MLPClassifier(random_state=42)

_mlp_shared = {
    "hidden_layer_sizes": [64, 128, 256],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "max_iter": [100, 200],
    "warm_start": [True],
}
params_MLP = [
    {
        **_mlp_shared,
        "solver": ["sgd"],
        "learning_rate": ["constant", "invscaling", "adaptive"],
    },
    {
        **_mlp_shared,
        "solver": ["lbfgs", "adam"],
    },
]

In [None]:
# Exhaustive 3-fold grid search over params_MLP, using all available cores.
model_MLP = GridSearchCV(
    estimator=clf,
    param_grid=params_MLP,
    cv=3,
    n_jobs=-1,
)
model_MLP.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
print(f"Best params: {model_MLP.best_params_}")
print(f"Best Score: {model_MLP.best_score_}\n")

In [None]:
# Full cross-validation results as a table, best-ranked candidates first.
scores = pd.DataFrame(model_MLP.cv_results_)
scores.sort_values("rank_test_score")

In [None]:
# Predict class labels with the refitted best MLP model on both splits.
y_train_pred_MLP = model_MLP.predict(X_train)
y_test_pred_MLP = model_MLP.predict(X_test)

# RMSE between true and predicted labels for each split.
mse_train_MLP = mean_squared_error(y_train, y_train_pred_MLP)
mse_test_MLP = mean_squared_error(y_test, y_test_pred_MLP)
rmse_train_MLP = np.sqrt(mse_train_MLP)
rmse_test_MLP = np.sqrt(mse_test_MLP)

print("RMSE en Train es:",rmse_train_MLP)
print("RMSE en Test es:",rmse_test_MLP)

In [None]:
# Accuracy of the MLP predictions on the train and test splits.
acc_train_MLP = accuracy_score(y_train, y_train_pred_MLP)
acc_test_MLP = accuracy_score(y_test, y_test_pred_MLP)
print("Accuracy train MLP: ", acc_train_MLP)
print("Accuracy test MLP: ", acc_test_MLP)