### Stroke Prediction

#### 1) Import our tools

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Model evaluations
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, classification_report

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the stroke dataset CSV into a DataFrame.
df= pd.read_csv("../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

# Preview the last rows of the frame.
df.tail()

In [None]:
# Show the dtype of every column to see which ones need converting.
df.dtypes

### From the dataset

   * We have 12 columns with 5110 rows
    
   * We have multiple data types and `NaN` values

### Data cleaning

In [None]:
# Report how many missing values each column contains.
print(f"Nan Values: {df.isna().sum()}")

In [None]:
# Despite its name, `avg` holds the *median* of the `bmi` column.
avg = df["bmi"].median()
print(f"Average:{avg}")

In [None]:
# Fill the `NaN` values in `bmi` with `avg`, which is the column median
# (computed with `.median()`, not the arithmetic mean).

df["bmi"]= df["bmi"].fillna(avg)

### EDA

In [None]:
# Preview the first rows after the `bmi` imputation.
df.head()

In [None]:
# Bar chart of how many records fall into each gender category.
df["gender"].value_counts().plot.bar(
    color=["red", "blue"],
    xlabel="Gender",
    ylabel="Value_Counts",
)

In [None]:
# Bar chart of the target distribution (count of each `stroke` value).
df["stroke"].value_counts().plot.bar(color=["blue", "red"])

In [None]:
# Stroke outcome counts per gender: one group of bars per gender value.
pd.crosstab(df["gender"], df["stroke"]).plot.bar(color=["orange", "blue"])

plt.legend(["not stroke", "stroke"])

In [None]:
# Histogram of the `age` column.
df["age"].hist()

In [None]:
# Cross-tabulate age against stroke, then draw one histogram per column of
# the resulting count table.
# NOTE(review): these histograms show the distribution of per-age *counts*,
# not of ages themselves -- confirm this is the intended view.
pd.crosstab(df["age"], df["stroke"]).hist()

In [None]:
# Stroke outcome counts split by the hypertension flag.
pd.crosstab(df["hypertension"], df["stroke"]).plot.bar()

plt.legend(["Not stroke", "Stroke"])
plt.xlabel("Hypertension 0-> Not 1-> Yes")



In [None]:
# Stroke outcome counts split by the heart-disease flag.
pd.crosstab(df["heart_disease"], df["stroke"]).plot.bar()

plt.legend(["Not stroke", "stroke"])
plt.xlabel("Heart Disease (0->No, 1-> Yes)")

In [None]:
# Pie chart of the share of each work type, labelled with percentages.
df["work_type"].value_counts().plot.pie(
    autopct="%1.1f%%",
    figsize=(10, 10),
)

In [None]:
# Pie chart of the share of each `ever_married` value, labelled with percentages.
df["ever_married"].value_counts().plot.pie(
    autopct="%1.1f%%",
    figsize=(10, 6),
)

In [None]:
# Stroke outcome counts per work type.
pd.crosstab(df["work_type"], df["stroke"]).plot.bar(figsize=(10, 6))

plt.legend(["Not stroke", "stroke"])

In [None]:
# Work-type counts within each residence type.
pd.crosstab(df["Residence_type"], df["work_type"]).plot.bar(figsize=(10, 6))

In [None]:
# Stroke outcome counts per smoking status.
pd.crosstab(df["smoking_status"], df["stroke"]).plot.bar(figsize=(10, 6))

In [None]:
# Histogram of the average glucose level.
df["avg_glucose_level"].plot.hist(figsize=(10, 6))

### Transform the data types

#### Categorical Features Transformation

In [None]:
# Re-check the column dtypes before encoding the categorical features.
df.dtypes

In [None]:
# List the distinct labels of every categorical column so that the mapping
# dictionaries can cover each of them.
print(f"gender values: {df['gender'].unique()}")
print(f"ever_married unique values: {df['ever_married'].unique()}")
print(f"work_type types: {df['work_type'].unique()}")
print(f"Residence_type values: {df['Residence_type'].unique()}")
print(f"smoking_status values: {df['smoking_status'].unique()}")

In [None]:
# Integer codes for each categorical column.  Every label receives the index
# of its position in the list, so codes start at 0 and follow list order.
genderMap = {
    label: code
    for code, label in enumerate(["Male", "Female", "Other"])
}

ever_marriedMap = {
    label: code
    for code, label in enumerate(["Yes", "No"])
}

work_typeMap = {
    label: code
    for code, label in enumerate(
        ["Private", "Self-employed", "Govt_job", "children", "Never_worked"]
    )
}

Residence_typeMap = {
    label: code
    for code, label in enumerate(["Urban", "Rural"])
}

smoking_statusMap = {
    label: code
    for code, label in enumerate(
        ["formerly smoked", "never smoked", "smokes", "Unknown"]
    )
}

In [None]:
def changeValues(col, colMap):
    """Replace the labels of column `col` in the global `df` with integer codes.

    Args:
        col: Name of the column of the module-level DataFrame `df` to encode.
        colMap: Dict mapping every label found in the column to its int code.

    Returns:
        The string "Done with <col>".

    Raises:
        ValueError: If the column holds a value that is not a key of `colMap`
            (this also happens when the cell is run a second time, because
            the column then already contains the integer codes).  `df` is
            left untouched in that case.
    """
    mapped = df[col].map(colMap)

    # `map` turns unknown labels into NaN; report them by name instead of
    # letting `astype(int)` fail with an opaque NaN-to-int cast error.
    unmapped = df.loc[mapped.isna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {col!r} contains values missing from colMap: "
            f"{list(unmapped)}"
        )

    df[col] = mapped.astype(int)

    return "Done with " + str(col)

In [None]:
# Encode each categorical column in place with its mapping dictionary.
changeValues("gender", genderMap)
changeValues("ever_married", ever_marriedMap)
changeValues("work_type", work_typeMap)
changeValues("Residence_type", Residence_typeMap)
changeValues("smoking_status", smoking_statusMap)

In [None]:
# Preview the first rows to check the encoded categorical columns.
df.head()

#### Numerical Features Transformation

In [None]:
# Convert the numeric features to integers:
#   age               -> cast straight to int
#   avg_glucose_level -> rounded, then cast to int
#   bmi               -> rounded, then cast to int
df["age"] = df["age"].astype(int)
df["avg_glucose_level"] = df["avg_glucose_level"].round().astype(int)
df["bmi"] = df["bmi"].round().astype(int)

### Model Creation

In [None]:
# Check the column dtypes after the categorical and numeric conversions.
df.dtypes

In [None]:
# Split the dataframe positionally: the first `70%` of the rows become
# `df_train`, the remaining `30%` become `df_test`.  The row count is read
# from the frame rather than hard-coded.
# NOTE(review): this split is positional, not shuffled; if the CSV is ordered
# by the target, the class balance of the two partitions will differ --
# confirm.

splitNumber= int(len(df) * (0.7))

df_train = df.iloc[:splitNumber, :]
# Start exactly at `splitNumber`; starting at `splitNumber + 1` dropped the
# row at that position from both partitions.
df_test= df.iloc[splitNumber:, :]

X = df_train.drop(["stroke"], axis=1)
y= df_train["stroke"]

# Split `df_train` again into a fitting part and a validation part
# (`train_test_split` uses its default test size here).
X_train, X_test, y_train, y_test= train_test_split(X, 
                                                   y,
                                                   random_state=42)

# See the shapes
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Candidate classifiers, all with default hyper-parameters, keyed by the
# name under which their score is reported.
models= {"RandomForestClassifier": RandomForestClassifier(),
         "DecisionTreeClassifier": DecisionTreeClassifier(),
         "KNeighborsClassifier" : KNeighborsClassifier()}

def fitNScore(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training data and score it on the test data.

    The NumPy global random seed is set to 42 before any model is fitted.

    Args:
        models: Dict mapping a display name to an object exposing
            `fit(X, y)` and `score(X, y)`.
        X_train: Features used for fitting.
        X_test: Features used for scoring.
        y_train: Targets used for fitting.
        y_test: Targets used for scoring.

    Returns:
        Dict mapping each model name to the value returned by its `score`.
    """
    np.random.seed(42)

    def _fit_then_score(estimator):
        # Fit first, then evaluate the same estimator on the held-out data.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _fit_then_score(estimator)
            for label, estimator in models.items()}

In [None]:
# Fit every candidate model and show its score on the validation split.
fitNScore(models=models,
          X_train=X_train, X_test=X_test,
          y_train=y_train, y_test=y_test)

In [None]:
# Random forest using the entropy split criterion, with a fixed seed and
# three parallel jobs.
mod = RandomForestClassifier(criterion="entropy", random_state=42, n_jobs=3)
mod.fit(X_train, y_train)

# Per-class precision / recall / F1 on the validation split.
preds = mod.predict(X_test)
print(classification_report(y_test, preds))

In [None]:
# Search space for the random-forest hyper-parameters.
#  * n_estimators: 10, 110, ..., 910.  The previous
#    `np.arange(10, 100, 1000)` had stop and step swapped and produced the
#    single value 10.
#  * max_features: "auto" is no longer accepted by current scikit-learn
#    releases and was an alias of "sqrt" for classifiers, so it is dropped.
rfGrid= {"n_estimators": np.arange(10, 1000, 100),
         "max_depth": [6, 8, 10],
         "min_samples_leaf": [4, 6, 8],
         "max_features": ["sqrt", "log2"]}

# Randomized search with 5-fold cross-validation; `random_state` makes the
# sampled candidates reproducible between runs.
rsModel= RandomizedSearchCV(mod, 
                            param_distributions= rfGrid,
                            cv=5,
                            random_state=42
                            )

rsModel.fit(X_train, y_train)

In [None]:
# Predict with the fitted randomized search and print the per-class
# precision / recall / F1 report on the validation split.
preds= rsModel.predict(X_test)

print(classification_report(y_test, preds))