In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### First, let us import all the required packages for our analysis and set up a logger.

In [None]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from matplotlib import pyplot as plt
from datetime import datetime
from glob import glob
    
import logging

# Notebook-wide logger. Both the logger and its handler must allow INFO,
# otherwise the INFO message below is filtered out before it is emitted
# (a logger with no explicit level inherits WARNING from the root logger).
logger = logging.getLogger('simple_example')
logger.setLevel(logging.INFO)

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

# getLogger returns the same object on every call, so re-running this cell
# would otherwise stack handlers and print each message several times.
for existing_handler in list(logger.handlers):
    logger.removeHandler(existing_handler)
logger.addHandler(ch)

logger.info("Completed importing all the required packages")

In [None]:
# Directory that holds the stroke-prediction CSV.
DATA_DIR='/kaggle/input/stroke-prediction-dataset'

# NOTE(review): the working directory is still switched to the dataset folder
# to preserve the original behaviour, but the Kaggle header above describes
# ../input/ as read-only, so later relative writes will fail - confirm whether
# this chdir is really needed.
os.chdir(DATA_DIR)

# Sort so the chosen file is deterministic (glob order is arbitrary).
files=sorted(glob(os.path.join(DATA_DIR,"*.csv")))
if not files:
    # Fail here with a clear message instead of a NameError on input_data later.
    raise FileNotFoundError("No CSV files found in "+DATA_DIR)

# The previous loop parsed every CSV and kept only the last one; read just that one.
input_data=pd.read_csv(files[-1])

In [None]:
# Preview the first rows of the loaded dataset.
input_data.head()

### There are 11 independent variables and 1 dependent variable in our analysis.

### Of the independent variables, age, avg_glucose_level and bmi are continuous numerical variables, while the remaining variables, i.e. gender, hypertension (1 : Yes, 0 : No), heart_disease (1 : Yes, 0 : No), ever_married, work_type, Residence_type and smoking_status, are discrete categorical variables.

### NOTE : id is only a row identifier and adds no information to our study, so we will remove it from our analysis

### The dependent variable stroke is a discrete categorical variable (1 : Yes, 0 : No)


In [None]:
def drop_variable(df,columns):
    """Drop every column listed in ``columns`` from ``df``.

    The frame is modified in place and the same object is returned, so both
    ``drop_variable(df, cols)`` and ``df = drop_variable(df, cols)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove the columns from (mutated in place).
    columns : iterable of str
        Names of the columns to drop. An empty iterable leaves ``df`` untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without the requested columns.

    Raises
    ------
    KeyError
        If any requested column is missing; in that case nothing is dropped.
    """
    # Drop all columns in a single call. The previous version returned from
    # inside the loop, so only the first column was ever removed (and None was
    # returned for an empty list).
    df.drop(columns=list(columns),inplace=True)
    return df

In [None]:
# Remove the identifier column; it is not used as a feature.
input_data=drop_variable(input_data,["id"])

In [None]:
def get_na_perc(df):
    """Print the percentage of missing values per column of ``df``.

    The percentages are rounded to two decimals. Nothing is returned; the
    result is only written to standard output.
    """
    missing_counts=df.isna().sum()
    row_count=len(df)
    missing_percent=missing_counts / row_count * 100.0
    print(np.round(missing_percent,2))

In [None]:
# Show the percentage of missing values in each column.
get_na_perc(input_data)

### Only 3.93% of the bmi values are missing. There are three possible approaches to handle this:

#### 1) Impute bmi with its mean value
#### 2) Remove the rows having missing bmi from our analysis
#### 3) Remove the bmi column altogether

### Imputing bmi with mean value

In [None]:
def impute_with_mean(df,columns):
    """Replace missing values in each listed column with that column's mean.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are imputed (mutated in place).
    columns : iterable of str
        Names of the numeric columns to impute. An empty iterable leaves
        ``df`` untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the missing values filled.
    """
    for col in columns:
        # Series.mean() skips NaN by default, so this is the mean of the observed values.
        df[col]=df[col].fillna(df[col].mean())
    # Return only after every column is processed. The previous version
    # returned inside the loop, so only the first column was ever imputed.
    return df

In [None]:
# Fill the missing bmi values with the column mean.
input_data=impute_with_mean(input_data,["bmi"])

### Now let us check whether the missing values of bmi have been imputed properly

In [None]:
# Re-check the missing-value percentages; bmi is expected to report 0.0 now.
get_na_perc(input_data)

### Let us perform one-hot encoding for the categorical variables

In [None]:
def perform_one_hot_encode(df,cols):
    """Return a copy of ``df`` with the columns in ``cols`` one-hot encoded.

    Each listed column is replaced by indicator columns named
    ``<column>_<value>``; all other columns are kept. The input frame is not
    modified.
    """
    return pd.get_dummies(df,columns=cols,prefix_sep="_")

In [None]:
# Categorical columns that are expanded into indicator (dummy) columns.
categorical_columns=[
    "gender",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
]
input_data=perform_one_hot_encode(input_data,categorical_columns)

In [None]:
# Preview the frame again to inspect the newly created dummy columns.
input_data.head()

### Now let us standardise the numerical variables (zero mean, unit variance) so that they are all on the same scale

In [None]:
from sklearn.preprocessing import StandardScaler
def standardise_numerical_variables(df,columns):
    """Standardise each listed column of ``df`` with its own ``StandardScaler``.

    A fresh scaler is fitted per column and the scaled values overwrite the
    original column, so ``df`` is modified in place. The same frame is returned.
    """
    for name in columns:
        # StandardScaler expects a 2-D input, hence the single-column reshape.
        column_values=df[name].to_numpy().reshape(-1,1)
        df[name]=StandardScaler().fit_transform(column_values)
    return df

In [None]:
# Standardise the three continuous features.
input_data=standardise_numerical_variables(input_data,["age","avg_glucose_level","bmi"])

### Let us check for the distribution of stroke patients in our analysis

In [None]:
# Count the rows per stroke class to gauge the class imbalance.
input_data["stroke"].value_counts()

### Most of the patients in our analysis have stroke : 0, which signifies that they have not suffered a stroke. So we use SMOTETomek to account for this imbalance in the distribution of our dependent variable.

In [None]:
# Separate the feature matrix from the target column.
target_column="stroke"
X=input_data.drop(columns=[target_column])
y=input_data[target_column]

In [None]:
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split

# Split BEFORE resampling. SMOTE creates synthetic minority samples by
# interpolating between existing rows; resampling the full dataset first would
# put points derived from test rows into the training set and inflate the
# reported metrics. stratify=y keeps the original class ratio in both parts so
# the held-out test set reflects the real, imbalanced distribution.
X_train_raw,X_test,y_train_raw,y_test=train_test_split(X,y,test_size=0.25,random_state=0,stratify=y)

# Seeded so the resampled training set is reproducible across runs.
tomek=SMOTETomek(random_state=0)

# Balance only the training portion.
X_res,y_res=tomek.fit_resample(X_train_raw,y_train_raw)

# The balanced data IS the training set; both names are kept so later cells
# that use either X_res/y_res or X_train/y_train keep working.
X_train,y_train=X_res,y_res

### Hyperparameter tuning using the RandomizedSearchCV technique

#### Declaring the hyperparameters

In [None]:
# Search space for the random forest hyperparameter search.

# Number of trees in Random Forest
rf_n_estimators = [int(x) for x in np.linspace(100, 1000, 5)]

# Maximum number of levels in tree
rf_max_depth = [int(x) for x in np.linspace(5, 55, 11)]
# Add the default (no depth limit) as a possible value
rf_max_depth.append(None)

# Number of features to consider at every split.
# 'auto' is intentionally not listed: for classifiers it was only an alias of
# 'sqrt' (a duplicate candidate) and it has been removed from recent
# scikit-learn releases, where passing it raises an error.
rf_max_features = ['sqrt', 'log2']

# Criterion to split on
rf_criterion = ['gini','entropy']

# Minimum number of samples required to split a node
rf_min_samples_split = [int(x) for x in np.linspace(2, 10, 9)]

# Minimum decrease in impurity required for split to happen
rf_min_impurity_decrease = [0.0, 0.05, 0.1]

# Method of selecting samples for training each tree
rf_bootstrap = [True, False]

# Create the grid
rf_grid = {'n_estimators': rf_n_estimators,
               'max_depth': rf_max_depth,
               'max_features': rf_max_features,
               'criterion': rf_criterion,
               'min_samples_split': rf_min_samples_split,
               'min_impurity_decrease': rf_min_impurity_decrease,
               'bootstrap': rf_bootstrap}

#### Getting the value of the best hyperparameters

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Seed the base estimator so every candidate is evaluated reproducibly.
rf_base = RandomForestClassifier(random_state = 42)

# Create the random search Random Forest: 10 sampled candidates, 10-fold CV each
rf_random = RandomizedSearchCV(estimator = rf_base, param_distributions = rf_grid, 
                               n_iter = 10, cv = 10, verbose = 2, random_state = 42, 
                               n_jobs = -1)

# Fit the random search model on the training split only. Tuning on
# X_res/y_res would let the held-out test rows influence the chosen
# hyperparameters and bias the final evaluation.
rf_random.fit(X_train, y_train)

# View the best parameters from the random search
rf_random.best_params_

### Performance Metrics after Hyperparameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Final model with fixed hyperparameters.
# NOTE(review): these values look like they were copied from an earlier
# RandomizedSearchCV run - confirm they still match rf_random.best_params_.
# max_features uses 'sqrt' instead of 'auto': for classifiers 'auto' was an
# alias of 'sqrt' and it is rejected by recent scikit-learn releases.
# random_state makes the reported metrics reproducible.
forest = RandomForestClassifier(n_estimators= 775,
 min_samples_split= 6,
 min_impurity_decrease= 0.0,
 max_features= 'sqrt',
 max_depth= 25,
 criterion= 'gini',
 bootstrap= False,
 random_state= 0)

forest.fit(X_train,y_train)

# Evaluate on the held-out split.
y_pred=forest.predict(X_test)

print(classification_report(y_test,y_pred))