In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw stroke-prediction CSV into a DataFrame; the path is named so it
# is easy to spot and change.
STROKE_CSV_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
dataset = pd.read_csv(STROKE_CSV_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
dataset.head()

In [None]:
# (rows, columns) of the raw data.
dataset.shape

Initial observations from the data and ToDo list:

* 5110 rows and 12 columns (`stroke` is the dependent variable)
* We need to handle categorical variables
* Check for class imbalance in the target
* Handle Null values
* Scaling (Maybe-Maybe not)
* Feature selection



# Exploratory Data Analysis

In [None]:
# `id` is a row identifier with no predictive meaning, so remove it before
# summarising. `errors='ignore'` keeps the cell safe to re-run after the column
# has already been dropped.
dataset = dataset.drop(columns=['id'], errors='ignore')

# Summary statistics of the numeric columns. Kept as the last expression so the
# notebook actually renders the table.
dataset.describe()

### Missing values

In [None]:
# Columns that contain at least one missing value. The threshold is "> 0":
# a column with a single NaN still needs handling.
features_with_na = [features for features in dataset.columns if dataset[features].isnull().sum() > 0]

for feature in features_with_na:
  # Convert to percent *before* rounding, otherwise the multiplication by 100
  # re-introduces floating-point noise (e.g. 3.9300000000000006).
  missing_pct = np.round(100 * dataset[feature].isnull().mean(), 2)
  print(f"{feature} has {missing_pct} % missing values")

In [None]:
# Pairwise Pearson correlation of the numeric columns only. Calling `.corr()`
# on a frame that still holds string columns raises in pandas >= 2.0, so the
# numeric subset is selected explicitly (this works on every pandas version).
correlation_with_features_with_na = dataset.select_dtypes(include='number').corr()
correlation_with_features_with_na

As we can see, the correlation between `stroke` and `bmi` is very low. We may end up not using `bmi`, but we will revisit this later.

### Numerical Values

In [None]:
# Every column whose dtype is not the generic object dtype is treated as numeric.
numerical_features = [
    column_name
    for column_name, column_dtype in dataset.dtypes.items()
    if column_dtype != 'O'
]

numerical_feature_count = len(numerical_features)
print(f"Numerical features {numerical_feature_count}")
dataset[numerical_features].head()

In [None]:
# Integer-typed numeric columns are treated as discrete. `is_integer_dtype`
# matches every integer width, whereas comparing against the string 'int' only
# matches the platform default (int32 on some platforms, int64 on others).
discrete_features = [
    features for features in numerical_features
    if pd.api.types.is_integer_dtype(dataset[features])
]

print(f'There are {len(discrete_features)} discrete variables present ')

### Continuous variables

In [None]:
# Numeric columns stored as floats are treated as continuous.
continous_features = []
for candidate in numerical_features:
  if dataset[candidate].dtype == 'float':
    continous_features.append(candidate)

print(continous_features)
continuous_count = len(continous_features)
print(f'There are {continuous_count} features present')

In [None]:
# Distribution of each continuous feature.
for features in continous_features:
  # Plotting does not modify the frame, so no defensive copy is needed.
  dataset[features].hist(bins=25)
  plt.xlabel(features)
  # The y axis of a histogram is the number of rows per bin, not the target.
  plt.ylabel('Count')
  plt.title(f'Distribution of {features}')
  plt.show()

Performing a log transformation on `bmi` and `avg_glucose_level`

In [None]:
# Distribution of the skewed features after a natural-log transform.
for features in ['bmi','avg_glucose_level']:
  # Transform a single Series instead of copying the whole frame each iteration;
  # `dataset` itself is left untouched. NaNs stay NaN and are ignored by hist().
  log_values = np.log(dataset[features])
  log_values.hist(bins=25)
  plt.xlabel(f'log({features})')
  # The y axis of a histogram is the number of rows per bin, not the target.
  plt.ylabel('Count')
  plt.title(f'Distribution of log({features})')
  plt.show()

Finding outliers in the data

In [None]:
# Box plot of each continuous feature on a log scale to expose outliers.
for feature in continous_features:
  data = dataset.copy()
  # log(0) is -inf, so features containing zeros are skipped. The check must use
  # the current loop variable `feature`; the original read the stale `features`
  # left over from an earlier cell and therefore always tested the same column.
  if (data[feature] == 0).any():
    continue
  data[feature] = np.log(data[feature])
  data.boxplot(column=feature)
  plt.ylabel(f'log({feature})')
  plt.title(feature)
  plt.show()


### Categorical features

In [None]:
# Object-dtype columns are the categorical features. The dtypes are read from
# `dataset` itself rather than from `data`, a scratch copy that only exists if
# an earlier plotting cell happened to run first.
categorical_features = [features for features in dataset.columns if dataset[features].dtype == 'O']
print(categorical_features)

Cardinality of categorical variables

In [None]:
# Report how many distinct values each categorical column holds.
for column_name in categorical_features:
  distinct_values = dataset[column_name].unique()
  category_count = len(distinct_values)
  print(f"Feature {column_name} number of categories {category_count}")

We can one-hot encode them all, but before that we should look at the relation between each of them and `stroke`.

In [None]:
# Relation between each categorical feature and the target.
for features in categorical_features:
  # `stroke` is 0/1, so the group mean is the share of stroke cases in each
  # category. (`count()` would only show how many rows each category has, which
  # says nothing about stroke.)
  stroke_rate = dataset.groupby(features)['stroke'].mean()
  stroke_rate.plot.bar()
  plt.xlabel(features)
  plt.ylabel('Stroke rate')
  plt.title(features)
  plt.show()



# Feature Engineering

### Creating a train test split before any preprocessing to prevent any data leakage

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. The target is heavily imbalanced, so the
# split is stratified on `stroke` to keep the positive rate the same in both
# parts; otherwise a small test set can end up with very few stroke cases.
# Note: X_train/X_test still contain the `stroke` column at this point; it is
# dropped in a later cell.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    dataset['stroke'],
    test_size=0.1,
    random_state=0,
    stratify=dataset['stroke'],
)

### Handling missing values

In [None]:
# Columns with at least one missing value ("> 0": a single NaN also counts).
features_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0]

print(features_nan)

Replacing missing values with median

In [None]:
# Fill missing values in the full (exploration) frame with the column median.
for features in ['bmi']:
  median_value = dataset[features].median()

  # Assign the result back instead of `dataset[col].fillna(..., inplace=True)`:
  # the chained in-place form is deprecated and silently does nothing once
  # pandas copy-on-write is active.
  dataset[features] = dataset[features].fillna(median_value)

# Sanity check: number of NaNs left in bmi (expected 0).
dataset['bmi'].isnull().sum()

In [None]:
# Impute bmi in both splits with the median learned from the TRAINING split only.
# Using a statistic computed on the test rows would leak test information into
# preprocessing and make train and test inconsistently transformed.
median_value_train = X_train['bmi'].median()

# Assign back rather than using chained `fillna(..., inplace=True)`, which is
# deprecated and a no-op under pandas copy-on-write.
X_train['bmi'] = X_train['bmi'].fillna(median_value_train)
X_test['bmi'] = X_test['bmi'].fillna(median_value_train)


In [None]:
# Inspect the training split after imputation.
X_train.head()

In [None]:
# Remove the target column from both feature frames; the labels are already
# held separately in y_train / y_test.
X_train, X_test = (
    split_frame.drop(columns=['stroke'])
    for split_frame in (X_train, X_test)
)

### Handling skewed data

In [None]:
# Apply a natural-log transform to the right-skewed columns in both splits.
skewed_columns = ['bmi', 'avg_glucose_level']
for skewed_column in skewed_columns:
  train_logged = np.log(X_train[skewed_column])
  test_logged = np.log(X_test[skewed_column])
  X_train[skewed_column] = train_logged
  X_test[skewed_column] = test_logged

In [None]:
# (rows, columns) of the training features after preprocessing so far.
X_train.shape

## Label encoder

In [None]:
# Names of the object-dtype (categorical) columns that need to be encoded.
categorical_variables_to = [
    column_name
    for column_name, column_dtype in dataset.dtypes.items()
    if column_dtype == 'O'
]

In [None]:
# Encode into separate copies so the un-encoded splits stay available for the
# other encoding approach.
label_X_train, label_X_test = X_train.copy(), X_test.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance that is re-fitted for each categorical column.
label_encoder = LabelEncoder()

In [None]:
# Integer-encode the categorical feature columns.
# LabelEncoder is intended for target labels and its transform() raises on any
# category that was not seen during fit. This dataset has categories with a
# single row, which can land in the test split and crash the cell.
# OrdinalEncoder is the feature-side equivalent: it sorts categories the same
# way, so the train codes are unchanged, and it can map unseen categories to a
# sentinel (-1) instead of failing.
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
label_X_train[categorical_variables_to] = (
    ordinal_encoder.fit_transform(X_train[categorical_variables_to]).astype(int)
)
label_X_test[categorical_variables_to] = (
    ordinal_encoder.transform(X_test[categorical_variables_to]).astype(int)
)

## One-hot encoder (second approach)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the categorical columns and pass the numeric ones through.
# Columns are selected by name (object dtype) instead of hard-coded positions,
# so the encoding does not silently hit the wrong columns if the column order
# changes or an earlier drop is skipped.
one_hot_columns = [column for column in X_train.columns if X_train[column].dtype == 'O']

ct = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category present only in the test split is
        # encoded as all zeros instead of raising during transform().
        ('encoder', OneHotEncoder(handle_unknown='ignore'), one_hot_columns),
    ],
    remainder='passthrough',
    # Always return a dense array; a sparse result cannot be wrapped directly
    # by pd.DataFrame(...).
    sparse_threshold=0,
)
# Fit on the training split only, then apply the same mapping to the test split.
oh2_x_train = pd.DataFrame(ct.fit_transform(X_train))
oh2_x_test = pd.DataFrame(ct.transform(X_test))

In [None]:
# Preview the one-hot encoded training matrix.
oh2_x_train.head()

In [None]:
# The encoded frames were built from plain arrays and so carry a fresh 0..n-1
# index; give them back the row labels of the splits they came from.
for encoded_frame, source_frame in ((oh2_x_train, X_train), (oh2_x_test, X_test)):
    encoded_frame.index = source_frame.index

# Modeling: Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Sweep the tree depth and record, for each depth, the test accuracy and the
# number of true positives (stroke cases predicted as stroke).
# NOTE(review): the depth is being judged on the test split; a validation split
# or cross-validation would give an unbiased choice.
accur = []
pred_as_1 = []
for i in range(2, 51):
    # Fix random_state so the sweep is reproducible: the tree breaks ties
    # between equally good splits at random.
    model = DecisionTreeClassifier(max_depth=i, random_state=0).fit(oh2_x_train, y_train)
    pred = model.predict(oh2_x_test)
    accur.append(accuracy_score(y_test, pred))
    # labels=[0, 1] guarantees a 2x2 matrix, so [1][1] is always the
    # true-positive cell even if one class is missing from y_test and pred.
    pred_as_1.append(confusion_matrix(y_test, pred, labels=[0, 1])[1][1])

In [None]:
# Plot both sweep metrics against tree depth, side by side.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,4))
x = list(range(2,51))

axes[0].plot(x, accur, 'r')
axes[0].set_title('Test accuracy vs. tree depth')
axes[0].set_xlabel('max_depth')
axes[0].set_ylabel('Accuracy')

axes[1].plot(x, pred_as_1, 'b')
axes[1].set_title('True positives vs. tree depth')
axes[1].set_xlabel('max_depth')
axes[1].set_ylabel('Correctly predicted stroke cases')

plt.show()


In [None]:
# Fit the final tree at the chosen depth and evaluate it on the test split.
# random_state is fixed so the reported numbers are reproducible across runs.
model = DecisionTreeClassifier(max_depth=30, random_state=0).fit(oh2_x_train, y_train)
pred = model.predict(oh2_x_test)

# Rows are the true class, columns the predicted class; labels pinned to [0, 1]
# so the layout is always 2x2.
confusion_matrix(y_test, pred, labels=[0, 1])

In [None]:
# Per-class precision, recall and F1 for the final model's test predictions.
final_report = classification_report(y_test, pred)
print(final_report)