In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Setting visualisation parameters
# Use seaborn's dark grid background for every figure drawn below
sns.set_style('darkgrid')
# Reversed "mako" colour map; not referenced again in the cells visible in this notebook
cmap = sns.cm.mako_r

# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

# Preventing warnings from libraries especially scikit learn
# NOTE(review): this silences ALL warnings for the rest of the session, including
# deprecation and convergence warnings that may point at real problems.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the stroke dataset from the Kaggle input directory into a DataFrame
stroke = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
# Preview the first rows to get a feel for the columns and their values
stroke.head()

In [None]:
# Shape of the data as a (rows, columns) tuple
stroke.shape

In [None]:
# Column dtypes and non-null counts of the raw data (shows which columns have missing values)
stroke.info()

Preprocessing Data before exploration

In [None]:
# Summary statistics without the `id` column
stroke.drop('id', axis=1).describe()

Using round() to round off age.
Setting values to NaN where BMI is less than 12 and greater than 60. We were told in the dataset that these values should be considered outliers and therefore should not be considered when building a model.
We will sort the dataframe based on gender and then on age and use forward filling to fill out those missing BMI values.

In [None]:
# Round off age values and store them as whole numbers
stroke['age'] = stroke['age'].round().astype('int64')

# BMI to NaN: keep only values strictly between 12 and 60; everything else
# (including values that were already missing) becomes NaN
stroke['bmi'] = stroke['bmi'].where((stroke['bmi'] > 12) & (stroke['bmi'] < 60))

# Sorting dataframe based on gender then on age so that each missing BMI is
# filled from the preceding row in that order
stroke = stroke.sort_values(['gender', 'age']).reset_index(drop=True)

# Forward fill the NaN BMI values and assign the result back to the frame.
# Calling `stroke['bmi'].ffill(inplace=True)` operates on a temporary Series and
# is not written back under pandas Copy-on-Write, so the assignment is required.
# The trailing bfill() covers a NaN in the very first row, which a forward fill
# alone cannot reach.
stroke['bmi'] = stroke['bmi'].ffill().bfill()

In [None]:
# Re-check dtypes and non-null counts after the preprocessing step
stroke.info()

We have now converted our age column to int64 and have no missing values in our bmi column

# Exploratory data analysis
* Check if the data is balanced
* Plotting various graphs to check for any relation between each column

* Age vs BMI
* BMI vs AVG glucose level
* Percentage of people who had a stroke in each category

In [None]:
# Pairwise correlations between the numeric columns only.
# The categorical columns still hold strings at this point, and DataFrame.corr()
# raises on non-numeric data in recent pandas releases; `id` is a row identifier
# and carries no meaning in a correlation matrix.
stroke.select_dtypes(include='number').drop(columns=['id'], errors='ignore').corr()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns are excluded explicitly (DataFrame.corr() raises on string
# columns in recent pandas releases) and the `id` identifier column is dropped.
plt.figure(figsize=(12,10))
cor = stroke.select_dtypes(include='number').drop(columns=['id'], errors='ignore').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.title('Correlation between numeric features', fontsize=25)
plt.show()

In [None]:
# Share of people (in percent) that had a stroke, for every age.
# Only the `stroke` column is aggregated: summing every column of the frame per
# age group would also try to "sum" the string columns, which is wasteful and
# not meaningful.
strokes_by_age = stroke.groupby('age')['stroke']
number_of_strokes = strokes_by_age.sum()
whole_population = strokes_by_age.count()
percentage_had_a_stroke = (number_of_strokes / whole_population) * 100

plt.figure(figsize=(12, 8))
plt.plot(percentage_had_a_stroke.index, percentage_had_a_stroke.values)
plt.title('Distribution of strokes based on age', fontsize=25)
plt.xlabel('Age', fontsize=15)
plt.ylabel('Percentage of people that had a stroke', fontsize=15)

plt.show()

In [None]:
# Checking if the data is balanced
# Count each class once and reuse the result for both axes
stroke_counts = stroke['stroke'].value_counts()
xs = stroke_counts.index
ys = stroke_counts.values

# x / y must be passed by keyword: seaborn >= 0.12 no longer accepts them positionally
ax = sns.barplot(x=xs, y=ys)
ax.set_xlabel("Stroke")
ax.set_ylabel("Count")
plt.show()

As we can see from the plot, the data is not balanced. Left as it is, this imbalance will result in a badly fitted model.

In [None]:
# Age vs BMI
# Draw the people without a stroke faintly first, then overlay those with a stroke
no_stroke = stroke[stroke['stroke'] == 0]
had_stroke = stroke[stroke['stroke'] == 1]

plt.figure(figsize=(12, 8))
ax = sns.scatterplot(data=no_stroke, x="bmi", y="age", alpha=0.4)
sns.scatterplot(data=had_stroke, x="bmi", y="age", alpha=1, ax=ax)
plt.show()

From the above Age vs BMI plot we can clearly see that once people reach an age of 40 or greater, the chance of having a stroke increases. After age 60, it tends to increase even more. Furthermore, people with a BMI above roughly 20-25 show a greatly increased chance of having a stroke.

So, from this plot we can conclude that people who are aged over 40 and have a BMI of over 20-25 have a greater probability of having a stroke.

In [None]:
# AVG Glucose level vs BMI with hue = stroke
# Draw the people without a stroke faintly first, then overlay those with a stroke
no_stroke = stroke[stroke['stroke'] == 0]
had_stroke = stroke[stroke['stroke'] == 1]

plt.figure(figsize=(12, 8))
ax = sns.scatterplot(data=no_stroke, x="bmi", y="avg_glucose_level", alpha=0.4)
sns.scatterplot(data=had_stroke, x="bmi", y="avg_glucose_level", alpha=1, ax=ax)
plt.show()

In [None]:
# One pie chart per categorical column, laid out on a 4x2 grid.
fig, ax = plt.subplots(4,2, figsize = (12,12))

# (column, chart title, {raw category value: wedge label}, explode offsets)
# Each label is tied to the category value it describes, so a wedge can never be
# mislabelled if the category frequencies (and therefore the value_counts()
# order) change. Wedges are drawn in the order the categories are listed here.
# Categories that are not listed (e.g. the 'Other' gender) are left out of the
# chart, as before.
# NOTE(review): hypertension / heart_disease / stroke are assumed to be coded as
# 0 / 1 -- confirm against the data.
pie_specs = [
    ('gender', "Gender Distribution Pie Chart",
     {'Female': "Female", 'Male': "Male"},
     [0, 0.05]),
    ('hypertension', "Hypertension Distribution Pie Chart",
     {0: "Not hypertension", 1: "hypertension"},
     [0, 0.2]),
    ('heart_disease', "Heart disease Distribution Pie Chart",
     {0: "There is not heart disease", 1: "There is heart disease"},
     [0, 0.2]),
    ('ever_married', "Marriage Distribution Pie Chart",
     {'Yes': "married", 'No': "never married"},
     [0, 0.05]),
    ('work_type', "Work Type Pie Chart",
     {'Private': "Private Job", 'Self-employed': "Self-employed", 'children': "Children",
      'Govt_job': "Government Job", 'Never_worked': "Never Worked Before"},
     [0.1, 0.1, 0.1, 0.1, 0.2]),
    ('Residence_type', "Residence Type Pie Chart",
     {'Urban': "Urban Residence", 'Rural': "Rural Residence"},
     [0, 0.05]),
    ('smoking_status', "Smoking Status Pie Chart",
     {'never smoked': "Never Smoked Before", 'Unknown': "Unknown",
      'formerly smoked': "Smoked in the past", 'smokes': "Currently Smokes"},
     [0.03, 0.03, 0.03, 0.03]),
    ('stroke', "Stroke Pie Chart",
     {0: "Didn't have Stroke", 1: "Had Stroke"},
     [0, 0.2]),
]

for axis, (column, title, label_map, explode) in zip(ax.flat, pie_specs):
    # Look the counts up by category value; a listed category that does not
    # occur in the data gets a count of 0 instead of shifting the labels.
    counts = stroke[column].value_counts().reindex(list(label_map), fill_value=0)
    axis.pie(x=counts.tolist(), labels=list(label_map.values()),
             autopct="%1.2f%%", shadow=True, explode=explode)
    axis.set_title(title, fontdict={'fontsize': 14})

plt.tight_layout()
plt.show()

In [None]:
# Percentage of people
def plot_percent_of_stroke_in_each_category(df, column, axis):
    """Draw a bar chart of the stroke rate (in percent) per category of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding ``column`` and a ``stroke`` column in which 1 marks a stroke.
    column : str
        Name of the column whose categories are compared.
    axis : matplotlib.axes.Axes
        Axes the bars are drawn on.

    Notes
    -----
    Categories appear in the order in which they first occur in ``df``. Rows with
    a missing value in ``column`` are excluded; previously such a value caused a
    division by zero.
    """
    had_stroke = df['stroke'].eq(1)
    # Mean of a boolean flag per group == fraction of the group that had a stroke
    percentages = had_stroke.groupby(df[column], sort=False).mean() * 100

    # x / y must be passed by keyword: seaborn >= 0.12 no longer accepts them positionally
    sns.barplot(x=percentages.index.tolist(), y=percentages.tolist(), ax=axis)
    
columns = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
# Axis captions, in the same order as `columns`
x_labels = ["Gender", "Hypertension", "Heart Disease", "Ever Married", "Work Type", "Residence Type", "Smoking Status"]

# 4x2 grid; only seven panels are needed, so the last one is removed
fig, axes = plt.subplots(4, 2, figsize=(16,18))
axes[3, 1].remove()

for position, (column, x_label) in enumerate(zip(columns, x_labels)):
    grid_row, grid_col = divmod(position, 2)
    panel = axes[grid_row, grid_col]

    plot_percent_of_stroke_in_each_category(stroke, column, panel)
    panel.set_xlabel(x_label)
    # Only the left-hand panels carry the shared y-axis caption
    if grid_col == 0:
        panel.set_ylabel("Percentage")

plt.show()

Insights drawn from above plots

* Both genders have around a 5% chance of having a stroke.
* People with a history of hypertension and heart disease have shown an increased percentage of encountering a stroke with around a 12.5% chance and 16.5% chance respectively.
* Married/Divorced people have a 6.5% chance of a stroke.
* Self Employed people have a higher chance compared to private and government jobs. Stress induced?
* Rural and urban residency doesn't seem to show much of a difference.
* Former smokers have higher chance compared to people who have never smoked or currently smoke.

# Preparing the data for prediction

Converting the categorical columns into numerical by mapping each category to an integer value using map() on pandas series object
As we saw earlier, the data is imbalanced. To make it balanced we will use a technique called SMOTE (Synthetic Minority Oversampling Technique). There are other techniques available, such as the NearMiss algorithm.
Splitting the data into training and testing samples.

In [None]:
# Converting categorical data to numerical

gender_dict = {'Male': 0, 'Female': 1, 'Other': 2}
ever_married_dict = {'No': 0, 'Yes': 1}
work_type_dict = {'children': 0, 'Never_worked': 1, 'Govt_job': 2, 'Private': 3, 'Self-employed': 4}
residence_type_dict = {'Rural': 0, 'Urban': 1}
smoking_status_dict = {'Unknown': 0, 'never smoked': 1, 'formerly smoked':2, 'smokes': 3}

# Column name -> mapping used to encode that column
category_encodings = {
    'gender': gender_dict,
    'ever_married': ever_married_dict,
    'work_type': work_type_dict,
    'Residence_type': residence_type_dict,
    'smoking_status': smoking_status_dict,
}

for column, mapping in category_encodings.items():
    # A column that is already numeric has been encoded by an earlier run of this
    # cell; mapping it again would replace every value with NaN.
    if pd.api.types.is_numeric_dtype(stroke[column]):
        continue

    # Series.map() turns values without a mapping into NaN silently, so report
    # unexpected categories here instead of failing later with an unrelated error.
    unmapped = set(stroke[column].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(f"Unexpected values in column '{column}': {sorted(map(str, unmapped))}")

    stroke[column] = stroke[column].map(mapping)

In [None]:
# Splitting into features and value to be predicted
# Target first, then every remaining column except the row identifier
y = stroke['stroke']
X = stroke.drop(['id', 'stroke'], axis=1)

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Splitting data into train and test BEFORE oversampling.
# SMOTE creates synthetic rows by interpolating between neighbouring minority
# samples. If it runs on the full data first, synthetic neighbours of test rows
# end up in the training set and the test set itself contains synthetic rows,
# which inflates every score. The split and the oversampling therefore live in
# one cell, and the test set keeps the original class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=40, stratify=y
)

fig, (ax1, ax2) = plt.subplots(1, 2)

sns.barplot(x=['0', '1'], y=[(y_train == 0).sum(), (y_train == 1).sum()], ax=ax1)
ax1.set_title("Before Oversampling")
ax1.set_xlabel('Stroke')

# Using SMOTE to balance the training data only
sm = SMOTE(random_state = 2)
X_train, y_train = sm.fit_resample(X_train, y_train)

sns.barplot(x=['0', '1'], y=[(y_train == 0).sum(), (y_train == 1).sum()], ax=ax2)
ax2.set_title("After Oversampling")
ax2.set_xlabel('Stroke')

plt.tight_layout()
plt.show()

In [None]:
# Importing necessary libraries
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Import it only where it still exists so that this cell (and every model cell
# that depends on it) keeps working on newer releases.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None  # newer releases: use sklearn.metrics.ConfusionMatrixDisplay


In [None]:
# Logistic regression on standardised features.
# The features live on very different scales (e.g. 0/1 flags next to glucose
# levels), and with the default iteration limit the solver can stop before it has
# converged -- a warning that is hidden by the global warning filter. Scaling the
# inputs and raising `max_iter` lets the fit converge.
lg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lg.fit(X_train, y_train)
lg_predictions = lg.predict(X_test)

print(f"Accuracy Score : {round(accuracy_score(y_test, lg_predictions) * 100, 2)}%")

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the logistic-regression predictions
report = classification_report(y_true=y_test, y_pred=lg_predictions)
print(report)

In [None]:
# Decision tree with a fixed seed: fit on the training data, predict the test data
dt = DecisionTreeClassifier(random_state=101)
predictions = dt.fit(X_train, y_train).predict(X_test)

dt_accuracy = accuracy_score(y_test, predictions) * 100
print(f"Accuracy Score : {round(dt_accuracy, 2)}%")

# Per-class precision / recall / F1 for the decision-tree predictions
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

In [None]:
# Random forest behind a scaler. The forest is seeded so that the scores below
# and the predictions written out at the end of the notebook are identical on
# every "Restart & Run All".
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=101))
pipeline.fit(X_train, y_train)
prediction = pipeline.predict(X_test)

print(f"Accuracy Score : {round(accuracy_score(y_test, prediction) * 100, 2)}%")

In [None]:
# Per-class precision / recall / F1 for the random-forest pipeline predictions
report = classification_report(y_true=y_test, y_pred=prediction)
print(report)

In [None]:
# One hand-written sample. The values are positional: they must follow the
# column order of the feature matrix the pipeline was fitted on.
tester_x = [
    [0, 67, 0, 1, 1, 3, 1, 228.69, 36.6, 2],
]

In [None]:
# Predicted class for the hand-written sample (the target uses 1 for "had a stroke")
pipeline.predict(tester_x)

In [None]:
# Load the separate file with the records the trained pipeline will score below
test_data=pd.read_csv("../input/data-test1/test_data1.csv")
# Display the loaded frame as the cell output
test_data

In [None]:
# Encode the categorical columns of the new data with the SAME dictionaries that
# were defined for the training data (gender_dict, ever_married_dict, ...).
# Re-declaring copies of them here would allow the two encodings to drift apart.
test_encodings = {
    'gender': gender_dict,
    'ever_married': ever_married_dict,
    'work_type': work_type_dict,
    'Residence_type': residence_type_dict,
    'smoking_status': smoking_status_dict,
}

for column, mapping in test_encodings.items():
    # A column that is already numeric has been encoded by an earlier run of this
    # cell; mapping it again would replace every value with NaN.
    if pd.api.types.is_numeric_dtype(test_data[column]):
        continue

    # Series.map() turns values without a mapping into NaN silently, so report
    # unexpected categories here instead of failing inside predict().
    unmapped = set(test_data[column].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(f"Unexpected values in column '{column}': {sorted(map(str, unmapped))}")

    test_data[column] = test_data[column].map(mapping)

In [None]:
# Inspect the frame after encoding; the mapped columns should now hold integers
test_data

In [None]:
# Score the encoded records with the fitted random-forest pipeline.
# NOTE(review): assumes test_data holds exactly the training feature columns, in
# the same order (no `id` / `stroke` columns are dropped here, and the age / BMI
# preprocessing applied to the training data is not repeated) -- TODO confirm.
predicted_stroke= pipeline.predict(test_data)
# Display the predicted labels as the cell output
predicted_stroke

In [None]:
# Collect the predictions in a one-column frame and save it as CSV in the
# working directory (the row index is written as the first column)
final_DF = pd.DataFrame(data={'Predicted Stroke': predicted_stroke})
final_DF.to_csv("Predicted_Stroke.csv")