# 1. Setting up the environment.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
print('Priyatama is ready!')

# 2. Reading the dataset.

In [None]:
df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
df.head()

# 3. Exploratory data analysis.

## 3.1 Check for and deal with NA values.

In [None]:
sns.heatmap(df.isnull())

In [None]:
a = (df.isnull().sum())
print(a[a>0])
df.shape

In [None]:
100*201/5110

### Around 4% of data points for BMI are blank, we can drop it.

In [None]:
df.dropna(inplace=True)

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.drop('id', axis=1,inplace=True)

## 3.2 Check and deal with unnecessary datapoints in all columns.

In [None]:
df.dtypes

In [None]:
df.gender.value_counts()

## Only 1 datapoint has'gender' marked as other, we can drop it.

In [None]:
df2 = df[df.gender != 'Other']
df2.gender.value_counts().plot(kind='pie',autopct='%1.1f%%')

In [None]:
fig, ax=plt.subplots(figsize=(20,5))
sns.countplot(x=df2.age, palette='viridis')
plt.xticks(rotation=90)
plt.xlabel('\n Age', fontsize=10, fontweight='bold')
plt.ylabel('Count of Patients', fontsize=10, fontweight='bold')
plt.title('Age of Different Patients', fontweight = 'bold', fontsize='15')
plt.show()

## Datapoints for patients younger than 2 years seem very few, we can drop it.

In [None]:
df3 = df2[~(df2['age'] <= 2)]

In [None]:
fig2, ax=plt.subplots(figsize=(20,5))
sns.countplot(x=df3.age, palette='viridis')
plt.xticks(rotation=90)
plt.xlabel('\n Age', fontsize=10, fontweight='bold')
plt.ylabel('Count of Patients', fontsize=10, fontweight='bold')
plt.title('Age of Different Patients', fontweight = 'bold', fontsize='15')
plt.show()  

### Range of age of data sample is vast.

In [None]:
stroke_0 = df3[~(df3['stroke'] == 1)]
stroke_1 = df3[~(df3['stroke'] == 0)]
sns.set(style="darkgrid")
fig3, (ax1, ax2) = plt.subplots(2,1, figsize=(15, 9))

sns.histplot(x=stroke_0['age'], kde=True, color="skyblue", ax=ax1)
sns.histplot(x=stroke_1['age'], kde=True, color="olive", ax=ax2)

### After ageof 40, the chances of Stroke increases significantly.

In [None]:
fig4, ax=plt.subplots(figsize=(20,5))
legend = ['No Stroke', 'Stroke']
sns.set(style="darkgrid")
sns.histplot(x=df3['age'], hue=df3.stroke, palette='rocket')
plt.xlabel('\n Age', fontsize=10, fontweight='bold')
plt.ylabel('Count of Patients', fontsize=10, fontweight='bold')
plt.title('Age of Different Patients', fontweight = 'bold', fontsize='15')
plt.show()  


In [None]:
df3.hypertension.value_counts().plot(kind='barh')

In [None]:
fig5, ax=plt.subplots(figsize=(5,5))
legend1 = ['No Hypertension', 'Hypertension']
sns.countplot(x=df3.gender,hue=df3.hypertension, palette='rocket')
for p in ax.patches:
    ax.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='black', size=10)
plt.xlabel('\n Gender & Hypertension', fontsize=10, fontweight='bold')
plt.ylabel('Count of Patients', fontsize=10, fontweight='bold')
plt.title('Hypertension across Gender', fontweight = 'bold', fontsize='15')
plt.show()  

In [None]:
fig6, ax=plt.subplots(figsize=(20,5))
ax.set(facecolor='Grey')
sns.set(style="whitegrid")
sns.histplot(x=df3['age'], hue=df3.hypertension, palette='rocket')
plt.xlabel('\n Age', fontsize=10, fontweight='bold')
plt.ylabel('Count of Patients', fontsize=10, fontweight='bold')
plt.title('Age of Different Patients v/s Hypertension', fontweight = 'bold', fontsize='15')
plt.show()  


### After age of 35, chances of Hypertension increases.

In [None]:
sns.set(style="darkgrid")
fig7, (ax1, ax2,ax3) = plt.subplots(3, 1, figsize=(15, 7))

sns.histplot(x=df3['age'], hue=df3.hypertension,kde=True, color="skyblue", ax=ax1)
ax1.set_xticks([])
ax1.set_xlabel('Age')
sns.histplot(x=df3['age'], hue=df3.stroke,kde=True, color="olive", ax=ax2)
ax2.set_xticks([])
ax2.set_xlabel(' ')
sns.histplot(x=df3['age'], hue=df3.heart_disease,kde=True, color="gold", ax=ax3)
ax3.set_xlabel('Age')
plt.title("Hypertension - Stroke - Heart Disease v/s Age", fontsize=15, fontweight='bold')

### All health issues (Heart Disease, Stroke and Hypertension) increases after around age of 40.

In [None]:
df3.ever_married.value_counts().plot(kind='bar')

In [None]:
fig8, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))

sns.set(style="darkgrid")

sns.countplot(x=stroke_0.ever_married, hue=stroke_0.gender, palette='viridis', ax=ax1)
ax1.set_xlabel('Marriage Status-No Stroke')
for p in ax1.patches:
    ax1.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='black', size=10)
ax1.set_ylabel('Count of patient')

sns.countplot(x=stroke_1.ever_married, hue=stroke_1.gender, palette='rocket', ax=ax2)
ax2.set_xlabel('Marriage Status-Stroke')
ax2.set_ylabel('')
for p in ax2.patches:
    ax2.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='black', size=10)
ax2.set_yticks([])
plt.show()  



### Female have a higher chance of having stroke be it married or not.

In [None]:
fig9, ax = plt.subplots(figsize=(5, 3))

sns.set(style="darkgrid")

sns.countplot(x=df3['Residence_type'], hue=df3.stroke, palette='viridis')
plt.title('Residence Type v/s Stroke', fontsize=15, fontweight='bold')
plt.xlabel('Residence Type',fontsize=10, fontweight='bold')
plt.ylabel('Patients',fontsize=10, fontweight='bold')
for p in ax.patches:
    ax.annotate(f'{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='black', size=10)
plt.show()  

### Possibility of stroke is same beetween differentresidence types.

In [None]:
fig10, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))

sns.set(style="darkgrid")

sns.countplot(x=stroke_0.gender, hue=stroke_0.Residence_type, palette='viridis',ax=ax1)
ax1.set_xlabel('Residence Type-No Stroke',fontsize=10, fontweight='bold')
ax2.set_yticks([])
ax1.set_ylabel('Patients',fontsize=10, fontweight='bold')
for p in ax1.patches:
    ax1.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='black', size=10)
ax1.set_ylabel('Count of patient')

sns.countplot(x=stroke_1.gender, hue=stroke_1.Residence_type, palette='rocket', ax=ax2)
ax2.set_xlabel('Residence Type-Stroke',fontsize=10, fontweight='bold')
ax2.set_ylabel('')
for p in ax2.patches:
    ax2.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='black', size=10)
ax2.set_yticks([])
plt.show()  



In [None]:
fig11, ax = plt.subplots(figsize=(5, 5))
sns.set(style="darkgrid")

sns.barplot(x=df3.gender,y=df3.avg_glucose_level,hue=df3.stroke,estimator=np.average ,ci=None,palette='icefire')
plt.xlabel('Gender & Stroke',fontsize=10, fontweight='bold')
plt.ylabel('Average Glusoce Level',fontsize=10, fontweight='bold')
plt.title('Average Glusoce Level v/s Gender',fontsize=15, fontweight='bold')
for p in ax.patches:
    ax.annotate(f'\n{round(p.get_height())}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='black', size=10)

plt.show()  


### Average glucose level around 145 & 125 for Male and Female respectively increase chances of stroke. 

In [None]:
fig12, ax = plt.subplots(figsize=(15, 5))
sns.set(style="darkgrid")

sns.barplot(y=df3.smoking_status,x=df3.bmi,hue=df3.stroke,estimator=np.average ,ci=None,palette='icefire')
plt.ylabel('Smoking Habit & Stroke',fontsize=10, fontweight='bold')
plt.xlabel('Averge BMI',fontsize=10, fontweight='bold')
plt.title('Smoking Habits v/s BMI',fontsize=15, fontweight='bold')

plt.show()  


### Smoking habits seem to not be corelated with BMI.

In [None]:
fig13, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

sns.countplot(x=stroke_0.smoking_status,hue=stroke_0.gender, palette='viridis',ax=ax1)
ax1.set_xlabel('Smoking Habits-No Stroke',fontsize=10, fontweight='bold')
ax2.set_yticks([])
ax1.set_ylabel('Patients',fontsize=10, fontweight='bold')
for p in ax1.patches:
    ax1.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='black', size=10)
ax1.set_ylabel('Count of patient')

sns.countplot(x=stroke_1.smoking_status,hue=stroke_1.gender, palette='rocket',ax=ax2)
ax2.set_xlabel('Smoking Habits-Stroke',fontsize=10, fontweight='bold')
ax2.set_ylabel('')
for p in ax2.patches:
    ax2.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='black', size=10)
ax2.set_yticks([])
plt.show()  



### Women that never smoke are more likely to have stroke than those that do. This could be due to passive smoking or other reasons that are not present in the dataset here.

In [None]:
fig, plt.subplots(figsize=(20, 5))

sns.stripplot(y='smoking_status', x='bmi', data = df3,  hue='stroke')
plt.title('BMI V/s Smoking Status', fontsize=15, fontweight='bold')
plt.xlabel('BMI', fontsize=10, fontweight='bold')
plt.ylabel('Smoking Status', fontsize=10, fontweight='bold')
plt.show()  

### Smoking habits & Stroke seem to not be corelated with BMI.

In [None]:
df3.stroke.value_counts().plot(kind='pie',autopct='%1.1f%%',fontsize=17)

### The dataset has only 4% of the datapoints leading to 'Stroke' as outcome.

# 4. Feature Engineering.

In [None]:
data = df3.copy()

## 4.1. Encoding the categorical values.

In [None]:
object_cols = [col for col in data.columns if data[col].dtype == "object"]
print(*object_cols, sep=',')

In [None]:
object_nunique = list(map(lambda col: data[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])

### The maximum categorical values in a column are 5, so we can use ordinal encoding.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
data[object_cols] =ordinal_encoder.fit_transform(data[object_cols])

In [None]:
data.head(4)

In [None]:
fig15,ax = plt.subplots(figsize=(7, 5))
sns.heatmap(data.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

In [None]:
X = data.drop('stroke',axis = 1 )
y=data.stroke

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.35, random_state=21)

## 4.2.1 Feature selection with correlation and random forest classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score

#random forest classifier with n_estimators=10 (default)
clf_rf = RandomForestClassifier(random_state=43)      
clr_rf = clf_rf.fit(X_train,y_train)

ac = accuracy_score(y_valid,clf_rf.predict(X_valid))
print('Accuracy is: ',round(ac*100), ' %')
cm = confusion_matrix(y_valid,clf_rf.predict(X_valid))
sns.heatmap(cm,annot=True,fmt="d")

### Accuracy is almost 95% and as it can be seen in confusion matrix, we make few wrong prediction. Now lets see other feature selection methods to find better results.

## 4.2.2. Univariate feature selection and random forest classification.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# find best scored 5 features
select_feature = SelectKBest(chi2, k=5).fit(X_train, y_train)
print('Score list:', select_feature.scores_)
print('Feature list:', X_train.columns)

### Best 5 feature to classify are hypertension, heart disease, married status, bmi and smoking status. So lets see what happens if we use only these best scored 5 feature.

In [None]:
X_train_2 = select_feature.transform(X_train)
X_valid_2 = select_feature.transform(X_valid)
#random forest classifier with n_estimators=10 (default)
clf_rf_2 = RandomForestClassifier()      
clr_rf_2 = clf_rf_2.fit(X_train_2,y_train)
ac_2 = accuracy_score(y_valid,clf_rf_2.predict(X_valid_2))
print('Accuracy is: ',round(ac_2*100), ' %')
cm_2 = confusion_matrix(y_valid,clf_rf_2.predict(X_valid_2))
sns.heatmap(cm_2,annot=True,fmt="d")

### Accuracy is reduced to 94% and as it can be seen in confusion matrix, we make few wrong prediction.Although we use 5 features in selectkBest method accuracies degraded. Now lets see other feature selection methods to find better results.

## 4.2.3. Recursive feature elimination (RFE) with random forest

In [None]:
from sklearn.feature_selection import RFE
# Create the RFE object and rank each pixel
clf_rf_3 = RandomForestClassifier()      
rfe = RFE(estimator=clf_rf_3, n_features_to_select=5, step=1)
rfe = rfe.fit(X_train, y_train)
print('Chosen best 5 feature by rfe:',X_train.columns[rfe.support_])

In [None]:
rfe_col=X_train.columns[rfe.support_]
rfe_col.values.tolist()

In [None]:
X_train_3 = X_train[rfe_col]
X_valid_3 = X_valid[rfe_col]
#random forest classifier with n_estimators=10 (default)
clf_rf_3 = RandomForestClassifier()      
clr_rf_3 = clf_rf_3.fit(X_train_3,y_train)
ac_3 = accuracy_score(y_valid,clf_rf_3.predict(X_valid_3))
print('Accuracy is: ',round(ac_3*100), ' %')
cm_3 = confusion_matrix(y_valid,clf_rf_3.predict(X_valid_3))
sns.heatmap(cm_3,annot=True,fmt="d")

### This model still gave an accuracy of 96%, and performed slightly better to detect stroke value "1".

## 4.2.4. Recursive feature elimination with cross validation and random forest classification

In [None]:
from sklearn.feature_selection import RFECV

# The "accuracy" scoring is proportional to the number of correct classifications
clf_rf_4 = RandomForestClassifier() 
rfecv = RFECV(estimator=clf_rf_4, step=1, cv=7,scoring='accuracy')
rfecv = rfecv.fit(X_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X.columns[rfecv.support_])

In [None]:
rfecv_col=X_train.columns[rfecv.support_]
rfecv_col.values.tolist()

X_train_4 = X_train[rfecv_col]
X_valid_4 = X_valid[rfecv_col]

rfecv_1 = rfecv.fit(X_train_4, y_train)

In [None]:
ac_4 = accuracy_score(y_valid,rfecv_1.predict(X_valid_4))
print('Accuracy is: ',round(ac_4*100), ' %')
cm_4 = confusion_matrix(y_valid,rfecv_1.predict(X_valid_4))
sns.heatmap(cm_4,annot=True,fmt="d")

In [None]:
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

## 4.2.5. Tree based feature selection and random forest classification

In [None]:
clf_rf_5 = RandomForestClassifier()      
clr_rf_5 = clf_rf_5.fit(X,y)
importances = clr_rf_5.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf_rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest

plt.figure(1, figsize=(14, 5))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="g", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), X.columns[indices],rotation=90)
plt.xlim([-1, X.shape[1]])
plt.show()

## Since, all models are predicting the True Negatives close to zero, we will have to try other algorithms.

# 4.3. Model Selection.

## 4.3.1. Support Vector Machine.

In [None]:
from sklearn import svm
svm = svm.SVC(gamma='auto',C=10,kernel='linear')
svm = svm.fit(X_train,y_train)
ac_svm = accuracy_score(y_valid,svm.predict(X_valid))
print('Accuracy is: ',round(ac_svm*100), ' %')
cm_svm = confusion_matrix(y_valid,svm.predict(X_valid))
sns.heatmap(cm_svm,annot=True,fmt="d")

## 4.3.2. Logistic Regression.

In [None]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(solver='liblinear',class_weight='balanced',multi_class='auto', C=80)
lr = lr.fit(X_train,y_train)
ac_lr = accuracy_score(y_valid,lr.predict(X_valid))
print('Accuracy is: ',round(ac_lr*100), ' %')
cm_lr = confusion_matrix(y_valid,lr.predict(X_valid))
sns.heatmap(cm_lr,annot=True,fmt="d")

## 4.3.3. Multinomial Naive Bayes.

In [None]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb = mnb.fit(X_train,y_train)
ac_mnb = accuracy_score(y_valid,mnb.predict(X_valid))
print('Accuracy is: ',round(ac_mnb*100), ' %')
cm_mnb = confusion_matrix(y_valid,mnb.predict(X_valid))
sns.heatmap(cm_mnb,annot=True,fmt="d")

## 4.3.4. Gaussian Naive Bayes.

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb=GaussianNB()
gnb = gnb.fit(X_train,y_train)
ac_gnb = accuracy_score(y_valid,gnb.predict(X_valid))
print('Accuracy is: ',round(ac_gnb*100), ' %')
cm_gnb = confusion_matrix(y_valid,gnb.predict(X_valid))
sns.heatmap(cm_gnb,annot=True,fmt="d")

### Gaussian Naive Bayes returned the most correct predictions for True Negatives.
### The accuracy is 86%, which can be improved with feature selection.

# 4.4. Feture Selction with Gaussian Naive Bayes.

## 4.4.1 Univariate feature selection and Gaussian Naive Bayes.

In [None]:
# As we already have found top 5 features, we will use them with Gaussian Naive Byes.
gnb2 = GaussianNB()       
gnb2 = gnb2.fit(X_train_2,y_train)
ac_gnb2 = accuracy_score(y_valid,gnb2.predict(X_valid_2))
print('Accuracy is: ',round(ac_gnb2*100), ' %')
cm_gng2 = confusion_matrix(y_valid,gnb2.predict(X_valid_2))
sns.heatmap(cm_gng2,annot=True,fmt="d")

### Accuracy is still 86%, but predictions of True Negatives have improved.

## 4.4.2.. Recursive feature elimination (RFE) with GaussianNB.

In [None]:
from sklearn.feature_selection import RFE
# Create the RFE object and rank each pixel
gnb3 = GaussianNB()      
rfe_2 = RFE(estimator=gnb3, n_features_to_select=5, step=1)
rfe = rfe.fit(X_train, y_train)
print('Chosen best 5 feature by rfe:',X_train.columns[rfe.support_])

### The features suggested for GuassianNB are same as that for Random Forrest Classifier. 

In [None]:
gnb4 = GaussianNB()      
gnb4 = gnb4.fit(X_train_3,y_train)
ac_gnb4 = accuracy_score(y_valid,gnb4.predict(X_valid_3))
print('Accuracy is: ',round(ac_gnb4*100), ' %')
cm_gnb4 = confusion_matrix(y_valid,gnb4.predict(X_valid_3))
sns.heatmap(cm_gnb4,annot=True,fmt="d")

### The accuracy increased to 91%, but correct predictions of True Negatives reduced.

# 5. Conclusion.

## Though Random forrest classifier gave an accuracy of more than 90%, Gaussian Naive Bayes predict most correct True Negatives, which is the main result required from the model.