In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

In [None]:
sp_df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
sp_df.head()

In [None]:
sp_df.shape

In [None]:
sp_df.info()

In [None]:
sp_df['stroke'].describe()

In [None]:
# `stroke` is a 0/1 label, so show how many rows fall in each class instead of a
# density estimate (sns.distplot is deprecated in current seaborn releases).
sns.countplot(x=sp_df['stroke']);

In [None]:
# Distribution of BMI: density-scaled histogram with a KDE overlay.
# histplot is the maintained replacement for the deprecated sns.distplot.
sns.histplot(sp_df['bmi'], kde=True, stat="density");

In [None]:
# Distribution of average glucose level: density-scaled histogram with a KDE overlay.
# histplot is the maintained replacement for the deprecated sns.distplot.
sns.histplot(sp_df['avg_glucose_level'], kde=True, stat="density");

In [None]:
# `heart_disease` is a two-valued flag, so plot the number of rows per value instead
# of a density estimate (sns.distplot is deprecated in current seaborn releases).
sns.countplot(x=sp_df['heart_disease']);

In [None]:
sns.pairplot(sp_df)

In [None]:
# Skewness and kurtosis of the target column.
target_skew = sp_df['stroke'].skew()
target_kurt = sp_df['stroke'].kurt()
print(f"Skewness: {target_skew}")
print(f"Kurtosis: {target_kurt}")

In [None]:
# Correlation matrix.
# Only numeric columns are correlated: columns such as gender are label-encoded
# further down in the notebook, and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0 (older versions silently skipped such columns).
corrmat = sp_df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above so the requested figure size is the one used.
sns.heatmap(corrmat, vmax=.8, annot=True, square=True, ax=ax);

In [None]:
# Missing data: null count and null fraction for every column, largest first.
null_mask = sp_df.isnull()
total = null_mask.sum().sort_values(ascending=False)
# Despite the column label 'Percent', this is a fraction in [0, 1].
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
def missing_zero_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Despite the name, only null values are counted; zeros are not inspected.
    A one-line summary (frame dimensions and number of affected columns) is
    printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, sorted by percentage of
        nulls (descending). The first column holds the original column name
        (named ``index`` when ``df.columns`` is unnamed), followed by
        ``Missing Values`` (count), ``% of Total Values`` (rounded to two
        decimals) and ``Data_type``.
    """
    mis_val = df.isnull().sum()
    mis_val_percent = df.isnull().mean().mul(100).round(2)
    mz_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mz_table.columns = ['Missing Values', '% of Total Values']
    mz_table['Data_type'] = df.dtypes
    # Filter on the raw count, not on the rounded percentage: a column with a
    # handful of nulls in a large frame rounds to 0.00 % and would otherwise be
    # dropped from the report even though it does contain missing values.
    mz_table = mz_table[mz_table['Missing Values'] > 0].sort_values(
        '% of Total Values', ascending=False)
    print(f"Your selected dataframe has {df.shape[1]} columns and {df.shape[0]} Rows.\n"
          f"There are {mz_table.shape[0]} columns that have missing values.")
    return mz_table.reset_index()


In [None]:
# Build the missing-value summary, then colour its first 20 rows with a red gradient.
missing = missing_zero_values_table(sp_df)
missing.head(20).style.background_gradient(cmap='Reds')

In [None]:
# Fill missing BMI values with the column mean.
# The result is assigned back to the column rather than using
# `sp_df['bmi'].fillna(..., inplace=True)`: that chained in-place form acts on an
# intermediate object and leaves sp_df unchanged under pandas Copy-on-Write.
sp_df['bmi'] = sp_df['bmi'].fillna(sp_df['bmi'].mean())

In [None]:
sp_df.isnull().sum()

In [None]:
# Class balance of the target: one bar per distinct value of `stroke`,
# with bar height equal to the number of rows holding that value.
sp_df.stroke.value_counts().plot(kind="bar", color=["salmon", "lightblue"]);

In [None]:
sp_df.head()

In [None]:
# Number of records in each gender category

sp_df.gender.value_counts()

In [None]:
# Compare target column with sex column
pd.crosstab(sp_df.stroke,sp_df.gender)

In [None]:
# Stroke counts split by gender.
# No manual legend is set: pandas labels each bar series with the crosstab's own
# column names, so colours and labels cannot drift apart if the set of gender
# categories in the data changes.
pd.crosstab(sp_df.stroke, sp_df.gender).plot(kind="bar",
                                    figsize=(10,6),
                                    color=["salmon", "lightblue","crimson"]);

plt.title("Stroke Frequency based on Gender")
plt.xlabel("0 = No Disease, 1 = Disease")
plt.ylabel("Amount")
plt.xticks(rotation=0); # keep the x-axis tick labels horizontal

In [None]:
sp_df['smoking_status'].value_counts()

In [None]:
# Stroke counts split by smoking status; legend entries come from the crosstab's column labels.
pd.crosstab(sp_df.stroke, sp_df.smoking_status).plot(kind="bar",
                                    figsize=(10,6),
                                    color=["salmon", "lightblue","crimson","orange"]);
plt.title("Stroke Frequency based on Smoking Status")
plt.xlabel("0 = No Disease, 1 = Disease")
plt.ylabel("Amount")
plt.xticks(rotation=0); # keep the x-axis tick labels horizontal

In [None]:
sp_df['work_type'].value_counts()

In [None]:
# Stroke counts split by work type; legend entries come from the crosstab's column labels.
pd.crosstab(sp_df.stroke, sp_df.work_type).plot(kind="bar",
                                    figsize=(10,6),
                                    color=["salmon", "lightblue","crimson","orange","blue"]);
plt.title("Stroke Frequency based on Work Type")
plt.xlabel("0 = No Disease, 1 = Disease")
plt.ylabel("Amount")
plt.xticks(rotation=0); # keep the x-axis tick labels horizontal

In [None]:
sp_df["hypertension"].value_counts()

In [None]:
# Stroke counts split by hypertension status.
# The crosstab columns are renamed before plotting so that each legend entry is tied
# to its value. Crosstab columns come out in sorted order (0 before 1), so a
# positional legend starting with "HyperTension" would label the 0 group wrongly.
# Assumes hypertension is coded 0/1; any other value keeps its raw label.
hypertension_by_stroke = pd.crosstab(sp_df.stroke, sp_df.hypertension).rename(
    columns={0: "No HyperTension", 1: "HyperTension"})
hypertension_by_stroke.plot(kind="bar",
                            figsize=(10,6),
                            color=["salmon", "crimson",]);
plt.title("Stroke Frequency based on Hyper Tension")
plt.xlabel("0 = No Disease, 1 = Disease")
plt.ylabel("Amount")
plt.xticks(rotation=0); # keep the x-axis tick labels horizontal

In [None]:
sp_df["heart_disease"].value_counts()

In [None]:
# Stroke counts split by heart-disease status (the `heart_disease` column, matching
# the title of this chart).
# The crosstab columns are renamed before plotting so that each legend entry is tied
# to its value; crosstab columns come out in sorted order (0 before 1).
# Assumes heart_disease is coded 0/1; any other value keeps its raw label.
heart_disease_by_stroke = pd.crosstab(sp_df.stroke, sp_df.heart_disease).rename(
    columns={0: "No HeartDisease", 1: "HeartDisease"})
heart_disease_by_stroke.plot(kind="bar",
                             figsize=(10,6),
                             color=["salmon", "crimson",]);
plt.title("Stroke Frequency based on Heart Disease")
plt.xlabel("0 = No Disease, 1 = Disease")
plt.ylabel("Amount")
plt.xticks(rotation=0); # keep the x-axis tick labels horizontal

In [None]:
sp_df["Residence_type"].value_counts()

In [None]:
# Stroke counts split by residence type; legend entries come from the crosstab's column labels.
pd.crosstab(sp_df.stroke, sp_df.Residence_type).plot(kind="bar",
                                    figsize=(10,6),
                                    color=["salmon", "crimson",]);
plt.title("Stroke Frequency based on Residence type")
plt.xlabel("0 = No Disease, 1 = Disease")
plt.ylabel("Amount")
plt.xticks(rotation=0); # keep the x-axis tick labels horizontal

In [None]:
sp_df["ever_married"].value_counts()

In [None]:
# Stroke counts split by marital status.
# The crosstab columns are renamed before plotting so that each legend entry is tied
# to its value; crosstab columns come out in sorted order, so a positional legend
# starting with "Married" would be attached to the first (alphabetically smallest) value.
# Assumes ever_married holds "Yes"/"No" - TODO confirm; other values keep their raw label.
married_by_stroke = pd.crosstab(sp_df.stroke, sp_df.ever_married).rename(
    columns={"No": "Not Married", "Yes": "Married"})
married_by_stroke.plot(kind="bar",
                       figsize=(10,6),
                       color=["salmon", "crimson",]);
plt.title("Stroke Frequency based on Marital Status")
plt.xlabel("0 = No Disease, 1 = Disease")
plt.ylabel("Amount")
plt.xticks(rotation=0); # keep the x-axis tick labels horizontal

In [None]:
sp_df.head()

In [None]:
#Handling Categorical Variables

from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
# Overwrite each categorical column with integer codes. The same encoder object is
# re-fitted for every column, so after the loop it only holds the classes of the
# last column processed.
for column in ('gender', 'ever_married', 'work_type', 'Residence_type'):
    sp_df[column] = label.fit_transform(sp_df[column])


In [None]:
sp_df['smoking_status'] = label.fit_transform(sp_df['smoking_status'])

In [None]:
#standardizing the dataset with Standard Scaler
from sklearn.preprocessing import StandardScaler 
  
scalar = StandardScaler() 
  
# NOTE(review): the scaler is fitted on every column of sp_df, which includes the
# `stroke` target (and any identifier column still present) - confirm this is intended.
scalar.fit(sp_df) 
# The scaled values go into a separate array; sp_df itself is left unscaled, and
# `scaled_data` is not referenced again in the cells visible in this notebook.
scaled_data = scalar.transform(sp_df)

In [None]:
sp_df.head()

## Logistic Regression

In [None]:
# modelling
# Reference : https://www.kaggle.com/neisha/heart-disease-prediction-using-logistic-regression

from statsmodels.tools import add_constant as add_constant

stroke_df_constant = add_constant(sp_df)
stroke_df_constant.head()

In [None]:
import statsmodels.api as sm
import scipy.stats as st

# Compatibility shim: defines scipy.stats.chisqprob in terms of the chi-square
# survival function (presumably for statsmodels versions that still look it up).
st.chisqprob = lambda chisq, df: st.chi2.sf(chisq, df)
# Select the predictors by name rather than by position, so the target is excluded
# even if `stroke` is not the last column of the frame.
cols = stroke_df_constant.columns.drop('stroke')
model = sm.Logit(sp_df.stroke,stroke_df_constant[cols])
result = model.fit()
result.summary()

### Feature Selection: Backward elimination (P-value approach)



In [None]:
def back_feature_elem (data_frame,dep_var,col_list,alpha=0.05):
    """Backward feature elimination for a logistic regression, driven by p-values.

    Repeatedly fits ``sm.Logit(dep_var, data_frame[columns])`` and removes the
    single feature with the highest p-value, until every remaining p-value is
    below ``alpha``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding the candidate predictor columns.
    dep_var : array-like
        Dependent (target) variable passed to ``sm.Logit``.
    col_list : pandas.Index or sequence of column labels
        Candidate predictors to start from.
    alpha : float, default 0.05
        Significance level a feature's p-value must stay below to be kept.

    Returns
    -------
    The fitted statsmodels result in which all p-values are below ``alpha``,
    or ``None`` if every feature was eliminated.
    """
    # Accept a plain list as well as an Index; Index.drop is used below.
    remaining = pd.Index(col_list)

    while len(remaining) > 0:
        result = sm.Logit(dep_var, data_frame[remaining]).fit(disp=0)
        pvalues = result.pvalues
        # Compare the unrounded p-value, addressed by label: rounding first would
        # discard features whose p-value is just below alpha, and positional
        # `series[0]` on a label-indexed Series is deprecated in pandas.
        worst_feature = pvalues.idxmax()
        if pvalues[worst_feature] < alpha:
            return result
        remaining = remaining.drop(worst_feature)

    # Every candidate was removed without reaching significance.
    return None

result=back_feature_elem(stroke_df_constant,sp_df.stroke,cols)

In [None]:
result.summary()

### Interpreting the results: Odds Ratios, Confidence Intervals and P-values

In [None]:
# Exponentiate the fitted coefficients and their confidence bounds so they read as
# odds ratios, then attach the rounded p-values in a single table.
params = np.exp(result.params)
pvalue = result.pvalues.round(3)
conf = np.exp(result.conf_int()).assign(OR=params, pvalue=pvalue)
conf.columns = ['CI 95%(2.5%)', 'CI 95%(97.5%)', 'Odds Ratio','pvalue']
print(conf)

In [None]:
# Target vector, and feature matrix made of every other column of sp_df.
# NOTE(review): any identifier-like column still in sp_df becomes a feature here - confirm.
y = sp_df['stroke']
X = sp_df.drop(columns=['stroke'])


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=.20, random_state=5
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the training split
# (fit returns the estimator itself), then predict labels for the test split.
logreg = LogisticRegression().fit(x_train, y_train)
y_pred = logreg.predict(x_test)

### Model Evaluation 
#### Model Accuracy

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

#### Accuracy of the model is 94 %

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0','Actual:1'],
    columns=['Predicted:0','Predicted:1'],
)
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu", ax=ax)

The confusion matrix shows 967 correct predictions (0 true positives + 967 true negatives) and 52 + 3 = 55 incorrect ones.

### True Positives : 0
### True Negatives : 967
### False Positives: 3 (Type 1 Error)
### False Negatives: 52 (Type 2 Error)



In [None]:
# Flattening the 2x2 matrix row by row yields [0,0], [0,1], [1,0], [1,1],
# i.e. true negatives, false positives, false negatives, true positives.
TN, FP, FN, TP = cm.ravel()
sensitivity = TP / float(TP + FN)
specificity = TN / float(TN + FP)

### Model Evaluation - Statistics

In [None]:
def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator != 0 else float('nan')


# Summary statistics derived from the confusion-matrix cells TP, TN, FP, FN and the
# `sensitivity` / `specificity` values computed in the previous cell.
# A ratio whose denominator is zero (e.g. no positive predictions at all) is
# reported as NaN rather than triggering a division warning.
accuracy = _ratio(TP + TN, TP + TN + FP + FN)
model_statistics = [
    ('The accuracy of the model = (TP+TN)/(TP+TN+FP+FN)', accuracy),
    ('The Misclassification = 1-Accuracy', 1 - accuracy),
    ('Sensitivity or True Positive Rate = TP/(TP+FN)', _ratio(TP, TP + FN)),
    ('Specificity or True Negative Rate = TN/(TN+FP)', _ratio(TN, TN + FP)),
    ('Positive Predictive value = TP/(TP+FP)', _ratio(TP, TP + FP)),
    ('Negative predictive Value = TN/(TN+FN)', _ratio(TN, TN + FN)),
    ('Positive Likelihood Ratio = Sensitivity/(1-Specificity)', _ratio(sensitivity, 1 - specificity)),
    ('Negative likelihood Ratio = (1-Sensitivity)/Specificity', _ratio(1 - sensitivity, specificity)),
]
# One statistic per line, without the stray leading spaces that print's default
# separator adds when '\n' is passed as its own argument.
for label, value in model_statistics:
    print(f'{label} = {value}')

From the above statistics it is clear that the model is far more specific than sensitive: the negative class is predicted much more accurately than the positive class.

In [None]:
# Predicted class probabilities for the test rows, one column per class.
y_pred_prob = logreg.predict_proba(x_test)
y_pred_prob_df = pd.DataFrame(
    y_pred_prob,
    columns=['Prob of no Stroke(0)','Prob of Stroke (1)'],
)
y_pred_prob_df.head()

### Lower the threshold

In [None]:
from sklearn.preprocessing import binarize
# The predicted probabilities do not depend on the threshold, so compute them once.
# `y_pred_prob_yes` is also read by the ROC / AUC cells further down.
y_pred_prob_yes=logreg.predict_proba(x_test)
for i in range(1,5):
    # Values strictly above the threshold become 1, the rest 0; column 1 is the
    # positive class. `threshold` is passed by keyword because current
    # scikit-learn releases do not accept it positionally.
    y_pred2=binarize(y_pred_prob_yes,threshold=i/10)[:,1]
    cm2=confusion_matrix(y_test,y_pred2)
    print ('With',i/10,'threshold the Confusion Matrix is ','\n',cm2,'\n',
            'with',cm2[0,0]+cm2[1,1],'correct predictions and',cm2[1,0],'Type II errors( False Negatives)','\n\n',
          'Sensitivity: ',cm2[1,1]/(float(cm2[1,1]+cm2[1,0])),'Specificity: ',cm2[0,0]/(float(cm2[0,0]+cm2[0,1])),'\n\n\n')
    

### ROC Curve

In [None]:
from sklearn.metrics import roc_curve
# ROC curve computed from the predicted probability of the positive class (column 1).
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_yes[:,1])
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_title('ROC curve for Stroke disease classifier')
ax.set_xlabel('False positive rate (1-Specificity)')
ax.set_ylabel('True positive rate (Sensitivity)')
ax.grid(True)

### Area under the curve

In [None]:
from sklearn import metrics
metrics.roc_auc_score(y_test,y_pred_prob_yes[:,1])

### k-NN

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Mean 10-fold cross-validation score of a k-NN classifier for k = 1..20;
# knn_scores[k-1] holds the score obtained with k neighbours.
knn_scores = [
    cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=10).mean()
    for k in range(1, 21)
]


In [None]:

# Score as a function of k, with every point annotated by its (k, score) pair.
k_values = list(range(1, 21))
plt.plot(k_values, knn_scores, color = 'red')
for k, mean_score in zip(k_values, knn_scores):
    plt.text(k, mean_score, (k, mean_score))
plt.xticks(k_values)
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Scores')
plt.title('K Neighbors Classifier scores for different K values')

In [None]:

knn_classifier = KNeighborsClassifier(n_neighbors = 12)
score=cross_val_score(knn_classifier,X,y,cv=10)

In [None]:
score.mean()

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# A random forest is a stochastic estimator: fix its seed (the same value used for
# the train/test split above) so the cross-validation score is reproducible.
randomforest_classifier= RandomForestClassifier(n_estimators=10, random_state=5)

# 10-fold cross-validation on the full feature matrix and target.
score=cross_val_score(randomforest_classifier,X,y,cv=10)

In [None]:
score.mean()

## Reference 

1. https://www.kaggle.com/salmaeng/statistical-analysis-eda
2. https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
3. https://www.kaggle.com/swatis1/stroke-prediction
4. https://www.kaggle.com/neisha/heart-disease-prediction-using-logistic-regression    
