## Name: Jay Shah

## Date: 29-6-2021

# Stroke Prediction using Logistic Regression

In [None]:
import numpy as np
import pandas as pd
import plotly.offline as pyo
# Initialise Plotly's offline notebook mode before any figure is shown
# (presumably so charts render inline in the notebook — confirm for your environment).
pyo.init_notebook_mode()

In [None]:
# Read the stroke dataset CSV into a DataFrame and preview it.
df = pd.read_csv(
    filepath_or_buffer='../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
df

# Checking the null values in whole dataset

In [None]:
# Number of missing values in each column (isna is the same check as isnull;
# sum() adds down the rows by default).
df.isna().sum()

# Calculating the unique values in columns

In [None]:
# Report how many distinct residence types exist and what they are.
residence_column = df['Residence_type']
print("Total unique values in Residence_type column are: ", residence_column.nunique())
print("Unique values are: ", residence_column.unique())

In [None]:
# Report how many distinct work types exist and what they are.
work_type_column = df['work_type']
print("Total unique values in work_type column are: ", work_type_column.nunique())
print("Unique values are: ", work_type_column.unique())

In [None]:
# Report how many distinct marital-status values exist and what they are.
married_column = df['ever_married']
print("Total unique values in ever_married column are: ", married_column.nunique())
print("Unique values are: ", married_column.unique())

In [None]:
# Report how many distinct smoking statuses exist and what they are.
smoking_column = df['smoking_status']
print("Total unique values in smoking_status column are: ", smoking_column.nunique())
print("Unique values are: ", smoking_column.unique())

In [None]:
# Inspect the gender column. This cell previously repeated the smoking_status
# report verbatim, leaving gender as the only categorical column used later
# in the notebook that was never inspected.
print("Total unique values in gender column are: ",df['gender'].nunique())
print("Unique values are: ",df['gender'].unique())

# Calculating the total values of each category in above columns

In [None]:
# Count the records in each residence category and expose the result as a DataFrame.
x = df['Residence_type'].value_counts().reset_index()
x

# Checking the smoking status of the people

In [None]:
# Tabulate the number of people per smoking status with descriptive column names.
smoking_counts = df['smoking_status'].value_counts()
smoking_counts = smoking_counts.rename_axis('Smoking-Status')
interim_data = smoking_counts.reset_index(name='Counts in each category of smoking status')
interim_data

In [None]:
import plotly.express as px

# Bar chart of the number of people in each smoking-status category.
fig = px.bar(
    interim_data,
    x='Smoking-Status',
    y='Counts in each category of smoking status',
    color='Smoking-Status',
    hover_data=['Smoking-Status', 'Counts in each category of smoking status'],
    title='Category of people in smoking',
    width=900,
    height=700,
)
fig.update_xaxes(type='category')  # treat the statuses as discrete labels
fig.show()

### It is visible from above graph that out of 5110 people there are 1892 people who have never smoked.

# Calculating the percentage average level of glucose in male and female

In [None]:
import plotly.express as px
# Pie chart of avg_glucose_level split by gender.
# NOTE(review): px.pie appears to merge all rows sharing the same `names` value into
# one sector and add up their `values`, so each slice would be that gender's share of
# the summed glucose readings (largely driven by how many rows the gender has), not a
# per-gender average — confirm against the chart title before drawing conclusions.
fig = px.pie(df, values='avg_glucose_level', names='gender',title='Average level of glucose in Males and Females',width=800,color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()

### From the above figure it is clearly visible that average level of glucose is more in females and negligible in other category

# Calculating the percentage average level of glucose according to residence type

In [None]:
# Pie chart of avg_glucose_level split by residence type.
# NOTE(review): px.pie appears to merge all rows sharing the same `names` value into
# one sector and add up their `values`, so each slice would be the residence type's
# share of the summed glucose readings rather than a per-group average — confirm.
fig = px.pie(df, values='avg_glucose_level', names='Residence_type', title='Average level of glucose residence wise',width=800)
fig.show()

### It is apparent from the above figure that the average level of glucose is approximately same in both urban and rural areas.

In [None]:
# Tabulate the number of people per work type with descriptive column names.
work_type_counts = df['work_type'].value_counts()
work_type_counts = work_type_counts.rename_axis('Type of Work')
interim_data = work_type_counts.reset_index(name='Number of people working in each category')
interim_data


In [None]:
# Bar chart of the number of people in each work category.
fig = px.bar(
    interim_data,
    x='Type of Work',
    y='Number of people working in each category',
    color='Type of Work',
    hover_data=['Type of Work', 'Number of people working in each category'],
    title='Number of people in working category',
    width=900,
    height=700,
)
fig.update_xaxes(type='category')  # treat the work types as discrete labels
fig.show()

### So from above graph we can tell that majority of the people are having private jobs

# Plotting the figure to find which gender is suffering more from hypertension and heart disease

In [None]:
# Keep only the gender column and the two condition columns.
disease_columns = ['gender', 'heart_disease', 'hypertension']
interim_df = df.filter(items=disease_columns)
interim_df

In [None]:
# Add up the condition columns per gender so each cell holds the number of people who
# actually have the condition. The previous .count() only counted non-null rows, which
# yields the head-count of each gender (identical for both columns) regardless of
# whether anyone is affected.
# Assumes heart_disease and hypertension are 0/1 indicator columns — confirm against the data.
# Any conclusion drawn from the chart built on this table should be re-checked against
# the corrected numbers.
interim_df = (
    df.groupby('gender')[['heart_disease', 'hypertension']]
    .sum()
    .reset_index()
)
interim_df

In [None]:
import plotly.graph_objects as go

# One bar trace per condition, with the genders along the x-axis.
gender_axis = interim_df['gender']
disease_traces = [
    go.Bar(
        name='People suffering from heart disease',
        x=gender_axis,
        y=interim_df['heart_disease'],
    ),
    go.Bar(
        name='People suffering from hypertension',
        x=gender_axis,
        y=interim_df['hypertension'],
    ),
]
fig = go.Figure(data=disease_traces)
# Draw the two traces side by side for each gender.
fig.update_layout(
    barmode='group',
    height=700,
    width=1000,
    title='Number of males and females suffering from diseases',
)
fig.show()

### From the above graph we can conclude that females are more prone than males to heart disease and hypertension.

# Checking how many people are married or not

In [None]:
# Name the label column and the count column explicitly. The column names produced by
# value_counts().reset_index() differ between pandas versions, so relying on the
# implicit 'index' / 'ever_married' names breaks the chart on newer pandas.
interim_data = (
    df['ever_married']
    .value_counts()
    .rename_axis('Status whether people are married or not')
    .reset_index(name='ever_married')
)
# Pie chart of the share of people in each marital-status category.
fig = px.pie(interim_data,values='ever_married', names='Status whether people are married or not', title='Percentage of people married or not',width=800)
fig.show()

### We can clearly see from the above graph that approximately 65% of people are married.

# Printing the original dataframe

In [None]:
# Display the DataFrame as it currently stands.
df

### As we know, the bmi column has NA values that need to be replaced. Instead of removing those rows, we can impute the NA values with the mean, median or mode of the column. Below, I apply the mean of the column, so every NA value is replaced by it.

In [None]:
# Mean of the bmi column; this is the value used to fill the missing entries.
df['bmi'].mean()

In [None]:
# Replace missing bmi values with the column mean.
# The filled column is assigned back to df instead of calling fillna(inplace=True) on
# df['bmi']: the in-place call operates on an intermediate object, which newer pandas
# warns about and which leaves df unchanged when Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df

In [None]:
import plotly.express as px

# Scatter plot of body mass index against age.
fig = px.scatter(
    df,
    x="age",
    y="bmi",
    title='Body Mass Index in various age groups',
    width=1000,
    height=800,
)
fig.show()

# Feature Selection Process

### From the dataframe printed two cells above, we have to remove the 'id' column in order to train the model. Secondly, we need to convert categorical data to numerical data. The columns gender, ever_married, work_type, Residence_type and smoking_status are nominal categorical variables, so we can apply one-hot encoding to convert them into numeric data. Let us first remove the 'id' column.

In [None]:
# Remove the id column from df in place.
df.drop(labels='id', axis='columns', inplace=True)
df

### Detecting dependency between categorical variables by using Pearson's Chi-Square Test. By performing the test we will be able to decide whether we need to keep the particular column or not.

### Pearson's Chi-Square Test: The Chi-Squared test is a statistical hypothesis test that assumes (the null hypothesis) that the observed frequencies for a categorical variable match the expected frequencies for the categorical variable. The test calculates a statistic that has a chi-squared distribution.

### Below, the chi-square test is performed between two categorical variables, 'gender' and 'ever_married'.

In [None]:
def create_contingency_table(dataframe):
    """Convert a DataFrame of counts into a row-major list of lists.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table of observed frequencies, e.g. the result of ``pd.crosstab``.

    Returns
    -------
    list of list
        One inner list per row of ``dataframe``, holding that row's values in
        column order. An empty frame yields ``[]``.
    """
    # Read the cells by position. Looking rows up with ``series[i]`` treats ``i``
    # as a label, which fails (or relies on deprecated positional fallback) whenever
    # the index is not the default 0..n-1 range — e.g. the category labels that
    # ``pd.crosstab`` puts in the index.
    return dataframe.to_numpy().tolist()


def chi_square_test(table, prob=0.95):
    """Run a chi-square test of independence on a contingency table and print the verdict.

    The degrees of freedom, test statistic, p-value and expected frequencies are
    printed first, followed by two interpretations of the result: one comparing the
    statistic with the critical value, one comparing the p-value with the
    significance level.

    Parameters
    ----------
    table : array-like
        Contingency table of observed frequencies, passed to
        ``scipy.stats.chi2_contingency``.
    prob : float, optional
        Confidence level, strictly between 0 and 1. The significance level is
        ``1 - prob``. Defaults to 0.95.

    Returns
    -------
    None
        All results are reported through ``print``.

    Raises
    ------
    ValueError
        If ``prob`` is not strictly between 0 and 1.
    """
    from scipy.stats import chi2
    from scipy.stats import chi2_contingency

    # Outside (0, 1) the critical value is not meaningful and every comparison
    # below would silently report a verdict anyway.
    if not 0.0 < prob < 1.0:
        raise ValueError('prob must be strictly between 0 and 1, got %r' % (prob,))

    stat, p, dof, expected = chi2_contingency(table)
    print('Degree of freedom: ', dof)
    print('Stat is: ', stat)
    print('P-value is: ',p)
    print('Expected frquencies: ',expected)

    # Interpretation 1: compare the statistic with the critical value at `prob`.
    critical = chi2.ppf(prob, dof)
    print('Critical value=%.3f, Stat=%.3f' % (critical, stat))
    if abs(stat) >= critical:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')

    # Interpretation 2: compare the p-value with the significance level.
    alpha = 1.0 - prob
    print('significance=%.3f, p=%.3f' % (alpha, p))
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')

In [None]:
# Cross-tabulate gender against marital status, show the counts, then run the
# chi-square test on the plain list-of-lists form of the table.
data_gender_married = pd.crosstab(index=df['gender'], columns=df['ever_married'])
print(data_gender_married)

table = create_contingency_table(data_gender_married)
print(table)

chi_square_test(table)

### From the above output we came to know that gender and ever_married columns are dependent on each other, therefore we can discard the ever_married columns as a part of feature selection process. 

In [None]:
# One-hot encode each nominal column into its own indicator DataFrame.
(
    dummies_gender,
    dummies_work_type,
    dummies_residence_type,
    dummies_smoking_status,
) = (
    pd.get_dummies(df[column_name])
    for column_name in ('gender', 'work_type', 'Residence_type', 'smoking_status')
)

In [None]:
# Append the indicator columns to the right of the original columns.
frames_to_join = [
    df,
    dummies_gender,
    dummies_work_type,
    dummies_residence_type,
    dummies_smoking_status,
]
final_df = pd.concat(frames_to_join, axis=1)
final_df

### Now from the above dataframe we can now drop the old categorical values as they are being converted to numeric data through one hot encoding. 

In [None]:
# Drop the original text-valued categorical columns from final_df in place.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
final_df.drop(columns=categorical_columns, inplace=True)
final_df

# Building Logistic Regression Model for prediction

### From the above dataframe we know that average glucose level and body mass index have much larger values than the other features, so the features need to be brought onto a comparable scale. Hence, we will apply StandardScaler from scikit-learn, which standardizes each feature to zero mean and unit variance.

### Standardization: Standardization of datasets is a common requirement for many machine learning estimators implemented in scikit-learn; they might behave badly if the individual features do not more or less look like standard normally distributed data: Gaussian with zero mean and unit variance. In practice we often ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.

### The preprocessing module provides the StandardScaler utility class, which is a quick and easy way to perform the following operation on an array-like dataset.

In [None]:
# Separate the predictors (every column except stroke) from the target column.
X = final_df.drop(columns='stroke')
y = final_df['stroke']

### Splitting the data set into training and testing.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=42)
print('Length of training and testin data is: ')
# The y_train specifier was '%3f' (a field width with the default six decimals);
# it now matches the '%.3f' used for the other three lengths.
print('X_train=%.3f, X_test=%.3f, y_train=%.3f, y_test=%.3f' % (len(X_train),len(X_test),len(y_train),len(y_test)))

### Building the model on training data

In [None]:
from sklearn.linear_model import LogisticRegression as lr
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Chain feature standardisation and logistic regression into one estimator,
# then fit it on the training split.
scaling_step = StandardScaler()
classifier_step = lr()
pipe = make_pipeline(scaling_step, classifier_step)
pipe.fit(X_train, y_train)

### Model accuracy on test data

In [None]:
# Score the fitted pipeline on the held-out test split.
test_accuracy = pipe.score(X_test, y_test)
print("Model Accuracy on Testing Data: ", test_accuracy)

### Predictions on test data

In [None]:
# Predicted class label for every row of the test split.
pipe.predict(X_test)

### Predicted probabilities of whether or not each person in the testing data is at risk of stroke

In [None]:
# Predicted class probabilities for every row of the test split (one column per class).
pipe.predict_proba(X_test)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Compare the test-set predictions with the true labels.
y_pred = pipe.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# plot_confusion_matrix was deprecated and later removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement and draws the same plot
# from a fitted estimator and the test split.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(pipe, X_test, y_test)

### From the above confusion matrix we can see that a total of 1931 samples are classified correctly by the classifier, whereas 113 samples are mislabeled.

# Classification Report

In [None]:
from sklearn.metrics import classification_report

# Human-readable names for the two classes, in label order.
target_names = [
    'Person does not have chances of stroke',
    'Person has chances of stroke',
]
report_text = classification_report(y_test, y_pred, target_names=target_names)
print(report_text)