In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing libraries

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score

In [None]:
# Load the heart-attack dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv("/kaggle/input/health-care-data-set-on-heart-attack-possibility/heart.csv")
df.head()

### Step 1:Inspecting dataframe

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
#listing categorical variables as per the data definition and converting them to category
catvar=['sex','cp','fbs','restecg','exang','ca','thal', 'slope']
df[catvar]=df[catvar].astype('category')

In [None]:
df.dtypes

##### From above we see that there are no missing values and all the columns have correct datatype

### Step2:Data Preparation

#### 2.1: Outlier Treatment

In [None]:
df.describe()

##### Columns suspected to have outliers are trestbps, chol, thalach and oldpeak

In [None]:
# Draw one box plot per numeric column on a 2x2 grid to inspect for outliers.
# num_cols is reused by the outlier-removal cell further down.
num_cols = ['trestbps', 'chol', 'thalach', 'oldpeak']
n_rows = 2  # grid rows
n_cols = 2  # grid columns

fig = plt.figure(figsize=(20, 30))
# enumerate(start=1) supplies the 1-based subplot position, replacing the manual counter.
for position, column in enumerate(num_cols, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.title('{} (box)'.format(column))
    plt.xlabel(column)
    plt.boxplot(x=df[column])
plt.show()

In [None]:
#function to remove outliers using IQR
def subset_by_iqr(df, column, whisker_width=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR whiskers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used for the outlier test.
    whisker_width : float, default 1.5
        Multiple of the inter-quartile range added above Q3 and subtracted
        below Q1 to form the (inclusive) acceptance interval.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` with ``Q1 - w*IQR <= value <= Q3 + w*IQR``.
    """
    # First and third quartiles, and the spread between them.
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - whisker_width * iqr
    upper_bound = q3 + whisker_width * iqr

    # Named `within_whiskers` rather than `filter` so the builtin is not shadowed.
    within_whiskers = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    return df.loc[within_whiskers]


In [None]:
# Apply the IQR filter to each numeric column in turn.
# df is reassigned on every pass, so each column's quartiles are computed on the
# rows that survived the previous columns' filters (order of num_cols matters).
# NOTE(review): re-running this cell filters the already-filtered frame again.
for i in num_cols:
    df = subset_by_iqr(df, i)

In [None]:
df.shape

#### 2.2 :Dummy Variable Creation for categorical variables

In [None]:
df.head()

Let us perform one-hot encoding for these variables with multiple levels

In [None]:
# One-hot encode the categorical columns, dropping the first level of each
# so the dummies are not perfectly collinear with the intercept.
# dtype=int keeps the dummies numeric: pandas >= 2.0 returns bool columns by
# default, and a frame mixing bool and float columns becomes an object array
# when handed to statsmodels / variance_inflation_factor.
df1 = pd.get_dummies(
    df[['sex', 'cp', 'fbs', 'restecg', 'exang', 'ca', 'thal', 'slope']],
    drop_first=True,
    dtype=int,
)
# Append the dummy columns to the original frame (the source columns are dropped in the next cell).
df = pd.concat([df, df1], axis=1)
df.columns

In [None]:
# Drop the original categorical columns now that their dummy columns exist.
# `columns=` is used because the positional axis argument (`drop(labels, 1)`)
# was removed in pandas 2.0.
df = df.drop(columns=['sex', 'cp', 'fbs', 'restecg', 'exang', 'ca', 'thal', 'slope'])

In [None]:
df.shape
df.columns

#### 2.3: Test-Train Split

In [None]:
# Feature matrix: every column except the target.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
X = df.drop(columns='target')

In [None]:
X.head()

In [None]:
#putting target variable to y
y=df['target']
y.head()

In [None]:
# Split into 70% train / 30% test; random_state is fixed so the split is reproducible.
X_train,X_test, y_train,y_test=train_test_split(X,y, train_size=0.7,test_size=0.3, random_state=100)

#### 2.4 :Feature Scaling

In [None]:
scaler=StandardScaler()

In [None]:
# Continuous columns to standardise; the dummy columns are left untouched.
numeric_col=['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# Fit the scaler on the training rows only; the same fitted scaler is applied
# to X_test later with scaler.transform, so no test information leaks into it.
X_train[numeric_col]=scaler.fit_transform(X_train[numeric_col])
X_train.head()

In [None]:
## Percentage of rows whose target is 1
positive_count = df['target'].sum()
row_count = len(df['target'].index)
risk = (positive_count / row_count) * 100
risk

About 56% of the patients in the dataset are labelled as being at risk of a heart attack, so the two classes are fairly balanced.

##### 2.5: Looking at correlations

In [None]:
#lets see the correlation matrix for the entire dataset
plt.figure(figsize=(20,10))
sns.heatmap(df.corr(),annot=True)
plt.show()

##### Dropping highly correlated dummy variables

In [None]:
# Remove the dummy columns judged highly correlated in the heatmap above,
# from both splits so train and test keep the same feature set.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
correlated_dummies = ['exang_1', 'thal_2', 'slope_2']
X_train = X_train.drop(columns=correlated_dummies)
X_test = X_test.drop(columns=correlated_dummies)

##### Checking the correlation matrix again

After dropping the highly correlated dummy variables let us again check the correlation matrix

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(X_train.corr(),annot=True)
plt.show()

### Step3 :Model Building

In [None]:
# Baseline logistic regression on all remaining features:
# a GLM with a Binomial family, with an intercept column added via add_constant.
logm=sm.GLM(y_train,(sm.add_constant(X_train)), family=sm.families.Binomial())
# Fit and show the coefficient table (the fitted result itself is not kept).
logm.fit().summary()

### Step4: Feature Selection Using RFE

In [None]:
logreg=LogisticRegression()

In [None]:
# Recursive feature elimination: keep the 13 highest-ranked features.
# n_features_to_select must be passed by keyword; scikit-learn >= 1.0 rejects
# it as a positional argument.
rfe = RFE(logreg, n_features_to_select=13)
rfe = rfe.fit(X_train, y_train)

In [None]:
rfe.support_

In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
col=X_train.columns[rfe.support_]
col

##### Assessing the model with statsmodels

In [None]:
X_train_sm=sm.add_constant(X_train[col])

In [None]:
# Refit the logistic regression on the RFE-selected features only.
# The keyword must be spelled `family`: a misspelled keyword is never applied,
# which leaves the GLM on its default (Gaussian) family instead of Binomial.
logm1 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
res = logm1.fit()
res.summary()

##### Checking VIF

In [None]:
# Variance inflation factor of every currently selected feature, highest first.
selected = X_train[col]
vif_values = [
    variance_inflation_factor(selected.values, position)
    for position in range(selected.shape[1])
]
vif = pd.DataFrame({'Features': selected.columns, 'VIF': vif_values})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

All the variables have a fairly low VIF and thus do not indicate multicollinearity. However, the variable ca_4 has a high p-value and hence can be dropped.

In [None]:
# Remove ca_4 from the selected feature names.
# `col` is a pandas Index, whose drop() signature is (labels, errors); there is
# no axis, so the stray positional 1 was being passed as `errors`.
col = col.drop('ca_4')
col

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

The variable thal_1 has a high p-value and hence can be dropped.

In [None]:
# Remove thal_1 from the selected feature names.
# `col` is a pandas Index, whose drop() signature is (labels, errors); there is
# no axis, so the stray positional 1 was being passed as `errors`.
col = col.drop('thal_1')
col

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm3.fit()
res.summary()

The variable ca_3 has a high p-value and hence can be dropped.

In [None]:
# Remove ca_3 from the selected feature names.
# `col` is a pandas Index, whose drop() signature is (labels, errors); there is
# no axis, so the stray positional 1 was being passed as `errors`.
col = col.drop('ca_3')
col

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm4 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm4.fit()
res.summary()

The variable restecg_1 has a high p-value and hence can be dropped.

In [None]:
# Remove restecg_1 from the selected feature names.
# `col` is a pandas Index, whose drop() signature is (labels, errors); there is
# no axis, so the stray positional 1 was being passed as `errors`.
col = col.drop('restecg_1')
col

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm5 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm5.fit()
res.summary()

We see that the p-values of all the variables are now low. Let us check the VIF again.

In [None]:
# Recompute the variance inflation factors for the final feature set, highest first.
final_features = X_train[col]
design_matrix = final_features.values
vif = pd.DataFrame({
    'Features': final_features.columns,
    'VIF': [
        variance_inflation_factor(design_matrix, position)
        for position in range(final_features.shape[1])
    ],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

Since the p-values of all the variables are less than 0.05 and also the VIF are pretty low we can consider this model as final model and free from any multicollinearity

##### Predicting on training set

In [None]:
y_train_pred=res.predict(X_train_sm).values.reshape(-1)
y_train_pred[:10]

##### Creating a dataframe with the actual risk labels and the predicted probabilities

In [None]:
y_train_pred_final = pd.DataFrame({'Risk':y_train.values, 'Risk_Prob':y_train_pred})

y_train_pred_final.head()

##### Creating new column 'predicted' with 1 if Risk_Prob > 0.5 else 0

In [None]:
# Label a row as at-risk (1) when its predicted probability exceeds 0.5, else 0.
above_cutoff = y_train_pred_final.Risk_Prob > 0.5
y_train_pred_final['predicted'] = above_cutoff.astype(int)

# Preview the labelled frame
y_train_pred_final.head()

### Step5:Model Evaluation

##### Confusion Matrix

In [None]:
confusion=metrics.confusion_matrix(y_train_pred_final.Risk,y_train_pred_final.predicted)
print(confusion)

##### Overall Accuracy of the model

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Risk,y_train_pred_final.predicted))

Since the problem at hand is to predict the probability of heart attack, we would like to maximise the sensitivity. We would not like to classify a person having high risk as one having low risk

In [None]:
# Confusion matrix rows are the actual class and columns the predicted class:
# [[TN, FP],
#  [FN, TP]]
(TN, FP), (FN, TP) = confusion

##### Sensitivity

In [None]:
TP/float(TP+FN)

##### Specificity

In [None]:
TN/float(TN+FP)

##### Clearly sensitivity is higher than specificity with our current model and threshold for prediction defined at 0.5

### Step6:Plotting the ROC Curve

An ROC curve demonstrates several things:

- It shows the tradeoff between sensitivity and specificity (any increase in sensitivity will be accompanied by a decrease in specificity).
- The closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test.
- The closer the curve comes to the 45-degree diagonal of the ROC space, the less accurate the test.

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve for the given labels and predicted probabilities.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Predicted scores/probabilities for the positive class.

    Returns
    -------
    None
        The figure is shown as a side effect; the AUC appears in the legend.
    """
    # Keep every threshold so the curve is drawn at full resolution.
    roc_points = metrics.roc_curve(actual, probs, drop_intermediate=False)
    false_pos_rate, true_pos_rate = roc_points[0], roc_points[1]
    auc_score = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(5, 5))
    legend_label = 'ROC curve (area = %0.2f)' % auc_score
    plt.plot(false_pos_rate, true_pos_rate, label=legend_label)
    # Dashed diagonal as the reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Risk, y_train_pred_final.Risk_Prob, drop_intermediate = False )

In [None]:
draw_roc(y_train_pred_final.Risk, y_train_pred_final.Risk_Prob)

The area under the ROC curve (AUC) is 0.94.

### Step7:Checking for Optimal Cut-off Point

Optimal cutoff probability is that prob where we get balanced sensitivity and specificity

In [None]:
# Add one 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9;
# each column is keyed by its (float) cutoff value.
numbers = [x / 10 for x in range(10)]
for cutoff in numbers:
    exceeds_cutoff = y_train_pred_final.Risk_Prob > cutoff
    y_train_pred_final[cutoff] = exceeds_cutoff.astype(int)
y_train_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity of the training predictions at each
# probability cutoff. The result is indexed by cutoff and also carries it in
# the 'prob' column, which the plotting cell uses as its x axis.
# NOTE(review): this import is unused here (metrics.confusion_matrix is called
# below); it belongs in the imports cell at the top of the notebook.
from sklearn.metrics import confusion_matrix

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
cutoff_rows = []
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Risk, y_train_pred_final[i])
    # Rows are the actual class, columns the predicted class: [[TN, FP], [FN, TP]].
    tn, fp = cm1[0]
    fn, tp = cm1[1]
    cutoff_rows.append({
        'prob': i,
        'accuracy': (tn + tp) / (tn + fp + fn + tp),
        'sensi': tp / (fn + tp),
        'speci': tn / (tn + fp),
    })

# Build the frame once from the collected rows instead of growing it row by row.
cutoff_df = pd.DataFrame(cutoff_rows, columns=['prob','accuracy','sensi','speci'], index=num)
cutoff_df

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

##### From the above we see that the cutoff of 0.5, which we had chosen earlier, is the optimum cutoff point

### Step8:Precision and Recall

##### Precision

In [None]:
precision_score(y_train_pred_final.Risk, y_train_pred_final.predicted)

##### Recall

In [None]:
recall_score(y_train_pred_final.Risk, y_train_pred_final.predicted)

### Step9:Making Predictions on the Test Set

In [None]:
# Standardise the test set with the scaler already fitted on the training data
# (transform only, no refit).
X_test[numeric_col]=scaler.transform(X_test[numeric_col])
# Keep only the features retained in `col` for the final model.
# NOTE(review): X_test is overwritten here, so this cell cannot be re-run on its
# own if any numeric_col entry is not in `col`.
X_test = X_test[col]
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test)

#### Making predictions on the test set

In [None]:
y_test_pred = res.predict(X_test_sm)

In [None]:
y_test_pred[:10]

In [None]:
# Wrap the predicted test probabilities in a DataFrame so they can be placed
# side by side with the actual labels; its single column is later renamed
# from 0 to 'Risk_Prob'.
y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

In [None]:
# Removing index for both dataframes to append them side by side 
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_pred_final.head()

In [None]:
# Renaming the column 
y_pred_final= y_pred_final.rename(columns={ 0 : 'Risk_Prob','target' : 'Risk'})

In [None]:
# Let's see the head of y_pred_final
y_pred_final.head()

In [None]:
# Apply the 0.5 cutoff to the test probabilities: 1 if above it, else 0.
test_above_cutoff = y_pred_final.Risk_Prob > 0.5
y_pred_final['final_predicted'] = test_above_cutoff.astype(int)
y_pred_final.head()

##### Accuracy

In [None]:
metrics.accuracy_score(y_pred_final.Risk, y_pred_final.final_predicted)

##### Confusion Matrix on Test Set

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final.Risk, y_pred_final.final_predicted )
confusion2

In [None]:
# Unpack the test-set confusion matrix by row:
# first row is actual 0 (true negatives, false positives),
# second row is actual 1 (false negatives, true positives).
TN, FP = confusion2[0]
FN, TP = confusion2[1]

##### Sensitivity

In [None]:
TP/float(TP+FN)

##### Specificity

In [None]:
TN/float(TN+FP)

### Conclusion

In the training set, sensitivity is 89% and specificity is 85%. <br>
In the test set, sensitivity is 84% and specificity is 81%.

The features that affect the probability of a heart attack are <br>
oldpeak,sex_1,cp_1,cp_2,cp_3,ca_1,ca_2,thal_3,slope_1