### **About this dataset**

Age : Age of the patient

Sex : Sex of the patient

exng: exercise induced angina (1 = yes; 0 = no)

caa: number of major vessels (0-3)

cp : chest pain type

Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic
trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

restecg : resting electrocardiographic results

Value 0: normal

Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
thalachh : maximum heart rate achieved

output : 0 = less chance of heart attack, 1 = more chance of heart attack

In [None]:
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
heart=pd.read_csv('../input/heart-attack-analysis-prediction-dataset/heart.csv')

In [None]:
heart.head()

In [None]:
heart.shape

In [None]:
heart.info()

In [None]:
heart.describe()

In [None]:
heart.describe(percentiles=[0.25,0.50,0.75,0.90,0.99]).T

## Model Building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
y=heart.pop('output')

In [None]:
X=heart

In [None]:
# train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y, test_size=0.3, random_state=100)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler= MinMaxScaler()

In [None]:
col= ['age','cp','trtbps','chol','restecg','thalachh','oldpeak','slp','caa','thall']

In [None]:
# Fit the MinMaxScaler on the training rows only and overwrite the listed
# columns with their scaled values; the same fitted scaler is applied to
# X_test later via scaler.transform.
X_train[col]= scaler.fit_transform(X_train[col])

In [None]:
X_train.head()

In [None]:
import statsmodels.api as sm

In [None]:
lr01= sm.GLM(y_train, sm.add_constant(X_train), family= sm.families.Binomial())

In [None]:
print(lr01.fit().summary())

In [None]:
# dropping 'age' because it is insignificant
# (`columns=` instead of a positional axis: pandas >= 2.0 only accepts
# `axis` as a keyword argument, so drop('age', 1, ...) raises TypeError)
X_train.drop(columns='age', inplace=True)

In [None]:
lr02= sm.GLM(y_train, sm.add_constant(X_train), family= sm.families.Binomial())

In [None]:
print(lr02.fit().summary())

In [None]:
# dropping 'restecg' before refitting
# (`columns=` instead of a positional axis: pandas >= 2.0 only accepts
# `axis` as a keyword argument)
X_train.drop(columns='restecg', inplace=True)

In [None]:
lr03= sm.GLM(y_train, sm.add_constant(X_train), family= sm.families.Binomial())

In [None]:
print(lr03.fit().summary())

In [None]:
# dropping 'fbs' before refitting
# (`columns=` instead of a positional axis: pandas >= 2.0 only accepts
# `axis` as a keyword argument)
X_train.drop(columns='fbs', inplace=True)

In [None]:
lr04= sm.GLM(y_train, sm.add_constant(X_train), family= sm.families.Binomial())

In [None]:
print(lr04.fit().summary())

In [None]:
# dropping 'exng' before refitting
# (`columns=` instead of a positional axis: pandas >= 2.0 only accepts
# `axis` as a keyword argument)
X_train.drop(columns='exng', inplace=True)

In [None]:
lr05= sm.GLM(y_train, sm.add_constant(X_train), family= sm.families.Binomial())

In [None]:
print(lr05.fit().summary())

In [None]:
# dropping 'trtbps' before refitting
# (`columns=` instead of a positional axis: pandas >= 2.0 only accepts
# `axis` as a keyword argument)
X_train.drop(columns='trtbps', inplace=True)

In [None]:
lr06= sm.GLM(y_train, sm.add_constant(X_train), family= sm.families.Binomial())

In [None]:
print(lr06.fit().summary())

In [None]:
# dropping 'chol' before refitting
# (`columns=` instead of a positional axis: pandas >= 2.0 only accepts
# `axis` as a keyword argument)
X_train.drop(columns='chol', inplace=True)

In [None]:
lr07= sm.GLM(y_train, sm.add_constant(X_train), family= sm.families.Binomial())

In [None]:
print(lr07.fit().summary())

In [None]:
# dropping 'slp' before refitting
# (`columns=` instead of a positional axis: pandas >= 2.0 only accepts
# `axis` as a keyword argument)
X_train.drop(columns='slp', inplace=True)

In [None]:
lr08= sm.GLM(y_train, sm.add_constant(X_train), family= sm.families.Binomial())

In [None]:
print(lr08.fit().summary())

In [None]:
# checking VIFs
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor of every predictor still in X_train,
# rounded to two decimals and listed from largest to smallest.
# NOTE(review): the VIFs are computed on X_train as-is, without an added
# constant column — confirm this is intended.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [
        variance_inflation_factor(X_train.values, position)
        for position in range(X_train.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values('VIF', ascending=False)
vif

In [None]:
# dropping 'thall' because VIF>10
# (`columns=` instead of a positional axis: pandas >= 2.0 only accepts
# `axis` as a keyword argument)
X_train.drop(columns='thall', inplace=True)

In [None]:
lr09= sm.GLM(y_train, sm.add_constant(X_train), family= sm.families.Binomial())

In [None]:
print(lr09.fit().summary())

In [None]:
# Recompute the variance inflation factors for the predictors that remain
# after the last drop, rounded to two decimals, largest first.
# NOTE(review): the VIFs are computed on X_train as-is, without an added
# constant column — confirm this is intended.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [
        variance_inflation_factor(X_train.values, position)
        for position in range(X_train.shape[1])
    ],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values('VIF', ascending=False)
vif

In [None]:
# lr09 is our final model: fit it and rebind the name to the fitted results,
# which is what the later lr09.predict(...) calls operate on.
# NOTE(review): after this cell `lr09` no longer refers to the unfitted GLM,
# so re-running this cell on its own will presumably fail — confirm.
lr09= lr09.fit()

In [None]:
y_train_pred= lr09.predict(sm.add_constant(X_train))

In [None]:
y_train_pred= y_train_pred.values.reshape(-1) 

In [None]:
y_train_pred_final = pd.DataFrame({'output':y_train.values, 'output_Prob':y_train_pred})
y_train_pred_final['ID'] = y_train.index
y_train_pred_final.head()

In [None]:
# For every cutoff 0.0, 0.1, ..., 0.9 add a column (keyed by the cutoff value)
# holding 1 where the predicted probability is strictly above the cutoff, else 0.
numbers = [step / 10 for step in range(10)]
for i in numbers:
    y_train_pred_final[i] = (y_train_pred_final.output_Prob > i).astype('int64')
y_train_pred_final.head()

In [None]:
from sklearn import metrics

In [None]:
from sklearn.metrics import confusion_matrix

# Accuracy, sensitivity and specificity on the training set at each cutoff.
# Confusion-matrix layout used below:
#   [0,0] = true negatives    [0,1] = false positives
#   [1,0] = false negatives   [1,1] = true positives
cutoff_df = pd.DataFrame(columns=['prob', 'accuracy', 'sensi', 'speci'])

num = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.output, y_train_pred_final[i])
    tn, fp = cm1[0, 0], cm1[0, 1]
    fn, tp = cm1[1, 0], cm1[1, 1]

    accuracy = (tn + tp) / cm1.sum()
    sensi = tp / (fn + tp)
    speci = tn / (tn + fp)
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.output_Prob.map( lambda x: 1 if x > 0.6 else 0)

y_train_pred_final.head()

In [None]:
metrics.accuracy_score(y_train_pred_final.output, y_train_pred_final.final_predicted)

In [None]:
# confusion matrix
confusion1 = metrics.confusion_matrix(y_train_pred_final.output, y_train_pred_final.final_predicted )
confusion1

In [None]:
TP = confusion1[1,1] # true positive 
TN = confusion1[0,0] # true negatives
FP = confusion1[0,1] # false positives
FN = confusion1[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

### Test Dataset

In [None]:
X_test.head()

In [None]:
col= ['age','cp','trtbps','chol','restecg','thalachh','oldpeak','slp','caa','thall']

In [None]:
# scaling the col
X_test[col]= scaler.transform(X_test[col])

In [None]:
X_test.head()

In [None]:
X_train.columns

In [None]:
# retaining only the columns which are present in X_train, in the same order,
# so the test design matrix matches the one lr09 was fitted on.
# Selecting by X_train.columns avoids a hand-maintained drop list and the
# positional `axis` argument that pandas >= 2.0 rejects.
X_test = X_test[X_train.columns]

In [None]:
# Prediction on test dataset using lr09
y_test_pred=lr09.predict(sm.add_constant(X_test))

In [None]:
# converting it to DataFrame
y_test_pred= pd.DataFrame(y_test_pred)
y_test= pd.DataFrame(y_test)
y_test['ID']=y_test.index

In [None]:
y_test_pred.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [None]:
# Put the true labels (with ID) and the predicted probabilities side by side.
# `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally.
y_pred_final = pd.concat([y_test, y_test_pred], axis=1)

In [None]:
y_pred_final.head()

In [None]:
y_pred_final=y_pred_final.rename(columns={0:'output_prob'})

In [None]:
y_pred_final.head()

In [None]:
y_pred_final['final_pred']=y_pred_final.output_prob.map(lambda x: 1 if x>0.6 else 0)

In [None]:
y_pred_final.head()

In [None]:
# calculating the accuracy on test dataset
metrics.accuracy_score(y_pred_final.output, y_pred_final.final_pred)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final.output,y_pred_final.final_pred)

In [None]:
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)