In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy             as np 
import pandas            as pd 
import matplotlib.pyplot as plt
import seaborn           as sns
import statsmodels.api   as sm
%matplotlib inline

from   sklearn.model_selection   import train_test_split
from   sklearn.linear_model      import LinearRegression
from   sklearn.preprocessing     import MinMaxScaler
from   sklearn.metrics           import confusion_matrix
from   sklearn.metrics           import r2_score,accuracy_score
from   sklearn.feature_selection import RFE
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**1. Importing Data & Analyzing the Data**

In [None]:
# Read the heart-attack dataset from the Kaggle input directory and preview the first rows
csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
heart_attack = pd.read_csv(csv_path)
heart_attack.head()

In [None]:
# Summarise the dataset: column names, non-null counts and dtypes
heart_attack.info()
# Author's reading of the output: there are no null values and all dtypes are correctly assigned

**Description of columns in the DATASET**
* age : Age of the patient
* sex : Sex of the patient
* exng: exercise induced angina (1 = yes; 0 = no)
* ca: number of major vessels (0-3)
* cp : chest pain type
*     Value 1: typical angina
*     Value 2: atypical angina
*     Value 3: non-anginal pain
*     Value 4: asymptomatic
* trtbps : resting blood pressure (in mm Hg)
* chol : cholesterol in mg/dl fetched via BMI sensor
* fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
* restecg : resting electrocardiographic results
*     Value 0: normal
*     Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
*     Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
* thalachh : maximum heart rate achieved
* output : 0 = less chance of heart attack, 1 = more chance of heart attack

**2. Data Cleaning**

In [None]:
# Box plots to look for potential outliers. Each panel is titled with its column
# name so the figure can be read on its own.
# NOTE(review): fbs is described above as a 0/1 flag, so its box plot shows how rare
# the value 1 is rather than genuine outliers.
boxplot_columns = ['age', 'trtbps', 'chol', 'fbs']
fig, axes = plt.subplots(1, len(boxplot_columns), figsize=(14,5))
for ax, column in zip(axes, boxplot_columns):
    ax.boxplot(heart_attack[column])
    ax.set_title(column)

In [None]:
#Function to analyse the columns with outliers 
def outliers(df,x):
    """Return selected quantiles of column ``x`` of ``df``.

    The quartiles plus a dense set of upper-tail quantiles (90% through 100%)
    are returned as a Series indexed by quantile, which makes it easy to see
    where the top of the distribution starts to jump.
    """
    probabilities = [0.25, 0.50, 0.75, 0.90, 0.95, 0.96, 0.97, 0.98, 0.99, 1.00]
    column = df[x]
    return column.quantile(probabilities)

In [None]:
# Print the quantile table for each column suspected of containing outliers
for suspect_column in ('trtbps', 'chol'):
    print(outliers(heart_attack, suspect_column))

In [None]:
# Drop the rows whose chol value is above 400 (printing the per-column row count first)
high_chol = heart_attack['chol'] > 400
print(heart_attack[high_chol].count())
heart_attack.drop(index=heart_attack[high_chol].index, inplace=True)

In [None]:
# Drop the rows whose trtbps value is above 175 (printing the per-column row count first)
high_bp = heart_attack['trtbps'] > 175
print(heart_attack[high_bp].count())
heart_attack.drop(index=heart_attack[high_bp].index, inplace=True)

In [None]:
 #Analysing the suspected columns
fbs_quantiles = outliers(heart_attack, 'fbs')
print(fbs_quantiles)

In [None]:
# Drop every row where fbs equals 1 (printing the per-column row count first).
# NOTE(review): the column description lists fbs as a 0/1 flag, so this removes an
# entire category of patients rather than outliers, and any row that remains has
# fbs != 1 -- confirm this is intended before modelling.
fbs_positive = heart_attack['fbs'] == 1
print(heart_attack[fbs_positive].count())
heart_attack.drop(index=heart_attack[fbs_positive].index, inplace=True)

In [None]:
# Re-draw the box plots after the rows above were dropped. Each panel is titled with
# its column name so the figure can be read on its own.
cleaned_columns = ['age', 'trtbps', 'chol', 'fbs']
fig, axes = plt.subplots(1, len(cleaned_columns), figsize=(14,5))
for ax, column in zip(axes, cleaned_columns):
    ax.boxplot(heart_attack[column])
    ax.set_title(column)

#We can observe from the figure that all the outliers have been treated

In [None]:
# Box plots for the remaining columns. One panel is created per plotted column
# (three, not four) so no empty axis is left in the figure, and each panel is titled.
remaining_columns = ['thalachh', 'oldpeak', 'slp']
fig, axes = plt.subplots(1, len(remaining_columns), figsize=(14,5))
for ax, column in zip(axes, remaining_columns):
    ax.boxplot(heart_attack[column])
    ax.set_title(column)

In [None]:
# Drop the rows whose oldpeak value is above 6 (printing the per-column row count first)
high_oldpeak = heart_attack['oldpeak'] > 6
print(heart_attack[high_oldpeak].count())
heart_attack.drop(index=heart_attack[high_oldpeak].index, inplace=True)

In [None]:
# Pass the column explicitly as `x`: how a lone positional argument is interpreted
# differs between seaborn releases, so the keyword keeps the plot orientation stable.
sns.boxplot(x=heart_attack['oldpeak']);

**We can observe that all the outliers have been eliminated & all our data is clean & ready to use**

**3. Exploratory Data Analysis**

In [None]:
# Annotated heat map of the pairwise correlations between all columns
correlations = heart_attack.corr()
plt.figure(figsize=(20,10))
sns.heatmap(correlations, annot=True)

**The features most strongly correlated with the target (`output`) are:**
1. slp
2. thalachh
3. restecg
4. sex
5. cp
6. exng

In [None]:
# Dataset size and the percentage of rows in each target class (balance check)
class_share_pct = heart_attack['output'].value_counts(normalize=True) * 100
print(heart_attack.shape)
print(class_share_pct)

**4. Model Building**

In [None]:
# Separate the target column from the features, then hold out 33% of the rows for
# testing (fixed random_state so the split is reproducible)
Y = heart_attack['output']
X = heart_attack.drop(columns='output').copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

In [None]:
# Convert the targets into NumPy column vectors of shape (n_samples, 1).
# np.asarray accepts both the original Series and an already-converted array, so the
# cell can be re-run without failing on a missing `.values` attribute.
y_train = np.asarray(y_train).reshape(-1,1)
y_test = np.asarray(y_test).reshape(-1,1)

In [None]:
# Preview the first rows of the training features
X_train.head()

In [None]:
# Min-max scale the listed numeric columns; the scaler is fitted on the training data
scaler = MinMaxScaler()
scaling_list = ['age','trtbps','chol','thalachh','oldpeak']
train_scaled = scaler.fit_transform(X_train[scaling_list])
X_train[scaling_list] = train_scaled

In [None]:
# Apply the scaler fitted on the training data to the same columns of the test set.
# X_test is overwritten in place, so running this cell twice would scale the values twice.
test_scaled = scaler.transform(X_test[scaling_list])
X_test[scaling_list] = test_scaled

In [None]:
# Top-down approach: start from a logistic regression on all features.
# The GLM keyword is `family`; with any other spelling the Binomial family is not
# applied and the model is fitted with GLM's default family instead of as a
# logistic regression.
lr_model1 = sm.GLM(y_train,sm.add_constant(X_train),family=sm.families.Binomial())
lr_model  = lr_model1.fit()
print(lr_model.summary())

In [None]:
# Run recursive feature elimination, keeping the 8 highest-ranked features.
# `n_features_to_select` is passed by keyword because current scikit-learn releases
# do not accept it positionally. RFE fits its own copy of the estimator, so `lm`
# does not need to be fitted beforehand.
lm = LinearRegression()
rfe = RFE(lm, n_features_to_select=8)
rfe = rfe.fit(X_train, y_train)
# (column name, selected?, rank) for every feature; rank 1 means selected
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Column names of the features RFE marked as selected
selected_mask = rfe.support_
col = X_train.columns[selected_mask]
col

In [None]:
# Refit the binomial GLM using only the RFE-selected features (plus an intercept)
X_train_rfe = X_train[col].copy()
design_matrix = sm.add_constant(X_train_rfe)
lr_model2   = sm.GLM(y_train, design_matrix, family=sm.families.Binomial())
lr_model    = lr_model2.fit()
print(lr_model.summary())

In [None]:
#Function to predict using latest model & printing the confusion matrix 
def testing(model_name,test_set,thres_value,test_op):
    y_pred_test = model_name.predict(test_set).values.reshape(-1,1)
    y_train_pred_final = pd.DataFrame({'output':test_op, 'op_Prob':y_pred_test})
    y_train_pred_final['index'] = test_set.index
    return y_train_pred_final.head()

In [None]:
# Add the intercept column to the RFE-selected training features so the frame can be
# passed to lr_model.predict
X_train_rfe = sm.add_constant(X_train_rfe)

In [None]:
#Function to create a table with predicted probabilities, true labels and predicted labels
def prediction(model_name,x_test,y_test,threshold=0.5):
    """Build a prediction table for a fitted model.

    Parameters
    ----------
    model_name : fitted model exposing ``predict(x_test)``.
    x_test : feature matrix handed to ``predict``.
    y_test : true labels; 1-D, or an array shaped ``(n, 1)``.
    threshold : probability above which the predicted label is 1 (default 0.5).

    Returns
    -------
    DataFrame with columns ``op_train_Prob`` (predicted probability),
    ``train_op`` (true label) and ``op_train_pred`` (0/1 predicted label).
    The column names contain "train" regardless of which data set is scored.
    """
    y_pred                        = model_name.predict(x_test)
    y_pred_final                  = pd.DataFrame({'op_train_Prob':y_pred})
    # Flatten column-vector labels so they are stored as an ordinary 1-D column
    labels                        = np.asarray(y_test).ravel() if np.ndim(y_test) > 1 else y_test
    y_pred_final['train_op']      = labels
    y_pred_final['op_train_pred'] = (y_pred_final['op_train_Prob'] > threshold).astype(int)
    return y_pred_final

In [None]:
# Score the training set and keep the result: the validation and cutoff cells below
# read it through the name `y_pred_final`, which is otherwise never defined.
y_pred_final = prediction(lr_model,X_train_rfe,y_train)
y_pred_final.head()

In [None]:
#function to test the logistic Regression model 
def validating_lr(y_real,y_pred):
    """Print the confusion matrix and the metrics derived from it.

    Parameters
    ----------
    y_real : true 0/1 labels.
    y_pred : predicted 0/1 labels.

    The confusion matrix is built as ``confusion_matrix(y_real, y_pred)``, i.e. rows
    are the actual class and columns the predicted class. With the arguments the
    other way round, false positives and false negatives are exchanged and the
    value printed as sensitivity is really the positive predictive value.
    ``labels=[0, 1]`` keeps the matrix 2x2 even if one class is absent.
    Nothing is returned; all results are printed.
    """
    print('Confusion Matrix')
    confusion = confusion_matrix(y_real, y_pred, labels=[0, 1])
    print(confusion)
    print('Accuracy Score')
    print(accuracy_score(y_real, y_pred)*100)
    TP = confusion[1,1] # true positives:  actual 1, predicted 1
    TN = confusion[0,0] # true negatives:  actual 0, predicted 0
    FP = confusion[0,1] # false positives: actual 0, predicted 1
    FN = confusion[1,0] # false negatives: actual 1, predicted 0
    print('Sensitivity')
    print((TP / float(TP+FN))*100)
    print('specificity')
    print((TN / float(TN+FP))*100)
    print('false postive rate - predicting 1 when its 0')
    print((FP/ float(TN+FP))*100)
    print('Positive predictive value')
    print((TP / float(TP+FP))*100)
    print('Negative predictive value')
    print((TN / float(TN+ FN))*100)

In [None]:
# Report the metrics for the labels and 0/1 predictions stored in y_pred_final
validating_lr(
    y_real=y_pred_final['train_op'],
    y_pred=y_pred_final['op_train_pred'],
)

In [None]:
# Create one 0/1 prediction column per probability cutoff (0.0, 0.1, ..., 0.9).
# The cutoff is applied to the predicted probability `op_train_Prob`. Applying it to
# the already-thresholded 0/1 labels in `op_train_pred` would give the same column
# for every cutoff, because 1 > cutoff and 0 > cutoff do not depend on the cutoff here.
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_pred_final[i] = (y_pred_final['op_train_Prob'] > i).astype(int)
# Show a preview rather than the whole frame
y_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity for various probability cutoffs.
# confusion_matrix takes (y_true, y_pred): rows are the actual class, columns the
# predicted class, so
#   cm1[0,0] = TN   cm1[0,1] = FP
#   cm1[1,0] = FN   cm1[1,1] = TP
# labels=[0, 1] keeps the matrix 2x2 even when a cutoff predicts a single class.
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
cutoff_rows = []
for i in num:
    # Threshold the predicted probability directly, so this cell does not rely on
    # per-cutoff columns having been added to y_pred_final beforehand.
    predicted = (y_pred_final['op_train_Prob'] > i).astype(int)
    cm1 = confusion_matrix(y_pred_final['train_op'], predicted, labels=[0, 1])
    accuracy = (cm1[0,0]+cm1[1,1])/cm1.sum()

    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_rows.append([i, accuracy, sensi, speci])
# Build the table in one go; rows are indexed by the cutoff value
cutoff_df = pd.DataFrame(cutoff_rows, columns=['prob','accuracy','sensi','speci'], index=num)
print(cutoff_df)

In [None]:
# Plot each metric against the probability cutoff
metric_columns = ['accuracy','sensi','speci']
cutoff_df.plot.line(x='prob', y=metric_columns)
plt.show()

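#NOTE: if the curves above show no variation across thresholds, that does not mean the model is stable - it means the cutoff columns were derived from the already-thresholded `op_train_pred` labels instead of the `op_train_Prob` probabilities, so every threshold yields the same predictions.
#The author reports about 85% accuracy on the train data (this corresponds to the 0.5 cutoff); now let's check with the test data.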

In [None]:
# Keep the RFE-selected columns of the test set and add the intercept column
X_test_rfe = sm.add_constant(X_test[col])

In [None]:
# Score the held-out test set and bind the result to `y_pred_final`: the validation
# and cutoff cells below read that name, so without the assignment they would
# report metrics for whatever frame the name held before instead of the test set.
y_pred_final = prediction(lr_model,X_test_rfe,y_test)
# Per-cutoff 0/1 columns keyed by the cutoff value, so cells that look up
# y_pred_final[cutoff] keep working on the test frame.
for cutoff in [float(x)/10 for x in range(10)]:
    y_pred_final[cutoff] = (y_pred_final['op_train_Prob'] > cutoff).astype(int)
y_pred_final.head()

In [None]:
# Report the metrics for the labels and 0/1 predictions currently stored in y_pred_final
validating_lr(
    y_real=y_pred_final['train_op'],
    y_pred=y_pred_final['op_train_pred'],
)

In [None]:
# Accuracy, sensitivity and specificity for various probability cutoffs, computed on
# the frame currently held in y_pred_final.
# confusion_matrix takes (y_true, y_pred): rows are the actual class, columns the
# predicted class, so
#   cm1[0,0] = TN   cm1[0,1] = FP
#   cm1[1,0] = FN   cm1[1,1] = TP
# labels=[0, 1] keeps the matrix 2x2 even when a cutoff predicts a single class.
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
cutoff_rows = []
for i in num:
    # Threshold the predicted probability directly, so this cell does not rely on
    # per-cutoff columns being present in y_pred_final.
    predicted = (y_pred_final['op_train_Prob'] > i).astype(int)
    cm1 = confusion_matrix(y_pred_final['train_op'], predicted, labels=[0, 1])
    accuracy = (cm1[0,0]+cm1[1,1])/cm1.sum()

    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_rows.append([i, accuracy, sensi, speci])
# Build the table in one go; rows are indexed by the cutoff value
cutoff_df = pd.DataFrame(cutoff_rows, columns=['prob','accuracy','sensi','speci'], index=num)
print(cutoff_df)

In [None]:
#This is my final model as it is giving 