In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found in it
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Objective**

With the dataset provided for heart analysis, we analyse how various factors/features relate to the possibility of a heart attack, and then build a classifier that predicts whether a person is prone to a heart attack or not. The detailed analysis proceeds through exploratory data analysis (EDA). The classification for prediction can be done using various machine learning algorithms; we choose the best-suited model for heart attack analysis and finally save the model in a pickle (.pkl) file.


**About the dataset**

Columns in the dataset are defined as :-

1. Age : Age of the patient

2. Sex : Sex of the patient (1 = male; 0 = female)

3. exng : exercise induced angina (1 = yes; 0 = no)

4. caa : number of major vessels (0-3) colored by fluoroscopy.

5. cp : chest pain type

     Value 1: typical angina

     Value 2: atypical angina

     Value 3: non-anginal pain

     Value 4: asymptomatic
     
6. trtbps : resting blood pressure (in mm Hg)

7. chol : cholesterol in mg/dl fetched via BMI sensor

8. fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

9. restecg : resting electrocardiographic results

     Value 0: normal

     Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

     Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

10. oldpeak : ST depression induced by exercise relative to rest.

11. slp : the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)

12. thal : 3 = normal; 6 = fixed defect; 7 = reversable defect.

13. thalachh : maximum heart rate achieved

14. output :

    0= less chance of heart attack 

    1= more chance of heart attack
    
 
**Project Data**

The data was taken from a Kaggle dataset. We have been provided with two datasets:

    1. heart.csv : Stored the details of the various parameters required heart analysis
    2. o2Saturation.csv : Stored the details of the oxygen (o2) saturation level


**THE STEPS TO BE FOLLOWED --**

0. Any required installations
1. Data Sourcing and Understanding
2. Data Cleaning
3. Data Visualisation(EDA)
4. Data Preparation
5. Model Building
6. Model Evaluation
7. Recommendations/Conclusion

In [None]:
#REQUIRED INSTALLATION
#pip install pandas-profiling
#pip install plotly

**DATA SOURCING AND UNDERSTANDING**

In [None]:
#to avoid warnings
import warnings
warnings.filterwarnings('ignore')

#os
import os

#linear algebra libraries
import numpy as np, pandas as pd
import pandas_profiling as pp

#libraries for plotting graphs
import matplotlib.pyplot as plt, seaborn as sns, matplotlib
import plotly.express as px
import plotly.figure_factory as ff

#for model building
from sklearn.preprocessing import scale, StandardScaler
from sklearn import linear_model
from sklearn import metrics
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, mean_squared_error as MSE

In [None]:
#heart analysis dataset: feature columns plus the `output` target, read from the Kaggle input directory
heart_df = pd.read_csv("/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv")
#preview the first 4 rows
heart_df.head(4)

In [None]:
#o2 saturation level dataset
#NOTE(review): `saturation` is only previewed here and is not referenced again in the visible notebook
saturation = pd.read_csv("/kaggle/input/heart-attack-analysis-prediction-dataset/o2Saturation.csv")
#preview the first 4 rows
saturation.head(4)

In [None]:
#checking profile summary of heart attack dataset
pp.ProfileReport(heart_df)

In [None]:
#getting all types of info from dataset
heart_df.info()

In [None]:
#checking for null values
heart_df.isnull().sum()

In [None]:
#summary statistics (count, mean, std, min, quartiles, max) for the columns of heart_df
heart_df.describe()

In [None]:
#column groups used by the EDA below
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
continuous_cols = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

#categorical (integer-coded) features
cat_heart = heart_df.loc[:, categorical_cols]

#continuous features
conti_heart = heart_df.loc[:, continuous_cols]

In [None]:
#summary statistics for the continuous columns only
conti_heart.describe()

In [None]:
#frequency table of every categorical column, one variable per column
(sex_count, cp_count, fbs_count, restecg_count,
 slp_count, exng_count, caa_count, thall_count) = (
    heart_df[feature].value_counts()
    for feature in ('sex', 'cp', 'fbs', 'restecg', 'slp', 'exng', 'caa', 'thall')
)

#print each table under the name of the variable that holds it
for label, counts in [
    ('sex_count', sex_count),
    ('cp_count', cp_count),
    ('fbs_count', fbs_count),
    ('restecg_count', restecg_count),
    ('slp_count', slp_count),
    ('exng_count', exng_count),
    ('caa_count', caa_count),
    ('thall_count', thall_count),
]:
    print(label + " :\n", counts)

**DATA VISUALISATION (EDA)**

**Univariate Analysis**

In [None]:
def distplot_check(column, bins=10):
    """Plot the distribution of one numeric column as a density histogram with a KDE curve.

    Parameters
    ----------
    column : pandas.Series or array-like
        Values to plot. When it carries a ``name`` (e.g. a DataFrame column) the
        name is used in the plot title.
    bins : int, default 10
        Number of histogram bins.

    Notes
    -----
    ``sns.distplot`` is deprecated; ``sns.histplot(..., kde=True, stat="density")``
    is the documented replacement for a normalised histogram with a KDE overlay.
    """
    column_name = getattr(column, 'name', None)
    ax = sns.histplot(column, bins=bins, kde=True, stat="density")
    # the old title referred to distplot(), which is no longer what is drawn
    ax.set_title('Distribution of ' + str(column_name) if column_name is not None else 'Distribution')
    plt.show()

In [None]:
#resting blood pressure (in mm Hg)
distplot_check(heart_df['trtbps'])

In [None]:
#cholesterol in mg/dl fetched via BMI sensor
distplot_check(heart_df['chol'])

In [None]:
#maximum heart rate achieved
distplot_check(heart_df['thalachh'])

In [None]:
#ST depression induced by exercise relative to rest
distplot_check(heart_df['oldpeak'])

These features are normally distributed!!

In [None]:
#class balance of the target: number of rows per `output` value
ax = sns.countplot(x='output', data=heart_df, palette=['#85bfdc', '#f64c72'])
ax.set_xticklabels(['less chance of heart attack', 'more chance of heart attack'])
ax.set_title("Target Distribution")
#hide the tick marks on the x axis, keep the labels
ax.tick_params(bottom=False)

In [None]:
#analysis of the age variable : which age has the chance of heart attack
#age histogram coloured by `output`, with a marginal box plot; hovering shows every column of the row
fig = px.histogram(heart_df, x="age", color="output", marginal="box", 
                   hover_data = heart_df.columns, color_discrete_sequence=['#f64c72','#85bfdc'])

#layout: figure title
fig.update_layout(
    title = "Heart attack chance corresponding to age"
)

#plot
fig.show()

#legend key for the `output` values shown in the figure
print("*1 : high chance of heart attack\n*0 : low chance of heart attack")

In [None]:
#row counts per sex, split by the `output` class
ax = sns.countplot(x='sex', hue='output', data=heart_df)
ax.set_xticklabels(['female', 'male'])
ax.set_title("Heart attack chance corresponding to Gender")
#hide the tick marks on the x axis, keep the labels
ax.tick_params(bottom=False)

#legend key for the hue values
print("Output if --\n*1 : high chance of heart attack\n*0 : low chance of heart attack")

**Bivariate Analysis**

In [None]:
def bivariate(column):
    """Compare the distribution of one feature between the two `output` classes.

    Reads the module-level `heart_df`, splits `column` into the rows with
    `output == 1` and the rows with `output == 0`, and shows a plotly distplot
    (histogram bars disabled) with one curve per class.

    Parameters
    ----------
    column : str
        Name of the `heart_df` column to plot.
    """
    #values of the feature for each class: 1 = high chance, 0 = low chance
    high_risk_values = heart_df.loc[heart_df['output'] == 1, column]
    low_risk_values = heart_df.loc[heart_df['output'] == 0, column]

    samples = [high_risk_values, low_risk_values]
    labels = ['more chance of heart attack', 'less chance of heart attack']

    #one density curve per class
    fig = ff.create_distplot(samples, labels, show_hist=False, colors=['#f64c72', '#85bfdc'])

    #title and x-axis label carry the feature name
    fig.update_layout(
        title=f"Heart Attack chance corresponding to {column}",
        xaxis_title=f"Feature Variable : {column}",
    )

    fig.show()

*OUTPUT VS TRTBPS*

In [None]:
print("Analysis of trtbps variable : how blood pressure is related to the chance of heart attack")
bivariate('trtbps')

*OUTPUT VS CHOL*

In [None]:
print("Analysis of chol variable : how cholestoral is related to the chance of heart attack")
bivariate('chol')

*OUTPUT VS THALACHH*

In [None]:
print("Analysis of thalachh variable : how maximum heart rate achieved is related to the chance of heart attack")
bivariate('thalachh')

*OUTPUT VS OLDPEAK*

In [None]:
print("Analysis of oldepeak variable : how ST depression induced by exercise relative to rest is related to the chance of heart attack")
bivariate('oldpeak')

**Heatmap**

*CONTINUOUS VARIABLE*

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6,6))
#pairwise correlation of the continuous features
df_cor = conti_heart.corr()

#mask the upper triangle (diagonal included) so each pair is drawn once
#the builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24
half = np.triu(np.ones_like(df_cor, dtype=bool))

#two-colour gradient shared with the other plots
my_colors = ['#85bfdc','#f64c72']
cmap = matplotlib.colors.LinearSegmentedColormap.from_list('Custom', my_colors)

#the colour scale is clipped to [0, 0.4]; the annotations still show the actual coefficients
heatmap = sns.heatmap(df_cor, 
            square=True, 
            mask=half,
            linewidth=2.5, 
            vmax=0.4, vmin=0, 
            cmap=cmap, 
            cbar=False, 
            ax=ax,annot=True)

heatmap.set(title="Heatmap of continous variables")
#keep the y labels horizontal; take them from the y axis itself rather than from the x axis
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation = 0)
heatmap.spines['top'].set_visible(True)

plt.tight_layout()

*CATEGORICAL VARIABLES*

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(6,6))
#pairwise correlation of the integer-coded categorical features
df_cor = cat_heart.corr()

#mask the upper triangle (diagonal included) so each pair is drawn once
#the builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24
half = np.triu(np.ones_like(df_cor, dtype=bool))

#two-colour gradient shared with the other plots
my_colors = ['#85bfdc','#f64c72']
cmap = matplotlib.colors.LinearSegmentedColormap.from_list('Custom', my_colors)

#the colour scale is clipped to [0, 0.4]; the annotations still show the actual coefficients
heatmap = sns.heatmap(df_cor, 
            square=True, 
            mask=half,
            linewidth=2.5, 
            vmax=0.4, vmin=0, 
            cmap=cmap, 
            cbar=False, 
            ax=ax,annot=True)

heatmap.set(title="Heatmap of categorical variables")
#keep the y labels horizontal; take them from the y axis itself rather than from the x axis
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation = 0)
heatmap.spines['top'].set_visible(True)

plt.tight_layout()

*HEART ANALYSIS VIA HEATMAP*

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,10))
#pairwise correlation of every column of heart_df, target included
df_cor = heart_df.corr()

#mask the upper triangle (diagonal included) so each pair is drawn once
#the builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24
half = np.triu(np.ones_like(df_cor, dtype=bool))

#two-colour gradient shared with the other plots
my_colors = ['#85bfdc','#f64c72']
cmap = matplotlib.colors.LinearSegmentedColormap.from_list('Custom', my_colors)

#the colour scale is clipped to [0, 0.4]; the annotations still show the actual coefficients
heatmap = sns.heatmap(df_cor, 
            square=True, 
            mask=half,
            linewidth=2.5, 
            vmax=0.4, vmin=0, 
            cmap=cmap, 
            cbar=False, 
            ax=ax,annot=True)

heatmap.set(title="Heatmap of all variables in the heart dataset")
#keep the y labels horizontal; take them from the y axis itself rather than from the x axis
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation = 0)
heatmap.spines['top'].set_visible(True)

plt.tight_layout()

In [None]:
#hierarchically clustered view of the correlation matrix computed in the previous cell
#df_cor already IS the correlation matrix; calling .corr() on it again would cluster the
#correlation between correlation profiles instead of the correlations between the variables
cluster_map = sns.clustermap(df_cor, annot=True)

plt.title("CLUSTERMAP of all variables in the HEART ANALYSIS\n\n")
plt.tight_layout()

**Multivariate Analysis**

In [None]:
#thalachh vs chol vs cp vs sex WITH RESPECT TO output
#`output` is cast to str so plotly treats it as a discrete class: with the raw integer column
#px.scatter applies a continuous colour scale and color_discrete_sequence has no effect
fig = px.scatter(heart_df.assign(output=heart_df['output'].astype(str)),
    x='thalachh',
    y='chol',
    color='output',
    facet_col='cp',
    facet_row='sex',
    color_discrete_sequence=['#f64c72','#85bfdc'],
    #fix the class order so '1' always takes the first colour and '0' the second
    category_orders={'output': ['1', '0']},
)

fig.show()

In [None]:
#thalachh vs chol vs restecg vs sex WITH RESPECT TO output
#`output` is cast to str so plotly treats it as a discrete class: with the raw integer column
#px.scatter applies a continuous colour scale and color_discrete_sequence has no effect
fig = px.scatter(heart_df.assign(output=heart_df['output'].astype(str)),
    x='thalachh',
    y='chol',
    color='output',
    facet_col='restecg',
    facet_row='sex',
    color_discrete_sequence=['#f64c72','#85bfdc'],
    #fix the class order so '1' always takes the first colour and '0' the second
    category_orders={'output': ['1', '0']},
)

fig.show()

In [None]:
#thalachh vs chol vs caa vs sex WITH RESPECT TO output
#`output` is cast to str so plotly treats it as a discrete class: with the raw integer column
#px.scatter applies a continuous colour scale and color_discrete_sequence has no effect
fig = px.scatter(heart_df.assign(output=heart_df['output'].astype(str)),
    x='thalachh',
    y='chol',
    color='output',
    facet_col='caa',
    facet_row='sex',
    color_discrete_sequence=['#f64c72','#85bfdc'],
    #fix the class order so '1' always takes the first colour and '0' the second
    category_orders={'output': ['1', '0']},
)

fig.show()

In [None]:
#thalachh vs chol vs exng vs sex WITH RESPECT TO output
#`output` is cast to str so plotly treats it as a discrete class: with the raw integer column
#px.scatter applies a continuous colour scale and color_discrete_sequence has no effect
fig = px.scatter(heart_df.assign(output=heart_df['output'].astype(str)),
    x='thalachh',
    y='chol',
    color='output',
    facet_col='exng',
    facet_row='sex',
    color_discrete_sequence=['#f64c72','#85bfdc'],
    #fix the class order so '1' always takes the first colour and '0' the second
    category_orders={'output': ['1', '0']},
)

fig.show()

In [None]:
#thalachh vs chol vs fbs vs sex WITH RESPECT TO output
#`output` is cast to str so plotly treats it as a discrete class: with the raw integer column
#px.scatter applies a continuous colour scale and color_discrete_sequence has no effect
fig = px.scatter(heart_df.assign(output=heart_df['output'].astype(str)),
    x='thalachh',
    y='chol',
    color='output',
    facet_col='fbs',
    facet_row='sex',
    color_discrete_sequence=['#f64c72','#85bfdc'],
    #fix the class order so '1' always takes the first colour and '0' the second
    category_orders={'output': ['1', '0']},
)

fig.show()

**DATA PREPARATION**

In [None]:
#replace the integer codes of three categorical columns with readable labels
#.replace() is used instead of .map() so the cell is safe to re-run: values that are already
#labels are left untouched, whereas .map() would turn them into NaN on a second execution
#NOTE(review): this code-to-label assignment differs from the column description in the
#"About the dataset" section — confirm which encoding the CSV actually uses
heart_df['cp'] = heart_df['cp'].replace({0:'asymptomatic', 1:'atypical angina', 2:'non-anginal pain' , 3:'typical angina'})
heart_df['restecg'] = heart_df['restecg'].replace({0:'left ventricular hypertrophy', 1:'normal', 2:'ST-T wave abnormality'})
heart_df['thall'] = heart_df['thall'].replace({1:'fixed defect', 2:'normal', 3:'reversable defect', 0:'nothing'})

In [None]:
heart_df.head(3)

In [None]:
#one-hot encode heart_df keeping every level of each encoded column (drop_first=False)
heart_data = pd.get_dummies(heart_df, drop_first=False)
#list the resulting column names
heart_data.columns

One-hot encoding with `drop_first=True` drops the 'thall_fixed defect' column, which is more useful than 'thall_nothing' (an almost empty category). We therefore keep 'thall_fixed defect', concatenate it back onto the encoded data, and drop 'thall_nothing' instead.

In [None]:
#store the 'thall_fixed defect' indicator, which drop_first=True removes below
#it is derived directly from heart_df (not from the previous cell's heart_data, which this
#cell overwrites) so the cell can be re-run on its own without a KeyError
temp_df = pd.get_dummies(heart_df[['thall']])['thall_fixed defect']

#getting dummy variables, dropping the first level of every encoded column
heart_data = pd.get_dummies(heart_df, drop_first=True)
heart_data.head(4)

In [None]:
#checking the thall_nothing value_counts
heart_data['thall_nothing'].value_counts()

This shows that the 'thall_nothing' column is almost entirely zeros, so it carries very little information. We drop it and add back the useful column 'thall_fixed defect'.

In [None]:
#frames to join side by side: the encoded data and the saved indicator column
merge_df = [heart_data, temp_df]

#concatenate column-wise, then discard the near-empty 'thall_nothing' indicator
heart_final = pd.concat(merge_df, axis=1).drop(columns='thall_nothing')

#preview
heart_final.head()

In [None]:
#checking columns
heart_final.columns

In [None]:
#shape of the dataset
heart_final.shape

**MODEL BUILDING**

**Feature Engineering**

In [None]:
#target vector and feature matrix (every column except the target)
y = heart_final['output']
X = heart_final.drop(columns=['output'])

In [None]:
X.columns

In [None]:
print(X.shape)
print(y.shape)

**Train-Test Split**

In [None]:
#80-20 split; random_state=0 makes the split reproducible
#NOTE(review): the split is not stratified on y — consider stratify=y if the class proportions should match in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
#min-max normalise the features to [0, 1] using statistics learned from the TRAINING split only
#the test split must be transformed with the same per-column min/range as the training split:
#scaling it with its own min/max puts the two splits on different scales and uses test-set
#information; DataFrame.min()/max() are used because np.min(DataFrame) reduces to a single
#scalar on pandas >= 2.0
X_train = X_train.astype(float)  # dummy columns may be bool/uint8; unify before arithmetic
X_test = X_test.astype(float)

feature_min = X_train.min()
#a column that is constant in the training split has range 0; use 1 to avoid 0/0 -> NaN
feature_range = (X_train.max() - feature_min).replace(0, 1)

X_train = (X_train - feature_min) / feature_range
X_test = (X_test - feature_min) / feature_range

In [None]:
#shape of train sets
print(X_train.shape)
print(y_train.shape)

#shape of test sets
print(X_test.shape)
print(y_test.shape)

In [None]:
#class counts in the training target
train_label1 = int((y_train == 1).sum())
train_label0 = int((y_train == 0).sum())

print("y_train :")
print("counts of label 1")
print(train_label1)

print("counts of label 0")
print(train_label0)

In [None]:
#class counts in the test target
test_label1 = int((y_test == 1).sum())
test_label0 = int((y_test == 0).sum())

print("y_test :")
print("counts of label 1")
print(test_label1)

print("counts of label 0")
print(test_label0)

**Principal Component Analysis (PCA)**

In [None]:
#standardise the features, then run a full PCA (all components kept)
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('pca', PCA())])

#fit once and project the training data; a separate .fit() before .fit_transform() would
#only repeat the same work
heart_pca = pca_pipeline.fit_transform(X_train)

#fitted PCA step, kept under the name `pca` as before
pca = pca_pipeline.named_steps['pca']

#cumulative explained variance (in %) after each principal component
print(pd.Series(np.round(pca.explained_variance_ratio_.cumsum(), 4)*100))

Here, we can say that,

1. 4 : 99% variance
2. 14 : 95% variance

**Logistic Regression with RFE**

In [None]:
#First Training Model

#Logistic regression model
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
#Feature Selection Using RFE
logreg = LogisticRegression()

#running RFE with 12 variables
#n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects it as a
#positional argument with a TypeError
rfe = RFE(logreg, n_features_to_select=12)

#fit rfe
rfe = rfe.fit(X_train, y_train)

#boolean mask of the selected features
rfe.support_

In [None]:
#listing support an ranking together
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]
col

In [None]:
#columns not supported by rfe
X_train.columns[~rfe.support_]

In [None]:
#model with StatsModels
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
#predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
#getting predicted values
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

*Correlation Matrix*

In [None]:
#figure size
plt.figure(figsize = (20,10))

#heatmap for train data
sns.heatmap(X_train[col].corr(),annot = True)

In [None]:
#figure size
plt.figure(figsize = (20,10))

#heatmap for test data
sns.heatmap(X_test[col].corr(),annot = True)

**Logistic Regression with PCA**

In [None]:
#creating PCA class
pca = PCA(svd_solver='randomized', random_state=42)

#pca fitting on train data
pca.fit(X_train)

In [None]:
#loadings of every principal component (one row of `pc` per component)
pc = pca.components_

#feature names, in the column order of X_train
col_names = list(X_train.columns)

#table with the loadings of the first 10 components, one row per feature
pca_features = pd.DataFrame({'PC' + str(k + 1): pc[k] for k in range(10)})
pca_features['Features'] = col_names

pca_features.head(10)

**Cummulative Explained Variance**

In [None]:
%matplotlib inline
fig = plt.figure(figsize=(10,5))

cum_sum = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cum_sum)

#labels
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

plt.show()

In [None]:
#creating pipeline: standardise -> PCA -> class-weighted logistic regression
#PCA_VARS is the number of principal components kept before the classifier
#NOTE(review): 18 appears to equal the full feature count of X_train, in which case PCA only rotates the data without reducing it — confirm against X_train.shape
PCA_VARS = 18
steps = [('scaler', StandardScaler()),
         ("pca", PCA(n_components=PCA_VARS)),
         ("logistic", LogisticRegression(class_weight='balanced'))
        ]
pipeline = Pipeline(steps)

In [None]:
#train data

#fit pipeline model
pipeline.fit(X_train, y_train)

#checking score on train data
pipeline.score(X_train, y_train)

**Test Data Evaluation**

In [None]:
#heart-attack prediction on test data
y_pred = pipeline.predict(X_test)

#confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm)

TP = cm[1,1] # true positives
TN = cm[0,0] # true negatives
FP = cm[0,1] # false positives
FN = cm[1,0] # false negatives

#sensitivity = TP / (TP + FN): share of actual class-1 cases that were detected
print('\nSensitivity: ', TP / float(TP+FN))

#specificity = TN / (TN + FP): share of actual class-0 cases that were detected
print('Specificity: ',TN / float(TN+FP))

#area under the ROC curve (AUC), computed from the predicted probability of class 1
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
print("\n AUC: ", round(roc_auc_score(y_test, y_pred_prob),2))

**Hyperparameter tuning**

In [None]:
#identifying class imbalance: share of each class in the training target
#normalize=True returns the proportions directly instead of dividing a Series by the
#one-element shape tuple, which only works through implicit broadcasting
y_train.value_counts(normalize=True)

In [None]:
#`pca` is the PCA instance created in an earlier cell; the grid below overrides its n_components

#logistic Regression with class_weight parameter
#solver='liblinear' supports both the 'l1' and 'l2' penalties searched below; the default
#lbfgs solver rejects 'l1', so every 'l1' candidate would fail and score NaN (silently here,
#because warnings are suppressed at the top of the notebook)
logistic = LogisticRegression(class_weight='balanced', solver='liblinear')

#creating pipeline
steps = [("scaler", StandardScaler()), ("pca", pca),("logistic", logistic)]

#pipeline
pca_logistic = Pipeline(steps)

#hyperparameter grid: number of components, inverse regularisation strength, penalty type
params = {'pca__n_components': [14, 4], 'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']}

#create 5 stratified, shuffled folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

#gridsearch object, scored by ROC AUC
model = GridSearchCV(estimator=pca_logistic, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
#fit model
model.fit(X_train, y_train)

In [None]:
#print best hyperparameters
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
# cross validation results
pd.DataFrame(model.cv_results_)

In [None]:
# predict heart-attack risk on test data with the best estimator found by the grid search
y_pred = model.predict(X_test)

# create confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm)

TP = cm[1,1] # true positives
TN = cm[0,0] # true negatives
FP = cm[0,1] # false positives
FN = cm[1,0] # false negatives

#sensitivity = TP / (TP + FN): share of actual class-1 cases that were detected
print('\nSensitivity: ', TP / float(TP+FN))

#specificity = TN / (TN + FP): share of actual class-0 cases that were detected
print('Specificity: ',TN / float(TN+FP))

# area under the ROC curve, computed from the predicted probability of class 1
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("\nAUC: ", round(roc_auc_score(y_test, y_pred_prob),2))

*Hyperparameter tuning didn't help us. The accuracy remained the same.*

Let's go for other algorithm, like Random Forest.

**RANDOM FOREST with PCA**

In [None]:
# random forest classifier; random_state fixes the bootstrap/feature sampling so results are reproducible
forest = RandomForestClassifier(class_weight='balanced', n_jobs = -1, random_state=42)

#hyperparameter grid
#'sqrt' is what 'auto' meant for classifiers; 'auto' was removed in scikit-learn 1.3
params = {"criterion": ['gini', 'entropy'], "max_features": ['sqrt', 0.4]}

#create 5 stratified, shuffled folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

#gridsearch object, scored by ROC AUC
model = GridSearchCV(estimator=forest, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

#fit model
model.fit(X_train, y_train)

In [None]:
#best hyperparameters
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
#heart-attack prediction on test data with the best random forest found by the grid search
y_pred = model.predict(X_test)

#confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm)

TP = cm[1,1] # true positives
TN = cm[0,0] # true negatives
FP = cm[0,1] # false positives
FN = cm[1,0] # false negatives

#sensitivity = TP / (TP + FN)
print('\nSensitivity: ', TP / float(TP+FN))

#specificity = TN / (TN + FP)
print('Specificity: ',TN / float(TN+FP))

#area under the ROC curve (AUC), computed from the predicted probability of class 1
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("\nAUC: ", round(roc_auc_score(y_test, y_pred_prob),2))

**Hyperparameter Tuning**

In [None]:
#specify number of folds for k-fold CV
n_folds = 5

#parameters
parameters = {'max_depth': range(2, 20, 5)}

#random forest classifier
rf_hyper = RandomForestClassifier(class_weight='balanced')


# fit tree on training data
rf_hyper = GridSearchCV(rf_hyper, parameters, 
                    cv=n_folds, return_train_score=True,
                   scoring="accuracy", n_jobs=-1)

#fit rf model
rf_hyper.fit(X_train, y_train)

In [None]:
#GridSearch CV scores
scores = rf_hyper.cv_results_
pd.DataFrame(scores).head()

In [None]:
#plot mean cross-validated accuracy against max_depth
plt.figure()

#mean accuracy on the training folds
plt.plot(scores["param_max_depth"], 
         scores["mean_train_score"], 
         label="training accuracy")

#mean accuracy on the held-out (validation) folds
plt.plot(scores["param_max_depth"], 
         scores["mean_test_score"], 
         label="test accuracy")

#labels
plt.xlabel("max_depth")
plt.ylabel("Accuracy")


plt.legend()
plt.show()

In [None]:
#specify number of folds for k-fold CV
n_folds = 5

#min_samples_leaf parameter
#the candidates must be small relative to the training fold: a split needs at least
#2 * min_samples_leaf rows, so the previous 100-350 range left every tree unable to split
#on an 80% train split of this dataset
parameters = {'min_samples_leaf': range(1, 30, 5)}

#random forest classifier; random_state makes the search reproducible
rf_hyper = RandomForestClassifier(class_weight='balanced', random_state=42)


#grid search over min_samples_leaf, scored by accuracy (train scores kept for the plot)
rf_hyper = GridSearchCV(rf_hyper, parameters, 
                    cv=n_folds, return_train_score=True,
                   scoring="accuracy", n_jobs=-1)

rf_hyper.fit(X_train, y_train)

In [None]:
#GridSearch CV scores
scores = rf_hyper.cv_results_
pd.DataFrame(scores).head()

In [None]:
#plot mean cross-validated accuracy against min_samples_leaf
plt.figure()

#mean accuracy on the training folds
plt.plot(scores["param_min_samples_leaf"], 
         scores["mean_train_score"], 
         label="training accuracy")

#mean accuracy on the held-out (validation) folds
plt.plot(scores["param_min_samples_leaf"], 
         scores["mean_test_score"], 
         label="test accuracy")

#labels
plt.xlabel("min_samples_leaf")
plt.ylabel("Accuracy")


plt.legend()
plt.show()

In [None]:
print(rf_hyper.best_score_)
print(rf_hyper.best_params_)

In [None]:
#final random forest with hand-picked hyperparameters
#random_state is fixed so the reported metrics can be reproduced on a re-run
rf_final = RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                              criterion='gini', max_depth=10,
                                              min_samples_leaf=5,
                                              min_samples_split=5,
                                              n_estimators=100,
                                              random_state=42)

#fit
rf_final.fit(X_train,y_train)

In [None]:
#prediction
y_pred_default = rf_final.predict(X_test)

#classification report
print(classification_report(y_test,y_pred_default))

#confusion matrix
print(confusion_matrix(y_test,y_pred_default))

#accuracy
print('accuracy_score: ',accuracy_score(y_test,y_pred_default))

In [None]:
#Confusion Matrix
confusion_rf_hyper=confusion_matrix(y_test,y_pred_default)
confusion_rf_hyper

In [None]:
#cells of the confusion matrix (rows = actual class, columns = predicted class)
TN = confusion_rf_hyper[0,0] # true negatives
TP = confusion_rf_hyper[1,1] # true positives
FP = confusion_rf_hyper[0,1] # false positives
FN = confusion_rf_hyper[1,0] # false negatives

print('Accuracy Score: ',accuracy_score(y_test,y_pred_default))

#sensitivity = TP / (TP + FN)
print('Sensitivity: ', TP / float(TP+FN))

#specificity = TN / (TN + FP)
print('Specificity: ',TN / float(TN+FP))

SUMMARY OF TWO MODELS :

1. Logistic Regression --

    Sensitivity:  0.7941176470588235
    
    Specificity:  0.8888888888888888
    
    AUC:  0.92

2. Random Forest -- 

    Accuracy Score:  0.819672131147541
    
    Sensitivity:  0.7941176470588235
    
    Specificity:  0.8518518518518519

**MODEL EVALUATION**

**Choosing Best Model -- LOGISTIC REGRESSION**

In [None]:
#number of features consider to split each node
max_features = int(round(np.sqrt(X_train.shape[1])))
print(max_features)

In [None]:
#creating PCA class
pca = PCA(svd_solver='randomized', random_state=42)

#pca fitting on train data
pca.fit(X_train)

In [None]:
#getting pca components
pc = pca.components_

#listing all the columns of X_train together
col_names = list(X_train.columns)

#finding top 10 pca components
pca_features = pd.DataFrame({'PC1':pc[0],'PC2':pc[1],'PC3':pc[2],'PC4':pc[3],'PC5':pc[4],
                             'PC6':pc[5],'PC7':pc[6],'PC8':pc[7],'PC9':pc[8],'PC10':pc[9], 
                            'Features':col_names})

#creating pipeline
PCA_VARS = 18
steps = [('scaler', StandardScaler()),
         ("pca", PCA(n_components=PCA_VARS)),
         ("logistic", LogisticRegression(class_weight='balanced'))
        ]
pipeline = Pipeline(steps)



#train data

#fit pipeline model
pipeline.fit(X_train, y_train)

#checking score on train data
pipeline.score(X_train, y_train)

**Test data evaluation**

In [None]:
#heart-attack prediction on test data
y_pred = pipeline.predict(X_test)

#confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm)

TP = cm[1,1] # true positives
TN = cm[0,0] # true negatives
FP = cm[0,1] # false positives
FN = cm[1,0] # false negatives

#sensitivity = TP / (TP + FN): share of actual class-1 cases that were detected
print('\nSensitivity: ', TP / float(TP+FN))

#specificity = TN / (TN + FP): share of actual class-0 cases that were detected
print('Specificity: ',TN / float(TN+FP))

#area under the ROC curve (AUC), computed from the predicted probability of class 1
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
print("\n AUC: ", round(roc_auc_score(y_test, y_pred_prob),2))

**Hyperparameter Tuning**

In [None]:
#identifying class imbalance: share of each class in the training target
#normalize=True returns the proportions directly instead of dividing a Series by the
#one-element shape tuple, which only works through implicit broadcasting
y_train.value_counts(normalize=True)

In [None]:
#`pca` is the PCA instance created in an earlier cell; the grid below overrides its n_components

#logistic Regression with class_weight parameter
#solver='liblinear' supports both the 'l1' and 'l2' penalties searched below; the default
#lbfgs solver rejects 'l1', so every 'l1' candidate would fail and score NaN (silently here,
#because warnings are suppressed at the top of the notebook)
logistic = LogisticRegression(class_weight='balanced', solver='liblinear')

#creating pipeline
steps = [("scaler", StandardScaler()), ("pca", pca),("logistic", logistic)]

#pipeline
pca_logistic = Pipeline(steps)

#hyperparameter grid: number of components, inverse regularisation strength, penalty type
params = {'pca__n_components': [14, 4], 'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']}

#create 5 stratified, shuffled folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

#gridsearch object, scored by ROC AUC
model = GridSearchCV(estimator=pca_logistic, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
#fit model
model.fit(X_train, y_train)

In [None]:
#print best hyperparameters
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
# predict on test data with the best estimator found by the grid search
y_pred = model.predict(X_test)

# create confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(cm)

TP = cm[1,1] # true positives
TN = cm[0,0] # true negatives
FP = cm[0,1] # false positives
FN = cm[1,0] # false negatives

#sensitivity = TP / (TP + FN): share of actual class-1 cases that were detected
print('\nSensitivity: ', TP / float(TP+FN))

#specificity = TN / (TN + FP): share of actual class-0 cases that were detected
print('Specificity: ',TN / float(TN+FP))

# area under the ROC curve, computed from the predicted probability of class 1
y_pred_prob = model.predict_proba(X_test)[:, 1]
print("\nAUC: ", round(roc_auc_score(y_test, y_pred_prob),2))

**ROC Curve**

In [None]:
#a ROC curve is traced by sweeping a threshold over continuous scores, so it is built from
#the predicted probability of class 1; the hard 0/1 labels in y_pred give only a single
#operating point joined to the corners by straight lines
roc_scores = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, roc_scores)

plt.plot(fpr, tpr, label='AUC = %0.2f' % roc_auc_score(y_test, roc_scores))
#diagonal = performance of a random classifier, for reference
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for Heart disease classifier')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.legend(loc='lower right')
plt.grid(True)

**Heatmap for predictions**

In [None]:
#heatmap of prediction
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True)

**Saving the model**

In [None]:
#library to save model
import pickle

#serialise the fitted grid-search model to a bytes object (used by the next cell)
saved_model = pickle.dumps(model)

#also persist it as a .pkl file in the current working directory, as stated in the objective;
#pickle.dumps alone keeps the model only in memory
MODEL_PATH = "heart_attack_model.pkl"
with open(MODEL_PATH, "wb") as model_file:
    model_file.write(saved_model)

In [None]:
# Load the pickled model
logis_from_pickle = pickle.loads(saved_model)
  
# Use the loaded pickled model to make predictions
logis_from_pickle.predict(X_test)

**RECOMMENDATIONS**

For model building, we can use logistic regression for predicting unseen data, as this model achieves an AUC of 0.92 (this figure is the area under the ROC curve, not the accuracy), which is the best among the models evaluated.

**CONCLUSION**

We can conclude our code from the following insights ---

1. Numeric Variables - No outliers were found!

2. The count of the target variable shows that the dataset contains more records with a higher chance of heart attack.

3. People aged 40-60 years have a high chance of heart attack.

4. Male gender has more chance of heart attack compared to female ones.

5. High Blood Pressure, High Cholesterol and High Heart Rate lead to a high chance of heart attack.

6. Highly Correlated factors in this dataset are :

        ** Age and trtbps (blood pressure rate)

        ** Age and chol (cholesterol level)

**--------------- by Sakshi Maharana -----------------------**

**PS. Reviews, Comments, Discussion and Feedbacks are welcomed. This code was to focus on the EDA and PCA model building. Hope you liked it!! Upvotes for my first kaggle kernel code.**

**THANK YOU!!**