# IMPORT LIBRARIES 

NOTE: This is my first Kaggle project. I drew inspiration from existing notebooks.

In [None]:

import pandas as pd
import numpy as np
#visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# data preprocessing
from sklearn.preprocessing import StandardScaler
# data splitting
from sklearn.model_selection import train_test_split
# data modeling
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


# IMPORT DATA

In [None]:
# Read the heart-attack CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    '../input/health-care-data-set-on-heart-attack-possibility/heart.csv'
)
# Preview the first ten rows as a quick sanity check of the load.
df.head(10)

In [None]:
# Descriptive statistics of the DataFrame as reported by pandas `describe()`.
df.describe()

In [None]:
# Build a pandas-profiling report for the whole DataFrame. As the last
# expression of the cell, the report object becomes the cell output.
pp.ProfileReport(df)

# DATA WRANGLING

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Draw the boolean null mask as a heatmap. If the DataFrame has no missing
# values, every cell of the mask is False and the map is a single colour.
plt.figure(figsize=(10, 10))
sns.heatmap(data=df.isnull(), cmap='mako', yticklabels=False)


`DataFrame.corr()` computes the pairwise correlation of all columns in the DataFrame. NA values are automatically excluded, and non-numeric columns are ignored.
* Positive values indicate a positive relationship
* Negative values indicate a negative relationship
* 0 indicates no linear relationship
* NA values are automatically excluded, and non-numeric columns are ignored (in pandas ≥ 2.0 this requires passing `numeric_only=True`)
* The correlation of a column with itself is 1


In [None]:
# Pairwise correlation matrix between the DataFrame's columns.
df.corr()  # find the correlation

In [None]:
# Render the correlation matrix as a heatmap on a 10x10-inch figure.
plt.figure(figsize=(10, 10))
sns.heatmap(data=df.corr(), cmap='mako')

In [None]:
# Ages of the rows with target == 1, sorted ascending so the youngest comes first.
df.loc[df['target'] == 1, 'age'].sort_values()
# The first value shown is the lowest age with target == 1 in this dataset
# (29 according to the original note).

# DATA VISUALIZATION


In [None]:
# Histogram of patient ages; `kde=False` suppresses the density curve.
# NOTE(review): `distplot` is deprecated in recent seaborn releases in favour
# of `histplot` — switch once the installed seaborn version is confirmed.
sns.distplot(a=df['age'], kde=False)

In [None]:
# Age distribution for each value of `sex`.
# NOTE(review): the original note read "men i.e 0 are older than women". The
# dataset description appears to encode sex as 1 = male, 0 = female — confirm
# the encoding before stating which group is older.
sns.boxplot(data=df, x='sex', y='age', palette="Set1")


In [None]:
# Number of rows per target class, split by `sex`.
# The original note reads this plot as women being more often in the
# target == 1 class. NOTE(review): presumably sex 0 = female — confirm against
# the dataset description.
sns.countplot(data=df, x='sex', hue='target', palette="Set1")

In [None]:
# Number of rows per target class, split by `fbs`
# (per the original note: fasting blood sugar > 120 mg/dl).
sns.countplot(data=df, x='fbs', hue='target', palette="Set1")

In [None]:
# Number of rows per target class, split by `restecg`
# (per the original note: resting electrocardiographic results, values 0, 1, 2).
sns.countplot(data=df, x='restecg', hue='target', palette="Set1")

In [None]:
# Number of rows per target class, split by `exang`
# (per the original note: exercise-induced angina, 1 = yes, 0 = no).
sns.countplot(data=df, x='exang', hue='target', palette="Set1")

In [None]:
# Number of rows per target class, split by `cp` (chest pain, per the original note).
sns.countplot(data=df, x='cp', hue='target', palette="Set1")


# DATA PREP FOR MODELLING

In [None]:
# Separate the feature columns from the label column.
X = df.drop(columns='target')
y = df['target']
# Hold out 20% of the rows for testing. The fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

In [None]:
# (rows, feature columns) of the training split.
X_train.shape

Check how the target classes are distributed after the split.

In [None]:
# Show which labels occur in the test split.
print(y_test.unique())
# Class counts of BOTH splits side by side, so an imbalance between train and
# test is visible at a glance. `value_counts` is the pandas-native way to count
# labels and avoids an extra import in the middle of the notebook.
pd.DataFrame({
    'train': y_train.value_counts(),
    'test': y_test.value_counts(),
})

StandardScaler standardizes a feature by subtracting the mean and then scaling to unit variance

In [None]:
# Learn the mean and variance from the training features only, then apply that
# same transformation to both splits.
# NOTE: X_train and X_test are overwritten with their scaled versions here, so
# re-running this cell without re-running the split cell would refit the
# scaler on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#MODEL 1 Logistic Regression

In [None]:
# Display name for the first model, used later in the accuracy comparison table.
model1 = 'Logistic Regression'

In [None]:
# Logistic regression classifier created with scikit-learn's default settings.
logmodel=LogisticRegression()


In [None]:
# Train on the scaled training split. `fit` returns the estimator itself, so
# `model` and `logmodel` refer to the same fitted object.
model = logmodel.fit(X_train, y_train)
# Predict the class labels of the held-out test split.
lr_predict = model.predict(X_test)




In [None]:
# Confusion matrix of true vs. predicted test labels for logistic regression.
lr_conf_matrix = confusion_matrix(y_true=y_test, y_pred=lr_predict)
print("confussion matrix", lr_conf_matrix, sep="\n")

In [None]:
# Fraction of test labels predicted correctly by logistic regression. It is
# printed as a percentage.
lr_acc_score = accuracy_score(y_true=y_test, y_pred=lr_predict)
print("Accuracy of Logistic Regression:", lr_acc_score*100, '\n')

In [None]:
# Text report of scikit-learn's classification metrics for logistic regression.
print(classification_report(y_true=y_test, y_pred=lr_predict))

In [None]:
# Side-by-side table of the true test labels and the logistic-regression predictions.
new_df = pd.DataFrame({'Actual': y_test, 'Predicted': lr_predict})

In [None]:
# Display the actual-vs-predicted table for logistic regression.
new_df

In [None]:
#MODEL 2 Random Forest 

In [None]:
# Display name for the second model, used later in the accuracy comparison table.
model2 = 'Random Forest'
# Random forest of 30 entropy-criterion trees. The fixed seed makes training
# reproducible.
classifier = RandomForestClassifier(
    n_estimators=30,
    criterion='entropy',
    random_state=12,
)
# Train on the scaled training split. The fitted estimator is the cell output.
classifier.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the held-out test split.
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix of true vs. predicted test labels for the random forest.
rf_conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("confussion matrix", rf_conf_matrix, sep="\n")


In [None]:
# Fraction of test labels predicted correctly by the random forest. It is
# printed as a percentage.
rf_acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of Random Forest :", rf_acc_score*100, '\n')


In [None]:
# Text report of scikit-learn's classification metrics for the random forest.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Show the first row of the original, unscaled DataFrame.
df.head(1)

In [None]:
# Testing: predict the class of one raw, unscaled sample.
# (presumably the feature values of the row displayed by `df.head(1)` — verify)
#
# The sample is wrapped in a DataFrame that carries the training column names.
# `scaler` was fitted on a DataFrame, so passing a bare list makes scikit-learn
# warn about missing feature names and silently relies on the value order
# matching the training column order.
sample = pd.DataFrame(
    [[63, 1, 3, 145, 233, 1, 0, 150, 0, 2.3, 0, 0, 1]],
    columns=X.columns,
)
# Apply the same scaling as the training data before predicting.
print(classifier.predict(scaler.transform(sample)))

In [None]:
# Side-by-side table of the true test labels and the random-forest predictions.
new_df2 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
new_df2

#  Model Evaluation

In [None]:
# One row per model: its display name and its test accuracy in percent.
model_res = pd.DataFrame({
    'Model': [model1, model2],
    'Accuracy': [lr_acc_score*100, rf_acc_score*100],
})
model_res

In [None]:
# Bar chart comparing the test accuracy of the two models.
colors = ['red','blue']
plt.figure(figsize=(10,5))
plt.title("Accuracy comparison")
# `plt.bar(x, height)` puts the model names on the x-axis and the accuracy on
# the y-axis, so the labels must follow that order (they were swapped before).
plt.xlabel("Algorithms")
# The accuracy values in `model_res` are stored multiplied by 100.
plt.ylabel("Accuracy (%)")
plt.bar(model_res['Model'],model_res['Accuracy'],color = colors)
plt.show()

# Conclusion
- Random Forest classification gives the more accurate result of the two models
- The risk starts at the age of 29 (the youngest patient with target = 1 in this dataset)
- Women have a higher risk of heart attack in this dataset