In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
df = pd.read_csv("../input/heart-attack-analysis-prediction-dataset/heart.csv")

In [None]:
df.head(2)

In [None]:
len(df)

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df['output'].unique()

In [None]:
len(df[df['sex']==1])

In [None]:
len(df[df['sex']==0])

# Judging by these counts, 0 presumably encodes female and 1 encodes male (an assumption; the dataset documentation should confirm it).

In [None]:
df.corr()['output'].sort_values()[:-1]

In [None]:
df.describe()

In [None]:
# Class balance of the target, computed from the data rather than typed in by hand.
# NOTE(review): the previous literal 96/303 could not be traced to a count of the
# `output` column anywhere in this notebook - confirm which ratio was intended.
df['output'].value_counts(normalize=True)

# The dataset's target variable distribution is around 70%-30% (about 70% of all values being 1). TODO: confirm this against `df['output'].value_counts(normalize=True)`.

# EDA

In [None]:
sns.histplot(data=df,x='output',kde=True)

In [None]:
sns.countplot(data=df , x = 'cp', hue ='output')

# TEST TRAIN SPLIT

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = df.drop('output',axis=1)
y = df['output']

In [None]:
X.head()

# creating dummies

In [None]:
df['thall'].nunique()

In [None]:
X_dummies = pd.get_dummies(data=X , columns=['sex','cp','fbs','restecg','exng','slp','caa','thall'] , drop_first=True)

In [None]:
len(X_dummies.columns)

In [None]:
X_dummies.head(2)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
sc = StandardScaler()

In [None]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# LET'S APPLY CLASSIFICATION MODELS AND ASSESS THEIR ACCURACY / F1 METRIC

# 1) LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression(fit_intercept=True,random_state=42)

In [None]:
lr.fit(X_train,y_train)

In [None]:
lr_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,f1_score

In [None]:
# Print the per-class report and the confusion matrix, each followed by a blank
# gap, then the F1 score, all for the logistic-regression test predictions.
for lr_section in (
    classification_report(y_test, lr_pred),
    confusion_matrix(y_test, lr_pred),
):
    print(lr_section)
    print('\n')
print(f1_score(y_test, lr_pred))

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Mean score of logistic regression over 41 cross-validation folds.
# NOTE(review): the folds are drawn from the test split (X_test / y_test), so
# cross_val_score refits clones of `lr` on that split rather than scoring the
# model fitted on X_train - confirm this is the intended evaluation.
lr_accuracies = cross_val_score(estimator=lr, X=X_test, y=y_test, cv=41)
print(lr_accuracies.mean())

# 2) KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier(n_neighbors=30)

In [None]:
knn.fit(X_train,y_train)

In [None]:
knn_pred = knn.predict(X_test)

In [None]:
print(confusion_matrix(y_test,knn_pred))
print('\n')
print(classification_report(y_test,knn_pred))

In [None]:
# Score each candidate k (1..40) by cross-validation on the TRAINING split only,
# so the held-out test rows play no part in choosing the hyper-parameter.
# cross_val_score fits its own clones of the estimator, so no separate
# fit/predict is needed per candidate. 10 folds matches the grid search used
# for the SVC later in this notebook.
Accuracy = [
    cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=10).mean()
    for k in range(1, 41)
]

In [None]:
knn_df = pd.DataFrame({'Neighbours': np.arange(1,41) , 'Accuracy': Accuracy})

In [None]:
sns.scatterplot(data=knn_df , x = 'Neighbours',y='Accuracy')

In [None]:
knn_df[knn_df['Accuracy']==knn_df['Accuracy'].max()]

# implementing the best knn model with k=14

In [None]:
# Refit KNN with 14 neighbours and report on the test split.
knn = KNeighborsClassifier(n_neighbors=14)
knn.fit(X=X_train, y=y_train)
knn_pred = knn.predict(X=X_test)

# Confusion matrix, a blank gap, then the per-class report.
knn_matrix = confusion_matrix(y_test, knn_pred)
knn_report = classification_report(y_test, knn_pred)
print(knn_matrix)
print('\n')
print(knn_report)

# CV Accuracy

In [None]:
# Mean score of the k=14 KNN model over 41 cross-validation folds.
# NOTE(review): the folds are drawn from the test split (X_test / y_test), so
# cross_val_score refits clones of `knn` on that split - confirm this is intended.
knn_accuracies = cross_val_score(estimator=knn, X=X_test, y=y_test, cv=41)
print(knn_accuracies.mean())

# KNN WITH K = 14 IS BETTER THAN LOGISTIC REGRESSION, WITH A CV ACCURACY OF 86.99%

# 3) SVC

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC()

In [None]:
svc.fit(X_train,y_train)

In [None]:
svc_pred = svc.predict(X_test)

In [None]:
print(confusion_matrix(y_test,svc_pred))
print('\n')
print(classification_report(y_test,svc_pred))

# USING GRID SEARCH CV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = {'C': [10000,100000,1000000] , 'gamma': [0.000001 , (10**-7) , (10**-8)]}

In [None]:
grid = GridSearchCV(SVC() , param_grid = params , verbose= 3 , cv=10)

In [None]:
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
grid.best_score_

In [None]:
# Build the tuned SVC straight from the grid-search winner instead of re-typing
# its values, so this cell always matches what `grid.best_params_` reports.
svc = SVC(**grid.best_params_)

In [None]:
svc.fit(X_train,y_train)

In [None]:
svc_pred = svc.predict(X_test)

In [None]:
print(confusion_matrix(y_test,svc_pred))
print('\n')
print(classification_report(y_test,svc_pred))

In [None]:
svc_accuracies = cross_val_score(svc , X_test , y_test , cv = 41)
print(svc_accuracies.mean())

# SVC HAD THE WORST CV ACCURACY COMPARED WITH LOGISTIC REGRESSION AND KNN

# 4) RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=500,random_state=0 , max_depth=5)

In [None]:
rf.fit(X_train,y_train)

In [None]:
rf_pred = rf.predict(X_test)

In [None]:
print(confusion_matrix(y_test,rf_pred))
print('\n')
print(classification_report(y_test,rf_pred))

In [None]:
rf_accuracies = cross_val_score(rf , X_test , y_test , cv = 41)
print(rf_accuracies.mean())

# RANDOM FOREST PERFORMS ALMOST LIKE LOGISTIC REGRESSION (LOGISTIC REGRESSION GETS 4 MORE PREDICTIONS CORRECT THAN RF). SO FAR, KNN SEEMS TO BE THE BEST FIT

# 5) CATBOOST

In [None]:
from catboost import CatBoostClassifier

In [None]:
cb = CatBoostClassifier(iterations=1000 , random_state=0 , loss_function='Logloss' , depth=5)

In [None]:
cb.fit(X_train,y_train , eval_set=(X_test,y_test) , plot=True)

In [None]:
cb_pred = cb.predict(X_test)

In [None]:
print(confusion_matrix(y_test,cb_pred))
print('\n')
print(classification_report(y_test,cb_pred))

In [None]:
cb_accuracies = cross_val_score(cb , X_test , y_test , cv = 40)
print(cb_accuracies.mean())

# SURPRISINGLY, CATBOOST PERFORMED WORST OF ALL MODELS, WITH ONLY 78.33% CV ACCURACY

# HENCE, THE BEST MODEL IS KNN

In [None]:
knn = KNeighborsClassifier(14)
knn.fit(X_train,y_train)
knn_pred = knn.predict(X_test)
print(confusion_matrix(y_test,knn_pred))
print('\n')
print(classification_report(y_test,knn_pred))

In [None]:
knn_accuracies = cross_val_score(knn , X_test , y_test , cv = 41)
print(knn_accuracies.mean())

# MEAN ACCURACY IS 86.99% ACROSS 41 CV FOLDS