In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, RocCurveDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve was removed in scikit-learn 1.2; keep the name usable by
    # pointing it at the replacement, which takes the same (estimator, X, y).
    plot_roc_curve = RocCurveDisplay.from_estimator

%matplotlib inline
sns.set(color_codes=True)
warnings.filterwarnings('ignore')

In [None]:
# Notebook-wide matplotlib defaults: larger fonts and a wide canvas.
plt.rcParams.update({
    'axes.labelsize': 15.,
    'xtick.labelsize': 15.,
    'ytick.labelsize': 15.,
    'legend.fontsize': 15.,
    'figure.figsize': [15., 8.],
})

In [None]:
# Read the heart-disease table from the input directory and preview it.
df = pd.read_csv(filepath_or_buffer='../input/heart-disease-uci/heart.csv')
df.head(n=5)

In [None]:
# Dimensions of the loaded table as (rows, columns).
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Class balance of the label column.
sns.countplot(x="target", data=df, palette="magma");

In [None]:
# One histogram per predictor. The label column is excluded by name rather than
# by position, and `palette` is dropped: seaborn only applies a palette when a
# `hue` variable is given, so it had no effect here.
for feature in df.columns.drop('target'):
    sns.histplot(data=df, x=feature)
    plt.title(f'Distribution of {feature}')
    plt.show()

In [None]:
# Chest-pain type against age, coloured by diagnosis.
# Labels are attached to each scatter call so the legend cannot drift out of
# sync with the plotting order; the title and legend typos are fixed.
plt.figure(figsize=(10,6))
plt.scatter(df.age[df.target==0],
           df.cp[df.target==0],
           c='g',
           label="No Disease")
plt.scatter(df.age[df.target==1],
           df.cp[df.target==1],
           c="b",
           label="Disease")
plt.title("Age vs CP for Heart Disease")
plt.legend()
plt.xlabel("Age")
plt.ylabel("Chest Pain");

In [None]:
# `thalach` against age, one colour per value of the label.
no_disease = df.target == 0
disease = df.target == 1

plt.figure(figsize=(10, 6))
plt.scatter(df.age[no_disease], df.thalach[no_disease], c="g")
plt.scatter(df.age[disease], df.thalach[disease], c="b")
plt.title("Age vs Thalach for Heart Disease")
plt.legend(["No Disease", "Disease"])
plt.xlabel("Age")
plt.ylabel("Thalach");

In [None]:
# Box plot of each of the first 14 columns, to eyeball outliers before capping.
for column in df.columns[:14]:
    sns.boxplot(x=column, data=df)
    plt.show()

In [None]:
# Columns whose box plots showed points beyond the whiskers.
outliers_col = [
    "trestbps",
    "chol",
    "fbs",
    "thalach",
    "oldpeak",
    "ca",
    "thal",
]

In [None]:
# Print the 1.5*IQR lower/upper bounds for every listed column.
for col in outliers_col:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    IQR = q3 - q1
    lower_bridge = q1 - (IQR*1.5)
    upper_bridge = q3 + (IQR*1.5)
    print(f' {col} : Lower Bridge: {lower_bridge}, Upper Bridge: {upper_bridge}')

In [None]:
# Cap (winsorize) each listed column at Q1 - 1.5*IQR and Q3 + 1.5*IQR.
# The bounds are computed from the data instead of being copied in by hand, so
# they stay correct if the input file changes.
for col in outliers_col:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    if iqr == 0:
        # Both bounds collapse onto one value (this is what happened to 'fbs',
        # whose bounds were 0 and 0): capping would overwrite every row with
        # that value and erase the feature, so such columns are left as they are.
        continue
    # Cast first so fractional bounds such as 369.75 are not written into an
    # integer column.
    df[col] = df[col].astype(float).clip(lower=q1 - 1.5 * iqr,
                                         upper=q3 + 1.5 * iqr)

In [None]:
# Redraw the box plots of the first 14 columns after capping.
for column in df.columns[:14]:
    sns.boxplot(x=column, data=df)
    plt.show()

### Outliers are capped at the IQR bounds (no rows are removed)

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for all columns.
sns.pairplot(df)

In [None]:
# Pairwise Pearson correlation between all columns.
corr = df.corr(method="pearson")
corr

In [None]:
# Correlation heatmap. A diverging colormap centred on 0 and pinned to [-1, 1]
# makes sign and strength readable; the qualitative "Accent" map assigned
# unrelated colours to neighbouring values.
sns.heatmap(corr,
           cmap="coolwarm",
           vmin=-1,
           vmax=1,
           center=0,
           annot=True,
           fmt=".2f");

In [None]:
# Preview the table before scaling.
df.head()

In [None]:
# Standardise every predictor using statistics learned from the training rows
# only, so the hold-out rows do not leak into the scaler.
standard = StandardScaler()
features_to_scale = [feature for feature in df.columns if feature not in ['target']]

# The notebook's hold-out split uses test_size=0.3 and random_state=42. The
# shuffle depends only on the number of rows and the seed, so splitting the row
# positions with the same settings selects the same training rows.
# Keep these two arguments in sync with that split.
train_rows, _ = train_test_split(np.arange(len(df)), test_size=0.3, random_state=42)

standard.fit(df.iloc[train_rows][features_to_scale])
df[features_to_scale] = standard.transform(df[features_to_scale])

In [None]:
# Preview the table after scaling.
df.head()

In [None]:
# train test split
# Separate predictors from the label, then hold out 30% of the rows for testing.
y = df['target']
x = df.drop(columns=['target'])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Row counts of the training predictors and training labels.
x_train.shape[0], y_train.shape[0]

In [None]:
# Shape of the full table, for comparison with the training size above.
df.shape

## Base Model

### RandomForestClassifier

In [None]:
# Baseline random forest. The seed is fixed so the scores below are identical
# on every Restart & Run All.
rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)
# Training accuracy.
rf.score(x_train, y_train)

In [None]:
# Accuracy on the held-out rows.
rf.score(x_test,y_test)

In [None]:
# Hold-out predictions of the random forest and their accuracy.
y_preds = rf.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_preds)

In [None]:
# Confusion matrix of the random forest on the hold-out rows.
rf_confusion = confusion_matrix(y_test, y_preds)
sns.heatmap(rf_confusion, annot=True);

In [None]:
# 5-fold cross-validated accuracy of the random forest on the full table.
acc_cv = cross_val_score(
    estimator=rf,
    X=x,
    y=y,
    cv=5,
    scoring="accuracy",
    n_jobs=1,
)

In [None]:
# Mean accuracy across the cross-validation folds.
acc_cv.mean()

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1 for the random forest.
print(classification_report(y_true=y_test, y_pred=y_preds))

In [None]:
# ROC curve of the random forest on the hold-out rows.
# RocCurveDisplay.from_estimator replaces plot_roc_curve, which was removed in
# scikit-learn 1.2.
RocCurveDisplay.from_estimator(rf, x_test, y_test);

### Logistic Regression

In [None]:
# Baseline logistic regression with default settings; show training accuracy.
lr = LogisticRegression().fit(x_train, y_train)
lr.score(x_train, y_train)

In [None]:
# Hold-out evaluation of the baseline logistic regression.
y_preds1 = lr.predict(x_test)
print(f'Accuracy Score: {accuracy_score(y_test,y_preds1)}')
sns.heatmap(confusion_matrix(y_test, y_preds1), annot=True)
plt.show()
print(classification_report(y_test, y_preds1))
# Draw the ROC curve directly: printing the returned display object only added
# its repr to the output, and plot_roc_curve no longer exists in
# scikit-learn >= 1.2.
RocCurveDisplay.from_estimator(lr, x_test, y_test);

### AdaBoostClassifier

In [None]:
# AdaBoost baseline, seeded for reproducibility, evaluated on the hold-out rows.
ad = AdaBoostClassifier(random_state=42).fit(x_train, y_train)
y_preds2 = ad.predict(x_test)
print(f'Accuracy Score: {accuracy_score(y_test,y_preds2)}')
sns.heatmap(confusion_matrix(y_test, y_preds2), annot=True)
plt.show()
print(classification_report(y_test, y_preds2))
# ROC curve of the AdaBoost model (the logistic regression `lr` was plotted
# here before).
RocCurveDisplay.from_estimator(ad, x_test, y_test);

### KNearestNeighbor Classifier

In [None]:
# k-nearest-neighbours baseline evaluated on the hold-out rows.
knn = KNeighborsClassifier().fit(x_train, y_train)
y_preds3 = knn.predict(x_test)
print(f'Accuracy Score: {accuracy_score(y_test,y_preds3)}')
sns.heatmap(confusion_matrix(y_test, y_preds3), annot=True)
plt.show()
print(classification_report(y_test, y_preds3))
# ROC curve of the KNN model (the logistic regression `lr` was plotted here
# before).
RocCurveDisplay.from_estimator(knn, x_test, y_test);

### RidgeClassifier

In [None]:
# Ridge classifier baseline evaluated on the hold-out rows.
# Stored under its own name so the fitted KNN model in `knn` is not overwritten.
ridge = RidgeClassifier().fit(x_train, y_train)
y_preds4 = ridge.predict(x_test)
print(f'Accuracy Score: {accuracy_score(y_test,y_preds4)}')
sns.heatmap(confusion_matrix(y_test, y_preds4), annot=True)
plt.show()
print(classification_report(y_test, y_preds4))
# ROC curve of the ridge model (the logistic regression `lr` was plotted here
# before).
RocCurveDisplay.from_estimator(ridge, x_test, y_test);

## Hyperparameter Tuning 

In [None]:
# Randomized search over logistic-regression hyperparameters.
# The space is split into sub-grids so every sampled candidate is a valid
# solver/penalty pair. A single combined grid produced many invalid pairs
# (e.g. 'lbfgs' with 'l1', or 'elasticnet' without `l1_ratio`) whose fits fail
# and whose warnings are hidden by the global warnings filter, wasting search
# iterations. The string 'none' is dropped because newer scikit-learn rejects
# it; very weak regularisation is still covered by the large-C end of the grid.
c_values = np.logspace(-4, 4, 20)
max_iter_values = [100, 1000, 2500, 5000]

param_grid = [
    {'solver': ['lbfgs', 'newton-cg', 'sag'],
     'penalty': ['l2'],
     'C': c_values,
     'max_iter': max_iter_values},
    {'solver': ['liblinear'],
     'penalty': ['l1', 'l2'],
     'C': c_values,
     'max_iter': max_iter_values},
    {'solver': ['saga'],
     'penalty': ['l1', 'l2'],
     'C': c_values,
     'max_iter': max_iter_values},
    {'solver': ['saga'],
     'penalty': ['elasticnet'],
     'l1_ratio': [0.25, 0.5, 0.75],
     'C': c_values,
     'max_iter': max_iter_values},
]
lr_random = RandomizedSearchCV(lr,
                              param_distributions=param_grid,
                              cv=5,
                              n_iter=100,
                              verbose=2,
                              n_jobs=-1,
                              random_state=42)
lr_random.fit(x_train, y_train)

In [None]:
# Best hyperparameter combination found by the randomized search.
lr_random.best_params_

In [None]:
# Keep the winning parameter dict under a separate name.
lr1_random = lr_random.best_params_

In [None]:
# Display the stored parameter dict.
lr1_random

In [None]:
# Build the tuned model from the parameters the search actually selected.
# Hand-copied values go stale whenever the data preparation or the search
# space changes.
log = LogisticRegression(**lr_random.best_params_)

In [None]:
# Fit the tuned logistic regression on the training rows.
log.fit(x_train,y_train)

In [None]:
# Training accuracy of the tuned model.
log.score(x_train,y_train)

In [None]:
# Hold-out evaluation of the tuned logistic regression.
# Every metric now uses this model's own predictions and estimator: the
# confusion matrix and report were computed from `y_preds4` (the ridge
# predictions), and the ROC curve was drawn for the untuned `lr`.
y_pred = log.predict(x_test)
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True)
plt.show()
print(classification_report(y_test, y_pred))
RocCurveDisplay.from_estimator(log, x_test, y_test);