In [None]:
# importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## EDA

In [None]:
# Load the heart-disease data set from the CSV file.
df = pd.read_csv("../input/heart-disease-uci/heart.csv")

# A bare expression is only rendered when it is the last statement of a cell,
# so the summary statistics are displayed explicitly instead of being discarded.
# `display` is provided by the IPython kernel without an import.
display(df.describe())

# Age against `thalach`, one colour per target class. The scatter calls are
# made in the order target == 1, target == 0, which is the order the legend
# labels below are matched against.
plt.scatter(df.age[df.target == 1], df.thalach[df.target == 1], c="salmon")
plt.scatter(df.age[df.target == 0], df.thalach[df.target == 0], c="lightblue")

plt.title("Heart disease age and max heart rate co-relation")
plt.xlabel("Age")
plt.ylabel("Max Heart rate")
plt.legend([1, 0]);


## Checking correlation between features and labels

In [None]:
# Pairwise correlation of every column in df, drawn as an annotated heatmap
# on a dedicated 16x8 figure.
fig, ax = plt.subplots(figsize=(16, 8))
corr = df.corr()
ax = sns.heatmap(corr, annot=True, fmt=".2f", cmap="gray_r", ax=ax);

In [None]:
# Separate the label column from the feature columns.
Y = df["target"]
X = df.drop(columns="target")

# Bar chart of each feature's correlation with the target column.
X.corrwith(df.target).plot.bar(grid=True, figsize=(12, 8), title="Correlation with target");


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for testing. random_state is pinned (to the same
# seed value used elsewhere in this notebook) because no seed is set before
# this point: without it the split, and therefore every score and tuned
# hyperparameter computed from it, would differ on each run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=17)

# Baseline models to compare, keyed by a short display name.
models = {"log": LogisticRegression(max_iter=1001),
          "knn": KNeighborsClassifier(),
          "rfc": RandomForestClassifier()}

def fit_and_score(models, x_train, x_test, y_train, y_test, seed=17):
    """Fit each model on the training data and score it on the test data.

    Parameters
    ----------
    models : dict
        Maps a name to an object exposing ``fit(x, y)`` and ``score(x, y)``.
        The objects are fitted in place.
    x_train, y_train
        Data passed to each model's ``fit``.
    x_test, y_test
        Data passed to each model's ``score``.
    seed : int or None, default 17
        Value handed to ``np.random.seed`` before any model is fitted.
        Pass ``None`` to leave NumPy's global random state untouched.

    Returns
    -------
    dict
        Maps each model name to the value returned by its ``score`` call,
        in the iteration order of ``models``.
    """
    # Seeding mutates NumPy's *global* random state, so it is optional.
    if seed is not None:
        np.random.seed(seed)

    model_scores = {}
    for name, model in models.items():
        model.fit(x_train, y_train)
        model_scores[name] = model.score(x_test, y_test)

    return model_scores

In [None]:
# Fit every baseline model on the training split and score it on the
# held-out split; the resulting name -> score mapping is the cell output.
model_scores = fit_and_score(
    models=models,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)
model_scores

In [None]:
# Single-row frame (index "accuracy", one column per model), transposed so
# that each model becomes one bar.
model_comp = pd.DataFrame(model_scores, index=["accuracy"])
model_comp.transpose().plot(kind="bar");

In [None]:
# Sweep n_neighbors over 1..20, recording the train and test score of the
# same KNN estimator re-fitted for each value.
neighbors = range(1, 21)
knn = KNeighborsClassifier()

train_scores, test_scores = [], []
for k in neighbors:
    knn.set_params(n_neighbors=k).fit(x_train, y_train)
    train_scores.append(knn.score(x_train, y_train))
    test_scores.append(knn.score(x_test, y_test))

# Both score curves against the number of neighbours.
plt.plot(neighbors, train_scores, label="train scores")
plt.plot(neighbors, test_scores, label="test scores")
plt.xlabel("no. of neighbors")
plt.ylabel("model_score")
plt.legend();

best_test_score = max(test_scores)
print(f"maximum accuracy: {best_test_score*100:.2f}%");

## **Using RandomizedSearchCV to find better parameters**

In [None]:
# Candidate hyperparameter values sampled by the randomized searches.
LR_grid = dict(
    C=np.logspace(-4, 4, 20),
    solver=["liblinear"],
)

RF_grid = dict(
    n_estimators=[1, 200, 500, 1000],
    max_depth=[None, 3, 5, 10],
    min_samples_split=np.arange(2, 20, 2),
    min_samples_leaf=np.arange(1, 20, 2),
)

# Seed NumPy's global RNG before the searches are run.
np.random.seed(17)

# 10 sampled candidates per search, each evaluated with 5-fold cross-validation.
LR = RandomizedSearchCV(
    LogisticRegression(), param_distributions=LR_grid, cv=5, n_iter=10, verbose=True
)
RF = RandomizedSearchCV(
    RandomForestClassifier(), param_distributions=RF_grid, cv=5, n_iter=10, verbose=True
)

# Fit logistic regression first, then the random forest, and show the
# winning parameter sets as a (LR, RF) tuple.
LR.fit(x_train, y_train)
RF.fit(x_train, y_train)
LR.best_params_, RF.best_params_

In [None]:
# Test-split score of each randomized search, as a (LR, RF) tuple.
tuple(search.score(x_test, y_test) for search in (LR, RF))

## GridSearchCV

In [None]:
# Exhaustive grids: 30 C values for logistic regression, and 2 values for
# each of four random-forest parameters.
LGCV = dict(
    C=np.logspace(-4, 4, 30),
    solver=["liblinear"],
)

# NOTE(review): with min_samples_leaf >= 20, varying min_samples_split between
# 6 and 12 probably cannot change the fitted trees - confirm whether both are needed.
RFGCV = dict(
    n_estimators=[1000, 1200],
    max_depth=[10, 12],
    min_samples_split=[6, 12],
    min_samples_leaf=[20, 22],
)

# Every grid point is evaluated with 5-fold cross-validation.
LRGS = GridSearchCV(LogisticRegression(), param_grid=LGCV, cv=5, verbose=True)
RFGS = GridSearchCV(RandomForestClassifier(), param_grid=RFGCV, cv=5, verbose=True)

# Fit logistic regression first, then the random forest, and show the
# winning parameter sets as a (LR, RF) tuple.
LRGS.fit(x_train, y_train)
RFGS.fit(x_train, y_train)
LRGS.best_params_, RFGS.best_params_



In [None]:
# Test-split score of each grid search, as a (LR, RF) tuple.
tuple(search.score(x_test, y_test) for search in (LRGS, RFGS))

## Different metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve no longer exists in recent scikit-learn releases.
    # Provide a drop-in replacement built on RocCurveDisplay so that code
    # calling plot_roc_curve(estimator, X, y) keeps working.
    from sklearn.metrics import RocCurveDisplay

    def plot_roc_curve(estimator, X, y, **kwargs):
        """Plot the ROC curve of a fitted estimator via RocCurveDisplay.from_estimator."""
        return RocCurveDisplay.from_estimator(estimator, X, y, **kwargs)

# Test-split predictions of the grid-searched logistic regression.
y_pred = LRGS.predict(x_test)

y_pred

In [None]:
# One ROC plot per grid-searched model, both evaluated on the test split.
for search in (LRGS, RFGS):
    plot_roc_curve(search, x_test, y_test)

In [None]:
sns.set(font_scale=1.5)


def plot_conf_mat(y_test, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The heatmap is drawn on a new 5x5 figure.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(confusion_matrix(y_test, y_pred),
                annot=True,
                fmt="d",  # cells are integer counts; avoid scientific notation
                cbar=False,
                ax=ax)
    # confusion_matrix puts true labels on rows and predictions on columns;
    # the heatmap draws rows along the y-axis and columns along the x-axis.
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")


plot_conf_mat(y_test, y_pred)


In [None]:
# Text report of the classification metrics for the test-split predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Build the estimator from the grid-search result instead of a hand-copied
# literal: the previously hard-coded C is a point of the 20-value LR_grid,
# not of the 30-value grid LRGS searched, so it could not have been LRGS's
# best C. best_params_ holds one chosen value for each key of LGCV.
est = LogisticRegression(**LRGS.best_params_)

# Mean precision over 10-fold cross-validation on the full data set.
cvs_p = cross_val_score(est, X, Y, cv=10, scoring="precision").mean()
cvs_p




**The recall score can be increased with more data, so that there are fewer false negatives, i.e. fewer patients who have heart disease but whose condition is missed.**