In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report
import pandas_profiling as pp

In [None]:

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

## Data extraction

In [None]:
# Read the heart.csv file from the Kaggle input directory into a DataFrame.
heart_csv_path = "../input/health-care-data-set-on-heart-attack-possibility/heart.csv"
df = pd.read_csv(heart_csv_path)

## Profiling Report

In [None]:
# Missing-value count per column. Printed explicitly: a notebook cell only
# renders its last expression, so a bare `df.isnull().sum()` placed here would
# be computed and silently discarded.
print(df.isnull().sum())

# Column dtypes and non-null counts (df.info() writes its own output).
df.info()

# Full profiling report; kept as the last expression so the notebook renders it.
pp.ProfileReport(df)

In [None]:
# Separate the feature matrix from the label column.
X = df.drop(columns='target')
y = df['target']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Learn the scaling statistics from the training split only and reuse them on
# the test split. Re-fitting the scaler on X_test would scale the two splits
# with different means/variances and leak test-set statistics into evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Preprocessing

**For this task I used six popular classification models. The performance metric is the F1 score.**

In [None]:
# importing libraries

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB


# Candidate models, keyed by a display name.
# The stochastic estimators get a fixed seed (same value as the train/test
# split) so the model comparison below is reproducible across runs.
clfs = {
    'LogisticR': LogisticRegression(),
    'SGD': SGDClassifier(penalty='elasticnet', alpha=0.005, random_state=1),
    'Random Forest': RandomForestClassifier(n_estimators=1000, random_state=1),
    'SVC': LinearSVC(C=1, loss='hinge', max_iter=10000, random_state=1),
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'GNB': GaussianNB()
}

# Training & Testing loop
# Every model is handled identically: fit on the training split, predict the
# test split, and keep the predictions as a Series keyed by the model's name.
test_predictions = {}
for name, clf in clfs.items():
    test_predictions[name] = pd.Series(clf.fit(X_train, y_train).predict(X_test))

# Expose the per-model predictions under the names used by the following cells.
log = test_predictions['LogisticR']
sgd = test_predictions['SGD']
randomforest = test_predictions['Random Forest']
svc = test_predictions['SVC']
gnb = test_predictions['GNB']
knn = test_predictions['KNN']

In [None]:
# Gather the per-model test predictions side by side, one column per model.
model_predictions = {
    'log': log,
    'sgd': sgd,
    'rf': randomforest,
    'svc': svc,
    'gnb': gnb,
    'knn': knn,
}
preds = pd.concat(
    list(model_predictions.values()),
    axis=1,
    keys=list(model_predictions.keys()),
)

In [None]:
from sklearn.metrics import f1_score

# Print the test-set F1 score for each column (model) of `preds`.
for model_name in preds.columns:
    model_f1 = f1_score(y_test, preds[model_name])
    print('F1_score of %s model is %f' % (model_name, model_f1))

**Apparently, the model with the best performance is SGD.**
**Let's try it with polynomial features.**

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2)

# Fit the polynomial expansion on the training split and only apply it to the
# test split, mirroring how any fitted transformer should be used.
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Cross-validate on the TRAINING split: the test split is reserved for the
# final evaluation and must not take part in model selection.
# scoring='f1' matches the metric used to compare the models above (the
# default would be accuracy); random_state makes the SGD result repeatable.
# NOTE(review): alpha=0.0001 differs from the alpha=0.005 used for the SGD
# model in the comparison cell — confirm this is intentional.
cross_val_score(
    SGDClassifier(penalty='elasticnet', alpha=0.0001, random_state=1),
    X_train_poly,
    y_train,
    cv=3,
    scoring='f1',
)

**Okay, so the CV score with polynomial features gets lower as we add new dimensions. Logically, the relationships between the features are closer to linear than to quadratic or cubic.**

## Plotting learning curves

In [None]:
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    GaussianNB(),
    X,
    y,
    # Number of folds in cross-validation
    cv=10,
    # Evaluation metric
    scoring='f1',
    # Use all computer cores
    n_jobs=-1,
    # 30 different sizes of the training set, from 1% to 100% of each fold
    train_sizes=np.linspace(0.01, 1.0, 30),
    # learning_curve takes *prefixes* of each training fold; shuffling first
    # keeps the small subsets from depending on the row order of the CSV
    # (NOTE(review): row order of the file is not visible here — confirm).
    shuffle=True,
    random_state=1,
)

# Mean and standard deviation of the training scores across the CV folds
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

# Mean and standard deviation of the validation scores across the CV folds
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Draw lines
plt.plot(train_sizes, train_mean, '--', color="#111111", label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Draw bands (+/- one standard deviation around each mean)
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

# Create plot
plt.title("Learning Curve")
plt.xlabel("Training Set Size")
plt.ylabel("F1 Score")
plt.legend(loc="best")
plt.tight_layout()
plt.show()

**We can observe that the F1 score stabilizes as the model gets more samples to train on.**

In [None]:
from sklearn.metrics import precision_recall_curve

# Fit a fresh Gaussian Naive Bayes model on the training split
# (this rebinds `gnb` to the fitted estimator).
gnb = GaussianNB().fit(X_train, y_train)

# Use the second predict_proba column as the score for the curve
# (presumably the probability of target == 1 — confirm the label encoding).
# For SGDClassifier, use decision_function.
train_probabilities = gnb.predict_proba(X_train)
y_scores = train_probabilities[:, 1]

# Precision and recall at every candidate threshold, measured on the training split.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

def plot_prc(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    Draws on the current matplotlib axes. The last element of `precisions`
    and `recalls` is dropped so that both are plotted point-for-point against
    `thresholds`.

    Parameters
    ----------
    precisions, recalls : sequences supporting `[:-1]` slicing
        Values to plot; all but the last entry are drawn.
    thresholds : sequence
        X-axis values, one per plotted precision/recall point.
    """
    ax = plt.gca()
    ax.plot(thresholds, precisions[:-1], 'b--', label='Precision')
    ax.plot(thresholds, recalls[:-1], 'g-', label='Recall')
    ax.set_xlabel('Thresholds')
    ax.legend(loc='center left')
    # Precision and recall are plotted on a fixed 0..1 vertical range.
    ax.set_ylim([0, 1])


plot_prc(precisions, recalls, thresholds)

**And this is how we can shift the decision boundary to trade precision against recall.**

In [None]:
from sklearn.metrics import recall_score

# Second predict_proba column for the test split, computed once and then
# thresholded twice.
positive_proba_test = gnb.predict_proba(X_test)[:, 1]

# Lenient cut-off: predict positive from a score of 0.1 upwards.
y_pred = (positive_proba_test >= 0.1).astype(bool)

# Strict cut-off: predict positive only from a score of 0.9 upwards.
y_pred2 = (positive_proba_test >= 0.9).astype(bool)

In [None]:
# Recall on the test split for the two thresholded prediction vectors.
recall_at_low_threshold = recall_score(y_test, y_pred)
recall_at_high_threshold = recall_score(y_test, y_pred2)

print('If we set the threshold to 0.1, then we get a recall score of %s' % recall_at_low_threshold)
print('If we set the threshold to 0.9, then we get a recall score of %s' % recall_at_high_threshold)