In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.stats import skew, norm 
from warnings import filterwarnings as filt

# Install an 'ignore' warnings filter with no category restriction, so warnings
# (deprecation notices included) are not shown for the rest of the session.
filt('ignore')
# Notebook-wide plotting defaults: style sheet and default figure size.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (12,6)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Using a decision tree and a random forest to test cost-complexity pruning.

In [None]:
# Read the heart-disease CSV from the Kaggle input directory and display
# the frame's (rows, columns) shape as the cell output.
df = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')
df.shape

In [None]:
df.head()

In [None]:
df.isnull().values.sum()

In [None]:
from eli5 import show_weights
from eli5.sklearn import PermutationImportance 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

In [None]:
# Class balance of the label column. The Series is passed through the `x`
# keyword: from seaborn 0.12 the first positional argument is `data`, so a
# positional Series is no longer interpreted as the x variable.
sns.countplot(x = df.target)

In [None]:
# Separate the predictors from the `target` label, then hold out 20% of the rows
# for testing. `stratify = y` is passed so the split is stratified on the label,
# and the fixed `random_state` makes the same rows land in the test set on every run.
x = df.drop(['target'], axis = 1)
y = df.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 42)

In [None]:
def permImp(x, y):
    """Show eli5 permutation-importance weights for a default random forest.

    A RandomForestClassifier with default settings is fitted on ``(x, y)``;
    a PermutationImportance wrapper around that forest is then fitted on the
    same ``(x, y)``.

    Parameters
    ----------
    x : object exposing ``.columns`` (e.g. a DataFrame) holding the features;
        the column names are used as feature labels.
    y : target values aligned with ``x``.

    Returns
    -------
    The object produced by ``show_weights`` for the fitted importance estimator.
    """
    forest = RandomForestClassifier()
    forest.fit(x, y)

    importance = PermutationImportance(forest)
    importance.fit(x, y)

    column_labels = x.columns.tolist()
    return show_weights(importance, feature_names = column_labels)

In [None]:
permImp(x, y)

In [None]:
sns.pairplot(df.drop('fbs', axis = 1), hue = 'target')

In [None]:
sns.heatmap(df.corr(), fmt = '.1f', annot = True, cmap = 'gnuplot')

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, Normalizer, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import plot_tree, DecisionTreeClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

In [None]:
def best_model(x, y):
    """Cross-validate every (model, scaler) combination and tabulate the mean F1.

    Three classifiers (SVC, decision tree, random forest) are each evaluated
    with no scaler and with five different scalers, using a shuffled
    10-fold StratifiedKFold and ``scoring = 'f1'``.

    Parameters
    ----------
    x : feature matrix passed to ``cross_val_score``.
    y : target vector passed to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        Mean cross-validated F1 scores with one row per model name and one
        column per scaler name.
    """
    models = [SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
    mnames = ['svm', 'decision tree', 'random forest']
    scalers = [None, StandardScaler(), RobustScaler(), MinMaxScaler(), PowerTransformer(), Normalizer()]
    snames = ['none', 'std', 'robust', 'min max', 'power transformer', 'normalizer']
    # One list per scaler; each list receives one score per model, in model order.
    scores = [[] for _ in range(len(snames))]
    total = len(mnames) * len(snames)
    iterr = 0
    print(f'total no. iteration : {total}')

    for model in models:
        for ind, scaler in enumerate(scalers):
            # Build the estimator for this combination under its own name.
            # Rebinding `model` here would make the next scaler wrap the
            # previous pipeline, stacking scalers on top of each other.
            if scaler is not None:
                estimator = Pipeline(steps = [('scaler', scaler), ('model', model)])
            else:
                estimator = model

            cv = StratifiedKFold(10, shuffle = True)
            score = cross_val_score(estimator, x, y, cv = cv, scoring = 'f1').mean()
            scores[ind].append(score)

            iterr += 1
            print(f'iteration no. :======> {iterr} / {total}')

    # Rows are scalers at construction time; transpose so rows are models.
    return pd.DataFrame(scores, index = snames, columns = mnames).T

def get_score(xt, yt, xtest, ytest, model, scaler = None):
    """Fit ``model`` on the training data and print a hold-out evaluation report.

    Prints the training score, testing score, ROC AUC and classification
    report, then draws the confusion matrix as a heatmap on the current axes.

    Parameters
    ----------
    xt, yt : training features and labels; ``model.fit(xt, yt)`` is always called.
    xtest, ytest : held-out features and labels used for the report.
    model : estimator exposing ``fit``, ``predict`` and ``score``.
    scaler : optional transformer. When given, the model is wrapped in a
        two-step Pipeline (``'scaler'`` then ``'model'``) before fitting.

    Returns
    -------
    None
    """
    if scaler is not None:
        model = Pipeline(steps = [('scaler', scaler), ('model', model)])
    model.fit(xt, yt)
    pred = model.predict(xtest)

    # ROC AUC measures how well samples are ranked, so it needs continuous
    # scores. Hard class predictions are used only when the model offers
    # neither a decision function nor class probabilities.
    if hasattr(model, 'decision_function'):
        auc_scores = model.decision_function(xtest)
    elif hasattr(model, 'predict_proba'):
        # Second column is taken as the positive-class probability (binary target assumed).
        auc_scores = model.predict_proba(xtest)[:, 1]
    else:
        auc_scores = pred

    print(' Report '.center(70, '='))
    print()
    print(f"training score :==> {model.score(xt, yt)}")
    print(f"testing score  :==> {model.score(xtest, ytest)}")
    print(f"roc auc score  :==> {roc_auc_score(ytest, auc_scores)}")
    print()
    print(classification_report(ytest, pred))
    sns.heatmap(confusion_matrix(ytest, pred), fmt = '.1f', annot = True)
    
def gridcv(x, y, model, params, scaler = None, fold = 10):
    """Run a stratified, shuffled grid search and return it with a score summary.

    Parameters
    ----------
    x, y : features and labels the search is fitted on.
    model : estimator to tune.
    params : parameter grid handed to GridSearchCV. When ``scaler`` is given
        the estimator becomes a Pipeline whose final step is named
        ``'model'``, so keys must be prefixed with ``model__``.
    scaler : optional transformer placed in front of the model in a Pipeline.
    fold : number of StratifiedKFold splits (default 10).

    Returns
    -------
    (clf, summary)
        ``clf`` is the fitted GridSearchCV; ``summary`` is a DataFrame holding
        the ``mean_train_score``, ``mean_test_score`` and ``params`` columns
        of ``clf.cv_results_``.
    """
    if scaler is not None:
        model = Pipeline(steps = [('scaler', scaler), ('model', model)])
    cv = StratifiedKFold(fold, shuffle = True)
    # return_train_score must be the boolean True (not the string 'True') so
    # that the train-score columns selected below are produced.
    clf = GridSearchCV(model, param_grid = params, cv = cv, return_train_score = True)
    clf.fit(x, y)
    results = pd.DataFrame(clf.cv_results_)
    return clf, results[['mean_train_score', 'mean_test_score', 'params']]

def plot_cv(res):
    """Plot mean train and mean test scores against the row index of ``res``.

    ``res`` must provide ``mean_train_score`` and ``mean_test_score`` columns
    (as in the summary frame returned by ``gridcv``). Both curves are drawn on
    the current axes, train first, followed by a title and a two-entry legend.
    """
    positions = res.index
    for column in ('mean_train_score', 'mean_test_score'):
        sns.lineplot(x = positions, y = res[column])

    plt.title('accuracy comparision for train and test set')
    plt.legend(['training score', 'testing score'])

In [None]:
# The `fbs` feature was not useful to the model at all, so drop it from both splits.
# `errors = 'ignore'` keeps this cell safe to re-run after the column is already gone.

x_train = x_train.drop(['fbs'], axis = 1, errors = 'ignore')
x_test = x_test.drop(['fbs'], axis = 1, errors = 'ignore')

In [None]:
best_model(x_train, y_train)

### Decision tree classifier without pruning

In [None]:
# Baseline: fit a decision tree with default settings (no pruning) and draw it.
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
plt.figure(figsize = (18,15))
# Pass the feature names as a plain list: some scikit-learn releases validate
# this argument and reject a pandas Index.
plot_tree(clf, filled = True, feature_names = x_train.columns.tolist(), fontsize=12);

In [None]:
# Compare the fitted tree's score on the data it was trained on with its score
# on the held-out split; a large gap between the two indicates overfitting.
print(f"training score : {clf.score(x_train, y_train)}")
print(f"testing score : {clf.score(x_test, y_test)}")

### pruning

In [None]:
# Ask the current `clf` for its cost-complexity pruning path on the training
# data, then keep the two arrays it reports: the candidate ccp_alpha values
# and the impurities that accompany them.
path = clf.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [None]:
ccp_alphas

In [None]:
impurities

In [None]:
# Grid-search `ccp_alpha` over the values taken from the pruning path. Because a
# scaler is passed, gridcv wraps the tree in a Pipeline whose final step is named
# 'model', hence the `model__` prefix. `clf` is rebound to the fitted search object.
clf , results = gridcv(x_train, y_train, DecisionTreeClassifier(), {'model__ccp_alpha' : ccp_alphas}, StandardScaler())

In [None]:
plot_cv(results)

Hence, with the help of grid search over `ccp_alpha`, we were able to get both low bias and low variance.

In [None]:
results.sort_values('mean_test_score', ascending = False ).head()

In [None]:
clf.best_params_

In [None]:
# Evaluate the winning pipeline itself. Passing the GridSearchCV object would make
# get_score's `model.fit` re-run the whole search on freshly shuffled folds, so the
# model being scored could differ from the `best_params_` reported above.
get_score(x_train, y_train, x_test, y_test, clf.best_estimator_)

### ccp alpha for random forest

In [None]:
# Repeat the `ccp_alpha` grid search with a random forest. The candidate alphas are
# the same `ccp_alphas` array used for the decision-tree search above; `clf` and
# `results` are rebound to the forest search.
clf, results = gridcv(x_train, y_train, RandomForestClassifier(), {'model__ccp_alpha' : ccp_alphas}, StandardScaler())

In [None]:
plot_cv(results)

In [None]:
results.sort_values('mean_test_score', ascending = False).head()

In [None]:
# Evaluate the winning pipeline itself. Passing the GridSearchCV object would make
# get_score's `model.fit` re-run the whole search on freshly shuffled folds, so the
# model being scored could differ from the best row of `results` shown above.
get_score(x_train, y_train, x_test, y_test, clf.best_estimator_)

### svm

In [None]:
get_score(x_train, y_train, x_test, y_test, SVC(), RobustScaler())

In [None]:
# Candidate SVC hyper-parameter values, keyed by their plain parameter names.
params = dict(
    C = [0.1, 1, 2, 3, 4, 5, 10, 20, 50, 100, 200],
    kernel = ['rbf', 'poly', 'sigmoid'],
    gamma = ['scale', 'auto'],
    class_weight = [None, 'balanced'],
)

# Same grid with every key prefixed by "model__", i.e. addressed to the pipeline
# step named 'model'. Displayed as the cell output.
pip_params = dict(("model__" + name, grid) for name, grid in params.items())
pip_params

In [None]:
clf, results = gridcv(x_train, y_train, SVC(), pip_params, RobustScaler())

In [None]:
plot_cv(results)

In [None]:
results.sort_values('mean_test_score', ascending = False).head()

In [None]:
clf

In [None]:
results.sort_values('mean_test_score', ascending = False).iloc[0,-1]

In [None]:
# Fit and evaluate an SVC with hand-written hyper-parameters behind a RobustScaler.
# NOTE(review): these values look copied from one grid-search run; the search folds
# are shuffled without a seed, so confirm they still match the best row of `results`.
get_score(x_train, y_train, x_test, y_test, SVC(class_weight='balanced', gamma='auto'), RobustScaler())

The SVC model above gave the highest score: 87%.