# Introduction

## This notebook walks through an end-to-end process for classifying heart disease
### Please feel free to provide your suggestions. Upvote if you found it helpful

## Describing Dataset
1. Age: age of the patient [years]
2. Sex: sex of the patient [M: Male, F: Female]
3. ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
4. RestingBP: resting blood pressure [mm Hg]
5. Cholesterol: serum cholesterol [mg/dl]
6. FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
7. RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
8. MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
9. ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
10. Oldpeak: oldpeak = ST [Numeric value measured in depression]
11. ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
12. HeartDisease: output class [1: heart disease, 0: Normal]


# Imports

In [1]:
!pip install pandas_flavor

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under /kaggle/input (walked recursively).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, ridge_regression, Lasso, ElasticNet, RidgeClassifierCV, ElasticNetCV
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.svm import SVC


# Custom methods for pandas

In [1]:
from pandas_flavor import register_dataframe_method,register_series_method
from IPython.core.display import display, HTML

@register_dataframe_method
def missing(df):
    """Summarise the columns of ``df`` that contain missing values.

    Returns a DataFrame with one row per column that has at least one
    missing entry, ordered by missing count (largest first), with the
    columns ``Feature``, ``dtype``, ``count`` and ``percent`` (missing rows
    as a percentage of ``len(df)``, rounded to 2 decimals).

    When no column has missing values the result has zero rows but still
    carries the four named columns.
    """
    n_rows = len(df)
    rows = []
    for col in df.columns:
        # Count once per column instead of re-scanning it for every field.
        n_missing = df[col].isna().sum()
        if n_missing != 0:
            rows.append((col, str(df[col].dtypes), n_missing, np.round(n_missing / n_rows * 100, 2)))

    # list.sort is stable, so ties keep the original column order.
    rows.sort(key=lambda row: row[2], reverse=True)

    return pd.DataFrame(rows, columns=["Feature", "dtype", "count", "percent"])

@register_dataframe_method
def get_numeric_df(df):
    """Return the sub-frame of ``df`` made of its numeric-dtype columns only."""
    numeric_part = df.select_dtypes(include=np.number)
    return numeric_part

@register_dataframe_method
def get_numeric_cols(df):
    """Return the names of the numeric-dtype columns of ``df`` as a list."""
    numeric_part = df.select_dtypes(include=np.number)
    return [name for name in numeric_part.columns]

@register_dataframe_method
def discrete_features_cols(df,thresold):
    """List the columns whose number of distinct values is below ``thresold``.

    ``thresold`` is an absolute count of unique values, not a fraction of
    the rows. The column dtype plays no role in the decision.
    """
    selected = []
    for feature in df.columns:
        n_distinct = len(df[feature].unique())
        if n_distinct < thresold:
            selected.append(feature)
    return selected

@register_dataframe_method
def discrete_features_df(df,thresold):
    """Return ``df`` restricted to the columns chosen by ``discrete_features_cols``.

    ``thresold`` is forwarded unchanged (a count of unique values).
    """
    discrete_names = discrete_features_cols(df=df, thresold=thresold)
    return df[discrete_names]

@register_dataframe_method
def continuous_features_cols(df,thresold):
    """List the columns that have at least ``thresold`` distinct values.

    ``thresold`` is an absolute count of unique values; this is the
    complement of ``discrete_features_cols`` for the same threshold.
    """
    selected = []
    for feature in df.columns:
        n_distinct = len(df[feature].unique())
        if n_distinct >= thresold:
            selected.append(feature)
    return selected

@register_dataframe_method
def continuous_features_df(df,thresold):
    """Return ``df`` restricted to the columns chosen by ``continuous_features_cols``.

    ``thresold`` is forwarded unchanged (a count of unique values).
    """
    continuous_names = continuous_features_cols(df=df, thresold=thresold)
    return df[continuous_names]

@register_dataframe_method
def dtypes_of_cols(df):
    """Return a two-column table (``Columns``, ``dtype``) with the dtype of every column."""
    dtype_table = df.dtypes.to_frame().reset_index()
    return dtype_table.rename(columns={'index': "Columns", 0: "dtype"})

@register_dataframe_method
def describe_discrete_cols(df,thresold, ascending=True):
    """Tabulate the distinct values and cardinality of each discrete column.

    A column is discrete when ``df.discrete_features_cols(thresold=thresold)``
    selects it. The result is indexed by column name and has the columns
    ``Values`` (array from ``Series.unique``) and ``cardinality`` (from
    ``Series.nunique``), sorted by cardinality in the direction given by
    ``ascending``.

    When no column qualifies, an empty frame with the same two column names
    is returned.
    """
    discrete_names = df.discrete_features_cols(thresold=thresold)
    if not discrete_names:
        # Nothing to describe: sorting by the positional column 1 below
        # would raise KeyError on an empty frame.
        return pd.DataFrame(columns=["Values", "cardinality"])

    values = pd.DataFrame()
    for col in discrete_names:
        # Row 0 holds the unique values, row 1 the cardinality.
        values[col] = [df[col].unique(), df[col].nunique()]

    return values.transpose().sort_values(by = 1,ascending=ascending).rename({0:"Values",1:"cardinality"},axis=1)
    
    
@register_series_method
def IQR_range(df):
    """Return the IQR outlier fences of a Series as ``(lower, upper)``.

    ``lower = Q1 - 1.5 * IQR`` and ``upper = Q3 + 1.5 * IQR`` where Q1/Q3
    are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Raises
    ------
    TypeError
        If ``df`` is not a ``pandas.Series``. Raised explicitly because an
        ``assert`` would be skipped when Python runs with ``-O``.
    """
    if not isinstance(df, pd.Series):
        raise TypeError("df must be of type pandas.Series")

    # Series.quantile ignores missing values; np.quantile would turn a
    # single NaN into NaN fences.
    Q1, Q3 = df.quantile([0.25, 0.75])
    IQR = Q3 - Q1

    lower_range = Q1 - 1.5 * IQR
    upper_range = Q3 + 1.5 * IQR

    return (lower_range,upper_range)
        
@register_dataframe_method
def IQR_range(df):
    """Return the IQR outlier fences of every numeric column of a DataFrame.

    The result is indexed by column name with the columns ``IQR_Low``
    (``Q1 - 1.5 * IQR``) and ``IQR_High`` (``Q3 + 1.5 * IQR``). Only the
    columns reported by ``df.get_numeric_cols()`` are included.

    Raises
    ------
    TypeError
        If ``df`` is not a ``pandas.DataFrame``. Raised explicitly because an
        ``assert`` would be skipped when Python runs with ``-O``.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")

    features = {}
    for col in df.get_numeric_cols():
        # Series.quantile ignores missing values; np.quantile would turn a
        # single NaN into NaN fences.
        Q1, Q3 = df[col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        features[col] = (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)

    return pd.DataFrame.from_dict(features,orient='index').rename({0: 'IQR_Low',1: 'IQR_High'}, axis=1)
        
@register_dataframe_method
def compare_cols(df,l_feat,r_feat, percent=False, percent_of_total=False):
    """Aggregate columns per group and optionally add percentage columns.

    Parameters
    ----------
    l_feat : column name or list of column names
        Passed to ``df.groupby`` as the grouping key(s).
    r_feat : dict
        Maps a column name to the name of the aggregation to apply to it
        (a string such as ``"count"``); one output table is produced per entry.
    percent : bool
        Add a ``"<col> %"`` column: each aggregated value as a percentage of
        the sum over its first index level (the first grouping key).
    percent_of_total : bool
        Add a ``"<col> % of total"`` column: each aggregated value as a
        percentage of the sum over all groups.

    Returns
    -------
    list of DataFrame
        One frame per entry of ``r_feat``, indexed by the groups, whose first
        column is named ``"<col> <agg>"``. Percentages are rounded to 2 decimals.
    """
    comp = []
    for key, val in r_feat.items():
        aggregated = df.groupby(l_feat, sort=True).agg({key: val})[key]
        tmp = aggregated.to_frame(name=key + " " + val)

        if percent:
            # transform("sum") keeps the original index, so the division is
            # aligned row by row (groupby.apply may prepend the group key to
            # the index, which breaks the column assignment).
            group_totals = aggregated.groupby(level=0).transform("sum")
            tmp[key + " %"] = np.round(100 * aggregated / group_totals, 2)

        if percent_of_total:
            tmp[key + " % of total"] = np.round(aggregated / aggregated.sum() * 100, 2)

        comp.append(tmp)

    return comp
    
@register_series_method
def IQR_percent(df):
    """Return the percentage of entries of a Series lying outside its IQR fences.

    The fences come from ``df.IQR_range()``; an entry counts as inside when
    ``Series.between(lower, upper)`` is true for it. The result is rounded
    to 2 decimals.

    Raises
    ------
    TypeError
        If ``df`` is not a ``pandas.Series``. Raised explicitly because an
        ``assert`` would be skipped when Python runs with ``-O``.
    """
    if not isinstance(df, pd.Series):
        raise TypeError("df must be of type pandas.Series")

    lower_range, upper_range = df.IQR_range()

    length = len(df)
    n_outside = length - df.between(lower_range,upper_range).sum()
    return np.round(n_outside / length * 100, 2)

@register_dataframe_method
def IQR_percent(df):
    """Return the outlier percentage of every numeric column that has outliers.

    For each column reported by ``df.get_numeric_cols()`` the value of the
    Series method ``IQR_percent`` is computed; columns whose percentage is
    exactly 0 are left out. The result is indexed by column name with a
    single column ``Outlier percent``.

    Raises
    ------
    TypeError
        If ``df`` is not a ``pandas.DataFrame``. Raised explicitly because an
        ``assert`` would be skipped when Python runs with ``-O``.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")

    features = {}
    for col in df.get_numeric_cols():
        # Reuse the Series accessor instead of repeating its arithmetic here.
        outlier_pct = df[col].IQR_percent()
        if outlier_pct != 0:
            features[col] = outlier_pct

    return pd.DataFrame.from_dict(features,orient='index').rename({0: 'Outlier percent'}, axis=1)
        
@register_dataframe_method
def drop_row_outlier(df, cols, inplace=False):
    """Drop the rows that are IQR outliers in at least one of ``cols``.

    For every column in ``cols`` the fences come from ``df[col].IQR_range()``;
    a row is dropped when its value is strictly above the upper fence or
    strictly below the lower fence. ``inplace`` is forwarded to
    ``DataFrame.drop``, whose return value is returned as is.
    """
    # Start with "no row is an outlier", aligned on the frame's index.
    outlier_mask = pd.Series(np.zeros(len(df), dtype=bool), index=df.index)

    for name in cols:
        lower, upper = df[name].IQR_range()
        outlier_mask |= df[name].gt(upper) | df[name].lt(lower)

    labels_to_drop = df[outlier_mask].index
    return df.drop(labels_to_drop, inplace=inplace)

@register_series_method
def drop_row_outlier(df, inplace=False):
    """Drop the entries of a Series that fall outside its IQR fences.

    The fences come from ``df.IQR_range()``; entries strictly above the upper
    fence or strictly below the lower fence are dropped. ``inplace`` is
    forwarded to ``Series.drop``, whose return value is returned as is.
    """
    lower, upper = df.IQR_range()
    is_outlier = df.gt(upper) | df.lt(lower)
    labels_to_drop = df[is_outlier].index
    return df.drop(labels_to_drop, inplace=inplace)

@register_dataframe_method
def count_dtypes(df, ascending=False):
    """Return how many columns of ``df`` have each dtype.

    The result is indexed by dtype with a single column ``Count``, ordered by
    count in the direction given by ``ascending``.
    """
    dtype_counts = df.dtypes.value_counts(ascending=ascending)
    # Name the column explicitly: the default name of a value_counts result
    # differs between pandas versions, so renaming column 0 is not reliable.
    return dtype_counts.to_frame(name="Count")

@register_dataframe_method
def about(df):
    """Display a quick overview of ``df`` in the notebook.

    Shows, each under a green HTML heading: the shape, the dtype counts, the
    dtype of every column, the missing-value table (``missing``), the
    transposed ``describe()``, the IQR outlier percentages (``IQR_percent``)
    and the first rows. Sections with nothing to report show "None".
    Returns nothing; all output goes through ``display``/``print``.
    """
    display(HTML('<h1 style="color:green"> <b> Shape of data </b> </h1>'))
    print(df.shape)

    display(HTML('<h1 style="color:green"> <b> Datatypes in data </b> </h1> '))
    print(df.dtypes.value_counts(ascending=False))

    display(HTML('<h1 style="color:green"> <b> dtypes of columns </b> </h1> '))
    display(df.dtypes_of_cols())

    display(HTML('<h1 style="color:green"> <b> Percentage of missing values </b> </h1> '))
    tmp = missing(df)
    if len(tmp) != 0:
        display(tmp)
    else:
        # The bold tag is now closed (it was opened twice before).
        display(HTML("<h2> <b> None </b> </h2>"))

    display(HTML('<h1 style="color:green"> <b> Data description </b> </h1> '))
    display(df.describe().T)

    display(HTML('<h1 style="color:green"> <b> Outlier Percentage(IQR) </b> </h1> '))
    tmp = df.IQR_percent()
    if len(tmp) != 0:
        display(tmp)
    else:
        display(HTML("<h2> <b> None </b> </h2>"))

    display(HTML('<h1 style="color:green"> <b> Example of data </b> </h1> '))
    display(df.head())

In [1]:
def display_multiple_tables(table_list):
    """Lay several tables out side by side in one HTML table row.

    Every element of ``table_list`` must provide ``_repr_html_()``; its markup
    is wrapped in its own ``<td>`` cell. Returns an IPython ``HTML`` object.
    """
    cells = []
    for table in table_list:
        cells.append('<td>' + table._repr_html_() + '</td>')

    markup = '<table><tr style="background-color:white;">' + ''.join(cells) + '</tr></table>'
    return HTML(markup)

# Custom methods for plotting

In [1]:
# Global seaborn styling for every figure below: dark grid, fonts scaled by 1.3.
sns.set(style="darkgrid",font_scale=1.3)
# Default resolution of matplotlib figures.
plt.rcParams['figure.dpi']=100


from matplotlib.ticker import MaxNLocator

def srt_reg(y, df,x_size=20,y_size=20,*args,**kwargs):
    """Draw one regression scatter plot per column of ``df`` against ``y``.

    The plots are laid out on a grid with 3 columns. Each panel is a
    ``sns.regplot`` with a third-order polynomial fit and no confidence band.

    Parameters
    ----------
    y : passed to ``sns.regplot`` as ``y=`` together with ``data=df``.
    df : DataFrame whose columns are plotted on the x axis, one per panel.
    x_size, y_size : figure width and height passed to ``figsize``.
    *args, **kwargs : forwarded to ``sns.regplot``.
    """
    ncols = 3
    nrows = int(np.ceil(df.shape[1]/ncols))

    fig, axes = plt.subplots(nrows, ncols, figsize=(x_size,y_size))
    axes = axes.flatten()

    for column, ax in zip(df.columns, axes):
        sns.regplot(x=column,
                    y=y,
                    data=df,
                    ax=ax,
                    order=3,
                    ci=None,
                    color='#e74c3c',
                    line_kws={'color': 'black'},
                    scatter_kws={'alpha':0.4},
                   *args,**kwargs)
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=10))

    # Hide the grid cells left over when the column count is not a multiple of 3.
    for unused_ax in axes[df.shape[1]:]:
        unused_ax.set_visible(False)

    # Lay the figure out once, after every panel has been drawn.
    fig.tight_layout()

def srt_box(y, df,*args,**kwargs):
    """Draw one box plot of ``y`` per column of ``df``, categories sorted by median.

    The plots are laid out on a grid with 3 columns and as many rows as the
    number of columns of ``df`` requires, on a 30x30 figure. Within each
    panel the categories are ordered by the median of ``y``, largest first.

    Parameters
    ----------
    y : name of the column of ``df`` shown on the y axis.
    df : DataFrame whose columns are used as the x-axis grouping, one per panel.
    *args, **kwargs : forwarded to ``sns.boxplot``.
    """
    ncols = 3
    # Size the grid from the data rather than a fixed 19 rows, so no column is
    # silently dropped and small frames do not produce dozens of empty panels.
    nrows = max(1, int(np.ceil(df.shape[1] / ncols)))

    fig, axes = plt.subplots(nrows, ncols, figsize=(30,30))
    axes = axes.flatten()

    for column, ax in zip(df.columns, axes):
        sortd = df.groupby([column])[y].median().sort_values(ascending=False)
        sns.boxplot(x=column,
                    y=y,
                    data=df,
                    palette='plasma',
                    order=sortd.index,
                    ax=ax,
                    *args,**kwargs)
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=18))

    # Hide the grid cells that received no plot.
    for unused_ax in axes[df.shape[1]:]:
        unused_ax.set_visible(False)

    # Lay the figure out once, after every panel has been drawn.
    fig.tight_layout()


        
def histplt(df,ncols = 3, x_size=30,y_size=30,*args,**kwargs):
    """Plot histograms with bar labels for a Series or for every column of a DataFrame.

    A one-dimensional input gets a single histogram. A DataFrame gets one
    panel per column on a grid with ``ncols`` columns.

    Parameters
    ----------
    df : Series or DataFrame to plot.
    ncols : number of grid columns (DataFrame input only).
    x_size, y_size : figure width and height passed to ``figsize``.
    *args, **kwargs : forwarded to ``sns.histplot``.
    """
    if len(df.shape) == 1:
        fig, ax = plt.subplots(figsize=(x_size,y_size))
        sns.histplot(x=df,ax=ax,*args,**kwargs)
        for container in ax.containers:
            ax.bar_label(container)
        ax.tick_params(labelrotation=45)
        return

    nrows = int(np.ceil(df.shape[1]/ncols))

    # squeeze=False keeps `axes` an array even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes with no .flatten().
    fig, axes = plt.subplots(nrows, ncols,
                             figsize=(x_size,y_size),
                             squeeze=False)
    axes = axes.flatten()

    for column, ax in zip(df.columns, axes):
        sns.histplot(data=df, x=column,ax=ax,*args,**kwargs)
        ax.tick_params(labelrotation=45)
        for container in ax.containers:
            ax.bar_label(container)

    # Hide the grid cells that received no plot.
    for unused_ax in axes[df.shape[1]:]:
        unused_ax.set_visible(False)

    # Lay the figure out once, after every panel has been drawn.
    fig.tight_layout()
        

def countplt(df,ncols = 3, x_size=30,y_size=30,*args,**kwargs):
    """Plot count plots with bar labels for a Series or for every column of a DataFrame.

    A one-dimensional input gets a single count plot. A DataFrame gets one
    panel per column on a grid with ``ncols`` columns.

    Parameters
    ----------
    df : Series or DataFrame to plot.
    ncols : number of grid columns (DataFrame input only).
    x_size, y_size : figure width and height passed to ``figsize``.
    *args, **kwargs : forwarded to ``sns.countplot``.
    """
    if len(df.shape) == 1:
        fig, ax = plt.subplots(figsize=(x_size,y_size))
        sns.countplot(x=df,ax=ax,*args,**kwargs)
        for container in ax.containers:
            ax.bar_label(container)
        ax.tick_params(labelrotation=45)
        return

    nrows = int(np.ceil(df.shape[1]/ncols))

    # squeeze=False keeps `axes` an array even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes with no .flatten().
    fig, axes = plt.subplots(nrows, ncols,
                             figsize=(x_size,y_size),
                             squeeze=False)
    axes = axes.flatten()

    for column, ax in zip(df.columns, axes):
        sns.countplot(data=df, x=column,ax=ax,*args,**kwargs)
        ax.tick_params(labelrotation=45)
        for container in ax.containers:
            ax.bar_label(container)

    # Hide the grid cells that received no plot.
    for unused_ax in axes[df.shape[1]:]:
        unused_ax.set_visible(False)

    # Lay the figure out once, after every panel has been drawn.
    fig.tight_layout()


    
    
def barplt(df,y,x_size=30,y_size=30,*args,**kwargs):
    """Draw one labelled bar plot of ``y`` per remaining column of ``df``.

    Every column except ``y`` gets a panel on a grid with 3 columns, with
    the column on the x axis and ``y`` on the y axis.

    Parameters
    ----------
    df : DataFrame holding both the features and ``y``.
    y : name of the column shown on the y axis; it gets no panel of its own.
    x_size, y_size : figure width and height passed to ``figsize``.
    *args, **kwargs : forwarded to ``sns.barplot``.
    """
    ncols = 3
    # Drop the target before pairing columns with axes, so skipping it does
    # not leave an empty panel in the middle of the grid.
    features = [column for column in df.columns if column != y]
    nrows = max(1, int(np.ceil(len(features)/ncols)))

    fig, axes = plt.subplots(nrows, ncols,
                             figsize=(x_size,y_size)
                            )
    axes = axes.flatten()

    for column, ax in zip(features, axes):
        sns.barplot(data=df,
                    x=column,
                    y=y,
                    ax=ax,*args,**kwargs)

        ax.tick_params(labelrotation=45)
        for container in ax.containers:
            ax.bar_label(container)

    # Hide the grid cells that received no plot.
    for unused_ax in axes[len(features):]:
        unused_ax.set_visible(False)

    # Lay the figure out once, after every panel has been drawn.
    fig.tight_layout()
    
    
def violinplt(df,y,ncols=3,x_size=30,y_size=30,x_scale = "linear", y_scale = "linear", *args,**kwargs):
    """Draw one violin plot per column of ``df`` other than ``y``.

    When ``df[y]`` has object dtype, ``y`` goes on the x axis and each other
    column on the y axis; that column's IQR outliers (from its ``IQR_range``)
    are overlaid as crimson diamonds at x position 0. Otherwise each other
    column goes on the x axis, ``y`` on the y axis, and the axis scales are
    set from ``x_scale``/``y_scale``.

    Parameters
    ----------
    df : DataFrame holding both the features and ``y``.
    y : name of the reference column; it gets no panel of its own.
    ncols : number of grid columns.
    x_size, y_size : figure width and height passed to ``figsize``.
    x_scale, y_scale : axis scales, applied only when ``df[y]`` is not object dtype.
    *args, **kwargs : forwarded to ``sns.violinplot``.
    """
    # Drop the reference column before pairing columns with axes, so skipping
    # it does not leave an empty panel in the middle of the grid.
    features = [column for column in df.columns if column != y]
    nrows = max(1, int(np.ceil(len(features)/ncols)))

    # squeeze=False keeps `axes` an array even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes with no .flatten().
    fig, axes = plt.subplots(nrows, ncols,
                             figsize=(x_size,y_size),
                             squeeze=False)
    axes = axes.flatten()

    y_is_object = df[y].dtype == 'O'

    for column, ax in zip(features, axes):
        if y_is_object:
            sns.violinplot(data=df,
                        x=y,
                        y=column,
                        ax=ax,*args,**kwargs)

            lower_range, upper_range = df[column].IQR_range()
            outliers = df[(df[column] > upper_range) | (df[column] < lower_range)][column]
            sns.scatterplot(y=outliers, x=0, marker='D', color='crimson', ax=ax)
        else:
            g = sns.violinplot(data=df,
                        x=column,
                        y=y,
                        ax=ax,*args,**kwargs)
            g.set_xscale(x_scale)
            g.set_yscale(y_scale)

        ax.tick_params(labelrotation=45)

    # Hide the grid cells that received no plot.
    for unused_ax in axes[len(features):]:
        unused_ax.set_visible(False)

    # Lay the figure out once, after every panel has been drawn.
    fig.tight_layout()
        
def boxplt(df,y,x_size=30,y_size=30,*args,**kwargs):
    """Draw one box plot per column of ``df`` other than ``y``.

    When ``df[y]`` has object dtype, ``y`` goes on the x axis and each other
    column on the y axis; otherwise each other column goes on the x axis and
    ``y`` on the y axis. Panels are laid out on a grid with 3 columns.

    Parameters
    ----------
    df : DataFrame holding both the features and ``y``.
    y : name of the reference column; it gets no panel of its own.
    x_size, y_size : figure width and height passed to ``figsize``.
    *args, **kwargs : forwarded to ``sns.boxplot``.
    """
    ncols = 3
    # Drop the reference column before pairing columns with axes, so skipping
    # it does not leave an empty panel in the middle of the grid.
    features = [column for column in df.columns if column != y]
    nrows = max(1, int(np.ceil(len(features)/ncols)))

    fig, axes = plt.subplots(nrows, ncols,
                             figsize=(x_size,y_size)
                            )
    axes = axes.flatten()

    y_is_object = df[y].dtype == 'O'

    for column, ax in zip(features, axes):
        if y_is_object:
            sns.boxplot(data=df,
                        x=y,
                        y=column,
                        ax=ax,*args,**kwargs)
        else:
            sns.boxplot(data=df,
                        x=column,
                        y=y,
                        ax=ax,*args,**kwargs)

    # Hide the grid cells that received no plot.
    for unused_ax in axes[len(features):]:
        unused_ax.set_visible(False)

    # Lay the figure out once, after every panel has been drawn.
    fig.tight_layout()


import scipy.stats as stats

def qqplt(df,x_size=30,y_size=30,*args,**kwargs):
    """Draw probability (Q-Q) plots for a Series or for every column of a DataFrame.

    A one-dimensional input gets a single ``stats.probplot`` panel. A
    DataFrame gets one panel per column, titled with the column name, on a
    grid with 3 columns.

    Parameters
    ----------
    df : Series or DataFrame to plot.
    x_size, y_size : figure width and height passed to ``figsize``.
    *args, **kwargs : forwarded to ``stats.probplot``.
    """
    if len(df.shape) == 1:
        fig, ax = plt.subplots(figsize=(x_size,y_size))
        stats.probplot(df,plot=ax, *args,**kwargs)

        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=10))
        return

    ncols = 3
    nrows = int(np.ceil(df.shape[1]/ncols))

    fig, axes = plt.subplots(nrows, ncols, figsize=(x_size,y_size))
    axes = axes.flatten()

    for column, ax in zip(df.columns, axes):
        stats.probplot(df[column],plot=ax, *args,**kwargs)
        ax.set_title(label=column)
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=10))

    # Hide the grid cells left over when the column count is not a multiple of 3.
    for unused_ax in axes[df.shape[1]:]:
        unused_ax.set_visible(False)

    # Lay the figure out once, after every panel has been drawn.
    fig.tight_layout()

# Importing data and initial impressions

In [1]:
# Read the heart-disease CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/heart-failure-prediction/heart.csv")
# Show the custom overview: shape, dtypes, missing values, describe(), IQR outlier share, head().
df.about()

# Exploring data

## Lets get a heads up with pandas profiling

In [1]:

# NOTE(review): recent releases of this package are published as `ydata_profiling` —
# confirm which name the target environment provides.
from pandas_profiling import ProfileReport
# Build an exploratory profiling report for the raw frame.
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)

# Render the report as notebook widgets.
profile.to_widgets()

## Lets look at discrete data

A feature is treated as discrete if its cardinality (number of distinct values) is low relative to the size of the data, irrespective of its datatype.

As seen below `Sex` is discrete because it has `["M","F"]` (dtype=object) and cardinality of 2, same applies to `HeartDisease` even if it has `[0,1]` (dtype = int)

In [1]:
# Columns with fewer than 10 distinct values, with their values and cardinality.
df.describe_discrete_cols(thresold=10)

**There are 7 discrete features as seen above, with respective cardinality**

**lets look at the distribution of each of those**

In [1]:
# Count plot of every discrete column (fewer than 10 distinct values).
countplt(df.discrete_features_df(thresold=10),x_size=40,y_size=20)

### Understanding what the data is about


**Chest pain type**
[source](https://www.heart.org/en/health-topics/heart-attack/angina-chest-pain)

`Angina` is chest pain or discomfort caused when your heart muscle doesn't get enough oxygen-rich blood.
It may feel like pressure or squeezing in your chest. The discomfort also can occur in your shoulders, arms, neck, jaw, upper abdomen or back. Angina pain may even feel like indigestion.

[Angina and atypical Angina](https://www.harringtonhospital.org/typical-and-atypical-angina-what-to-look-for/)

*\"Men commonly have the usual kind of angina..."*

*\"Women may have more of a subtle presentation called atypical angina..."*

[Asymptomatic](https://elitecarehouston.com/silent-heart-attacks-what-do-asymptomatic-signs-of-a-heart-attack-mean/)

*\"Nearly half of all heart attacks have no symptoms at all — but that doesn’t mean they’re any less deadly than heart attacks with symptoms...."*

*\"A lot of studies say that silent heart attacks are more common in women than in men...."*

**Fasting Blood sugar**

[source](https://www.medicalnewstoday.com/articles/317536#blood-sugar-charts)

|                      | Target blood sugar levels for people without diabetes | Target blood sugar levels for people with diabetes |
|----------------------|-------------------------------------------------------|----------------------------------------------------|
| Before meals         | 72–99 mg/dl                                           | 80–130 mg/dl                                       |
| 2 hours after a meal | less than 140 mg/dl                                   | less than 180 mg/dl                                |


from data description we know `[1: if FastingBS > 120 mg/dl, 0: otherwise]`


**Resting ECG levels**
[source](https://elentra.healthsci.queensu.ca/assets/modules/ECG/normal_ecg.html)

`[Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]`

**ExerciseAngina**

as explained earlier, it is Angina caused due to exercise `exercise-induced angina [Y: Yes, N: No]`

### from the above explanation lets try to use group by and plots with respect to HeartDisease

In [1]:
# Same count plots, with the bars split by the HeartDisease value.
countplt(df.discrete_features_df(thresold=10),x_size=40,y_size=20, hue="HeartDisease")

In [1]:
# Discrete columns other than the target itself.
features = df.discrete_features_cols(thresold=10)
features.remove("HeartDisease")

# One table per discrete column: row counts per (column value, HeartDisease) pair,
# with the percentage within the column value and the percentage of all rows.
tables = [compare_cols(df=df, l_feat= [col,"HeartDisease"], r_feat={"HeartDisease":"count"}, percent_of_total=True, percent=True)[0] for col in features]
display_multiple_tables(tables)

In [1]:
# display(*df.compare_cols( ["Sex","ChestPainType","HeartDisease"], {"HeartDisease" :"count"}, percent=True, percent_of_total=True) )


# Chest-pain-type distribution for women without heart disease.
display(*compare_cols( df[ (df["Sex"]=="F") & (df["HeartDisease"]==0) ] , ["Sex","HeartDisease","ChestPainType",], { "ChestPainType":"count"}, percent=True,) )

# Chest-pain-type distribution for women with heart disease.
display(*compare_cols( df[ (df["Sex"]=="F") & (df["HeartDisease"]==1) ] , ["Sex","HeartDisease","ChestPainType",], { "ChestPainType":"count"}, percent=True, ) )

In [1]:
# Chest-pain-type distribution for men without heart disease, then for men with heart disease.
display(*compare_cols( df[ (df["Sex"]=="M") & (df["HeartDisease"]==0) ] , ["Sex","HeartDisease","ChestPainType",], { "ChestPainType":"count"}, percent=True,) )
display(*compare_cols( df[ (df["Sex"]=="M") & (df["HeartDisease"]==1) ] , ["Sex","HeartDisease","ChestPainType",], { "ChestPainType":"count"}, percent=True, ) )

### Observations
**HD=Heart Disease**

- ~75% and ~37% of women and men respectively don't have HD
- ~50% of data comprises of men with HD 

**chest pain type**

- if person has ASY, ~80% of times its HD
- if person has ATA, ~83% of times its NOT HD

**Fasting blood sugar**
- if FastingBS = 1(> 120 mg/dl),  ~80% of times its HD

**RestingECG**
- if RestingECG is ST, ~66% of times its HD

**Exercise induced Angina**
- if Y, ~85% of times its HD
- if N, ~65% of times its NOT HD

**ST_Slope**
- if Down, ~78% of times its HD
- if Flat, ~83% of times its HD
- if Up, ~80% of times its NOT HD


>*\"Men commonly have the usual kind of angina..."*

>*\"Women may have more of a subtle presentation called atypical angina..."*

>*\"A lot of studies say that silent heart attacks are more common in women than in men...."(Asymptomatic)*

**From the above aggregation, none of the statements quoted from the websites is supported by our data: the chest-pain-type distribution is quite similar for men and women.**

## Lets look at continuous features

### Lets look at the distribution

In [1]:
# Columns with at least 10 distinct values are treated as continuous.
df.continuous_features_cols(thresold=10)

In [1]:
# Histogram of every continuous column.
histplt(df.continuous_features_df(thresold=10), x_size=50, y_size=20)

### Handle outliers 

**lets remove outliers for `RestingBP` `Oldpeak`**

In [1]:
# Drop, in place, the rows that are IQR outliers in RestingBP or Oldpeak.
# NOTE(review): this mutates `df`, and the fences are recomputed from the current
# rows, so re-running the cell can remove further rows — confirm this is acceptable.
df.drop_row_outlier(cols = ['RestingBP', 'Oldpeak'], inplace=True)

# Re-plot the continuous columns after the outlier rows were removed.
histplt(df.continuous_features_df(thresold=10), x_size=50, y_size=20)

In [1]:
# Continuous columns plus the target, so the target is available as the hue.
cont_feat = df.continuous_features_cols(thresold=10)
cont_feat.append("HeartDisease")

# Stacked histograms split by the HeartDisease value.
histplt(df = df[cont_feat], x_size=50, y_size=25, hue="HeartDisease", multiple='stack')

# Encoding features

In [1]:
from sklearn.preprocessing import LabelEncoder

In [1]:
# Discrete columns and their values before encoding.
df.describe_discrete_cols(thresold=10)

In [1]:
# Names of the object-dtype columns; these are the ones encoded below.
obj_cols = list(df.select_dtypes(include="object").columns)
obj_cols

In [1]:
# Replace each object column by integer codes; `apply` calls fit_transform once per column.
df[obj_cols] = df[obj_cols].apply(LabelEncoder().fit_transform)

In [1]:
# Discrete columns and their values after encoding.
df.describe_discrete_cols(thresold=10)

# Handling Skew

In [1]:
# Skewness of every continuous column.
df.continuous_features_df(thresold=10).skew()

In [1]:
# Distribution and skewness of Oldpeak before the power transform.
histplt(df["Oldpeak"],y_size=5,x_size=20)
df["Oldpeak"].skew()

In [1]:
# Power transform without the final standardisation step.
pt = PowerTransformer(standardize=False)

# NOTE(review): the transformer is fitted on every row, including rows later used as
# validation folds — confirm whether it should be fitted inside the CV folds instead.
df["Oldpeak"] = pt.fit_transform(df[["Oldpeak"]])

# Distribution and skewness of Oldpeak after the transform.
histplt(df["Oldpeak"],y_size=5,x_size=20)
df["Oldpeak"].skew()

# Building models

In [1]:
# Features are every column except the target.
x = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]

sc = StandardScaler()
# with standardscaler
x_sc = sc.fit_transform(X=x)

# Fixed seed so the random-forest scores are reproducible across runs.
RANDOM_STATE = 42

lr = LogisticRegression(max_iter=1000)
svc= SVC()
knn = KNeighborsClassifier()
# Defined but not part of `models` below, so it is not evaluated.
ridge = RidgeClassifierCV()
rf = RandomForestClassifier(random_state=RANDOM_STATE)

# Candidate classifiers compared with cross-validation, keyed by a short label.
models = {
    'lr':lr,
    'svc':svc,
    'knn':knn,
    'rf':rf
}


## all models without standard scaler

In [1]:
# 10-fold cross-validated recall of every model on the unscaled features.
scores = {}
for name, model in models.items():
    fold_recall = cross_val_score(model, X=x, y=y, scoring="recall", cv=10)
    scores[name] = np.round(fold_recall, 3)

# Per model: mean recall in percent, followed by the standard deviation across folds.
for key, per_fold in scores.items():
    mean_pct = np.round(sum(per_fold) / len(per_fold) * 100, 2)
    std_pct = np.round(np.std(per_fold) * 100, 2)
    print(f'{key : <5} {mean_pct} %', std_pct)


## all models with standard scaler

In [1]:
# 10-fold cross-validated recall of every model with standard scaling.
# The scaler sits inside a pipeline so it is re-fitted on the training part of each
# fold; scaling the whole data set up front leaks validation-fold statistics into
# training.
scores = {}
for name, model in models.items():
    scaled_model = make_pipeline(StandardScaler(), model)
    scores[name] = np.round(cross_val_score(scaled_model,X=x,y=y, scoring="recall", cv = 10), 3)

# Per model: mean recall in percent, followed by the standard deviation across folds.
for key in scores.keys():
    print(f'{key : <5} {np.round(sum(scores[key])/len(scores[key]) * 100, 2)} %', np.round(np.std(scores[key]) * 100, 2))

**clearly standard scaling helps in increasing recall**

**svc and rf have the best recall**

# Best model

**SVC turns out to be the best model with Average Recall value of 89% with 10 fold CV**