In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import os
# Print the full path of every file found anywhere under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the heart-disease CSV into the DataFrame `df` used by the following cells.
df = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')

In [None]:
# Preview five random rows. The fixed seed keeps the displayed sample identical
# every time the notebook is re-run from a fresh kernel.
df.sample(5, random_state=0)

In [None]:
# New imports 
import matplotlib.colors as colors

In [None]:
# Rename 'trestbps' to 'restbp'; rename() returns a new frame, which is bound back to `df`.
df = df.rename(columns={'trestbps': 'restbp'})

## Column Names Full forms

* Age 
* Sex
* cp - Chest Pain
* restbp - Resting Blood Pressure
* chol - serum cholesterol
* fbs - fasting blood sugar
* restecg - resting electrocardiographic results
* thalach - max heart rate achieved
* exang - exercise-induced angina
* oldpeak - ST depression induced by exercise relative to rest
* slope - slope of the peak exercise ST segment
* ca - no. of major vessels coloured by fluoroscopy
* thal - thallium heart scan
* target - diagnosis of heart disease

In [None]:
# Show the column labels of `df`.
df.columns

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

* No column contains null values, so no rows need to be dropped or imputed

In [None]:
# Summary statistics for the numeric columns.
df.describe()

### Dividing the data into Dependent and Independent variables

In [None]:
# The 'target' column is the label vector; every other column is a feature.
# Both are copies, so later edits to them do not touch `df`.
y = df['target'].copy()
x = df.drop('target', axis='columns').copy()

* One hot encoding is required for cp,restecg,slope,ca,thal

In [None]:
# One-hot encode the multi-level categorical columns named below.
x_en = pd.get_dummies(
    data=x,
    columns=['cp', 'restecg', 'slope', 'ca', 'thal'],
)

### Dividing into training and testing sets 

In [None]:
from sklearn.model_selection import train_test_split as tts 
# Hold out 40% of the rows for testing; the fixed random_state makes the split repeatable.
x_train,x_test,y_train,y_test = tts(x_en,y,test_size=0.4,random_state=23)

* The radial basis function we are using along with the support vector machine assumes data is centered and scaled 
* So we need to do this for both training and testing sets
* We split the data and then scale to avoid data leakage 
* Data leakage - information from the testing data influences how the model is trained or preprocessed, which makes the test score look better than it really is

In [None]:
from sklearn.preprocessing import StandardScaler, scale

# Learn the centring/scaling statistics from the training set only and reuse
# them for the test set. Scaling each split with its own mean and standard
# deviation would put the two splits in different feature spaces, so a model
# fitted on one would see shifted inputs on the other.
scaler = StandardScaler().fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)

### Building a preliminary Support Vector Classifier

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# SVC uses the RBF kernel by default, which is sensitive to feature scale, so
# the baseline standardises its inputs. Wrapping scaler + classifier in one
# pipeline means fit() learns the scaling from the training rows only, while
# predict() and score() apply that same scaling to the raw rows they are given.
svc = make_pipeline(StandardScaler(), SVC(random_state=344))
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)

In [None]:
# Mean accuracy of the preliminary classifier on the held-out test rows.
score = svc.score(x_test,y_test)
score

### Plotting a Confusion Matrix

In [None]:
def _confusion_counts(test, predict, n_classes):
    '''
        Count (actual, predicted) pairs into an n_classes x n_classes integer
        array whose rows are actual classes and whose columns are predictions.
    '''
    actual = np.asarray(test).ravel()
    predicted = np.asarray(predict).ravel()
    if actual.shape != predicted.shape:
        raise ValueError('test and predict must have the same length')

    # Class codes are taken to be 0..n_classes-1, in the order of the labels.
    # If the data uses other codes, fall back to the sorted distinct values,
    # which is only unambiguous when every class was actually observed.
    codes = np.arange(n_classes)
    observed = np.unique(np.concatenate([actual, predicted]))
    if not np.isin(observed, codes).all():
        if observed.size != n_classes:
            raise ValueError(
                'cannot map the observed class values onto %d labels' % n_classes)
        codes = observed

    # Index every pair explicitly instead of histogramming: histogram bin edges
    # are derived from each array's own min/max, so a prediction vector holding
    # a single class (e.g. all 0) would be counted in the wrong column.
    counts = np.zeros((n_classes, n_classes), dtype=int)
    rows = np.searchsorted(codes, actual)
    cols = np.searchsorted(codes, predicted)
    np.add.at(counts, (rows, cols), 1)
    return counts


def confusion(test, predict, labels, title='Confusion Matrix'):
    '''
        Draw a confusion-matrix heatmap (rows: actual, columns: predicted).

        test: true label of test data, must be one dimensional
        predict: predicted label of test data, must be one dimensional
        labels: list of label names, ie: ['negative', 'positive'], ordered to
                match the class codes 0, 1, ... (or the sorted class values)
        title: plot title

        Returns None; the heatmap is drawn with seaborn as a side effect.
        Raises ValueError if test and predict differ in length or the observed
        class values cannot be matched to the labels.
    '''
    counts = _confusion_counts(test, predict, len(labels))

    # Label rows and columns so the heatmap ticks show class names
    pd_pts = pd.DataFrame(counts, index=labels, columns=labels)

    # Display heatmap and add decorations
    hm = sns.heatmap(pd_pts, annot=True, fmt="d")
    hm.axes.set_title(title, fontsize=20)
    hm.axes.set_xlabel('Predicted', fontsize=18)
    hm.axes.set_ylabel('Actual', fontsize=18)

    return None

In [None]:
# Plot the confusion matrix of the test labels against the current predictions in y_pred.
confusion(y_test, y_pred, ['Does not have HD', 'Has HD'], title='Support Vector Classifier')

### Optimizing SVC with Cross Validation

In [None]:
# making a parameters grid
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: every C is tried with every gamma, always with
# the RBF (radial basis function) kernel.
param_grid = [
    {
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf'],
    }
]

# 10-fold cross-validated search over the grid, fitted on the scaled training data.
optimal_params = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=10,
    verbose=0,
).fit(x_train_s, y_train)
optimal_params.best_params_

### Substitute the best parameters into the SVC

In [None]:
# Refit the SVC with the tuned hyper-parameters on the scaled training data.
# NOTE(review): C and gamma look hand-copied from the grid search output -
# re-check them against optimal_params.best_params_ after re-running the search.
svc = SVC(random_state=334,C=10,gamma=0.0001,kernel='rbf')
svc.fit(x_train_s,y_train)
# The model was fitted on scaled features, so it must be evaluated on the
# scaled test features too; the raw test features are in a different space.
y_pred = svc.predict(x_test_s)
score = svc.score(x_test_s,y_test)
score

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest for comparison with the SVC. The seed makes the reported score
# reproducible, and the model is scored on the scaled test features because it
# was fitted on the scaled training features.
rfc = RandomForestClassifier(random_state=334)
rfc.fit(x_train_s, y_train)
score = rfc.score(x_test_s, y_test)
score