## Loading the required libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score,train_test_split,RepeatedKFold,KFold,GridSearchCV
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,mean_squared_error,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Creating instances of the classes we will need later

In [None]:
# Shared preprocessing / model instances reused by the cells below.
# The stochastic learners get a fixed random_state (the same value used for the
# train/test split) so that "Restart & Run All" reproduces the reported numbers.
ss=StandardScaler()
svm=SVC()
knn=KNeighborsClassifier()
rf=RandomForestClassifier(random_state=42)
ad=AdaBoostClassifier(random_state=42)
# NOTE(review): `xg` is instantiated but not used by any cell visible in this notebook.
xg=XGBClassifier(random_state=42)

1. ### age      >>   age
1. ### sex      >>   sex
1. ### cp       >>   chest pain type (4 values)
1. ### trestbps >>   resting blood pressure
1. ### chol     >>   serum cholesterol in mg/dl
1. ### fbs      >>   fasting blood sugar > 120 mg/dl
1. ### restecg  >>   resting electrocardiographic results (values 0,1,2)
1. ### thalach  >>   maximum heart rate achieved
1. ### exang    >>   exercise induced angina
1. ### oldpeak  >>   ST depression induced by exercise relative to rest
1. ### slope    >>   the slope of the peak exercise ST segment
1. ### ca       >>   number of major vessels (0-3) colored by fluoroscopy
1. ### thal     >>   3 = normal; 6 = fixed defect; 7 = reversible defect
1. ### target   >>   0 indicates healthy and 1 indicates illness

## Here, we split the dataset into two sets, i.e. a train set and a test set.

In [None]:
# Load the heart-disease table: every column except the last is a feature, the last one is the label.
dt = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')
x, y = dt.iloc[:, :-1], dt.iloc[:, -1]

# Split the dataset into train and test sets in an 80/20 ratio (fixed seed) for evaluating model performance.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Renumber the rows from 0 so that the positional indices produced by the
# K-fold splitters further down can be used to index the labels directly.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Hold-out data kept as NumPy arrays for the final evaluation cells.
final_X_test = X_test.to_numpy()
final_y_test = y_test.to_numpy()

# From this point on `x` / `y` refer to the TRAINING portion only.
x, y = X_train, y_train


## Next, we perform feature selection on the dataset to find the best features for our classification task, based on their scores, using sklearn's SelectKBest class. To decide the value of k, we plot the mean squared error curve for different values of k. This is done on the training set only, with 10-fold validation, using K-Nearest Neighbors as the base model for the final scores.

In [None]:

# 10-fold cross-validation (unshuffled) used to score every candidate feature count.
kf=KFold(n_splits=10)

final_error={}  # k -> mean squared error of KNN, averaged over the 10 folds
indices={}      # k -> column positions kept by SelectKBest(chi2, k)

k=x.shape[1]
for i in range(1,k+1):
    # NOTE(review): the selector is fitted on the whole training set before the folds
    # are formed, so each fold's validation rows take part in choosing the features.
    sk=SelectKBest(chi2, k=i)
    data_temp=sk.fit_transform(x, y)
    indices[i]=list(sk.get_support(True))

    error=[]
    for train_index,test_index in kf.split(data_temp):
        # Fold-local names: the hold-out split variables (X_train, X_test, ...) are left untouched.
        # `.iloc` selects labels by position, matching the positional indices KFold yields.
        fold_X_train,fold_y_train=data_temp[train_index],y.iloc[train_index]
        fold_X_val,fold_y_val=data_temp[test_index],y.iloc[test_index]
        # The scaler is fitted on the training fold only and then applied to the validation fold.
        fold_X_train=ss.fit_transform(fold_X_train)
        fold_X_val=ss.transform(fold_X_val)
        knn.fit(fold_X_train,fold_y_train)
        error.append(mean_squared_error(fold_y_val,knn.predict(fold_X_val)))
    # Average over ALL folds (previously only the first fold's error was recorded,
    # which discarded nine tenths of the cross-validation).
    final_error[i]=np.mean(error)

In [None]:
# Row belonging to the smallest recorded error. Index lists shorter than the longest one
# are NaN-padded by the DataFrame constructor (which also turns the values into floats),
# so drop the padding and restore integer column positions before printing.
best_row=pd.DataFrame(indices.values(),index=final_error.values()).sort_index().iloc[0,:]
print('Optimal features are: ',best_row.dropna().values.astype(int))

# Error as a function of the number of selected features.
fig,ax=plt.subplots()
ax.plot(list(final_error.keys()),list(final_error.values()),marker='o')
ax.set_xlabel('Number of selected features (k)')
ax.set_ylabel('Mean squared error (KNN)')
ax.set_title('Feature-count selection with SelectKBest(chi2)')
plt.show()

In [None]:
# One row per candidate k, indexed by its recorded error; sorting by that index
# puts the lowest-error feature set first.
ranked=pd.DataFrame(indices.values(),index=final_error.values()).sort_index()
temp=ranked.iloc[0,:]
# Shorter feature lists are NaN-padded in the frame: discard the padding and go back to integers.
ind=list(temp.dropna().values.astype(int))


## Using only the selected columns from the given dataset. 

In [None]:
# Training features restricted to the selected columns, as a plain NumPy array.
xcopy=x.iloc[:,ind].to_numpy()
# Same column subset for the hold-out set.
# NOTE: this rebinds `final_X_test`, so the cell is not idempotent - re-run the
# train/test split cell before running this one a second time.
final_X_test=final_X_test[:,ind]

## Now we train several models (SVM, K-Nearest Neighbors, Random Forest, AdaBoost) on the selected features, using the training set only. We use the RepeatedKFold class to split the training set into 10 folds and repeat the whole evaluation 15 times. After each split the data is standardized with StandardScaler, and accuracy is used as the performance metric to evaluate the models.

In [None]:
# Baseline (untuned) models evaluated with 15 repetitions of 10-fold cross-validation
# on the training set. `result` receives one row per fold: [svm, knn, rf, ad] accuracy.
models=[svm,knn,rf,ad]
# Fixed seed so the 150 splits - and therefore the reported accuracies - are reproducible.
rkf=RepeatedKFold(n_splits=10,n_repeats=15,random_state=42)
result=[]
for train_index,test_index in rkf.split(xcopy):
    # `.iloc` selects labels by position, matching the positional indices the splitter yields.
    X_TRAIN,y_TRAIN=xcopy[train_index],y.iloc[train_index]
    X_TEST,y_TEST=xcopy[test_index],y.iloc[test_index]
    # Scaler statistics come from the training fold only.
    X_TRAIN=ss.fit_transform(X_TRAIN)
    X_TEST=ss.transform(X_TEST)
    # Estimator.fit returns the fitted estimator, so fit and predict can be chained.
    result.append([accuracy_score(y_TEST,model.fit(X_TRAIN,y_TRAIN).predict(X_TEST)) for model in models])
        

## The obtained accuracy of the different models in each of the 15 repetitions is shown in the DataFrame below:

In [None]:
# One row per fold, one column per model.
result_frame=pd.DataFrame(result)

# Average every consecutive group of 10 rows, i.e. one full pass over the 10 folds.
# NOTE: `result` is rebound from the raw list to the averaged frame, so re-running
# this cell requires re-running the cross-validation cell first.
per_repeat=[result_frame.iloc[start:start+10,:].mean() for start in range(0,result_frame.shape[0],10)]
result=pd.DataFrame(per_repeat)
result.columns=['svm','knn','rf','ad']
result

In [None]:
 # Column-wise mean of `result`: one average accuracy per model over all repetitions.
 print("The mean result of the 15 iterations are: ",result.mean())


## Now I am checking the model performance on the test set that we created above. The result may be biased, because the distribution of the test set may differ from that of the training set, or this test set may be easier or harder for the models to classify. I am doing this to avoid any data leakage and to get the full classification report on a held-out test set.

In [None]:
# Fit the scaler on the full (feature-selected) training set and apply it to the hold-out set.
x_transformed=ss.fit_transform(xcopy)
transformed_final_X_test=ss.transform(final_X_test)

# The target column is binary (0 / 1), so the second class is 'class 1'
# (it was previously mislabelled 'class 2' in the report).
target=['class 0', 'class 1']

# Train every model on the whole training set and print its hold-out classification report.
for model in models:
    model.fit(x_transformed,y)
    print('for {} the results are: '.format(model))
    print('\n')
    print(classification_report(final_y_test,model.predict(transformed_final_X_test),target_names=target))
    print('\n')
    print('\n')



## Above are the classification reports of each classifier. We don't need to compute accuracy, precision, recall and F1 score separately; we can get them all in one go using the `classification_report` function used above. Isn't this cool!

## Now we will do some hyperparameter tuning of the models to check how much it impacts their performance. I am using GridSearchCV for this; you can also use other methods.

### 1. SVM

In [None]:
# Exhaustive search over kernel, C and gamma for the SVM, using GridSearchCV's
# default cross-validation on the scaled training set.
parameters = {
    'kernel': ['rbf', 'poly', 'linear'],
    'C': [0.1, 0.5, 1, 5, 10, 100],
    'gamma': ('scale', 'auto'),
}
clf = GridSearchCV(estimator=svm, param_grid=parameters, refit=True, verbose=1)
clf.fit(x_transformed, y)

# Names of the result columns recorded by the search.
sorted(clf.cv_results_)


In [None]:
# Best SVM hyperparameters from the grid search above; stored under its own name
# because `clf` is rebound by the next search. Used later to build the tuned SVC.
svm_param=clf.best_params_
svm_param

## 2. Random Forest  

In [None]:
# Candidate values for each random-forest hyperparameter
# (evenly spaced values truncated to integers).
n_estimators = list(map(int, np.linspace(start=5, stop=50, num=10)))
max_depth = list(map(int, np.linspace(1, 50, num=10)))
# `None` (unlimited depth) is deliberately not part of the max_depth candidates.
min_samples_split = [2, 5, 8]
min_samples_leaf = [1, 2, 6]

parameters = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# Exhaustive search with GridSearchCV's default cross-validation on the scaled training set.
clf = GridSearchCV(estimator=rf, param_grid=parameters, refit=True, verbose=1)
clf.fit(x_transformed, y)

# Names of the result columns recorded by the search.
sorted(clf.cv_results_)


In [None]:
# Best random-forest hyperparameters from the grid search above; stored under its own
# name because `clf` is rebound by the next search. Used later to build the tuned forest.
rf_param=clf.best_params_
rf_param

## 3. KNN


In [None]:
# Fresh, untuned KNN estimator to search over.
knn = KNeighborsClassifier()

# Candidate values for each KNN hyperparameter.
leaf_size = list(range(1, 40, 4))
n_neighbors = list(range(1, 30, 3))
p = [1, 2, 4]
# NOTE: `weights` is defined but is NOT part of the grid below, so it is never searched.
weights = ['uniform', 'distance']
parameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

# Exhaustive search with GridSearchCV's default cross-validation on the scaled training set.
clf = GridSearchCV(estimator=knn, param_grid=parameters, refit=True, verbose=1)
clf.fit(x_transformed, y)

# Names of the result columns recorded by the search.
sorted(clf.cv_results_)



In [None]:
# Best KNN hyperparameters from the grid search above; stored under its own name
# because `clf` is rebound by the next search. Used later to build the tuned KNN.
knn_param=clf.best_params_
knn_param

## 4. AdaBoost

In [None]:
# AdaBoostClassifier()
n_estimators=list(range(1,40,5))
learning_rate=[0.01,0.1,0.6,1]
parameters={'n_estimators':n_estimators,'learning_rate':learning_rate}
clf=GridSearchCV(ad,param_grid=parameters,refit=True,verbose=1)
clf.fit(x_transformed,y)

sorted(clf.cv_results_.keys())


In [None]:
# Best AdaBoost hyperparameters from the grid search above.
# Used later to build the tuned AdaBoost classifier.
ad_param=clf.best_params_
ad_param

## Now, using these hyperparameters, I will test the models on the test set again, as I did above, to check whether the performance improved or not.

In [None]:
# Rebuild every model from the best hyperparameters found by its grid search.
# Each *_param dict holds exactly the keys of the corresponding search grid, so it can be
# unpacked straight into the constructor.
svm=SVC(**svm_param)
knn=KNeighborsClassifier(**knn_param)
# Fixed random_state so the tuned ensemble results are reproducible between runs.
rf=RandomForestClassifier(random_state=42,**rf_param)
ad=AdaBoostClassifier(random_state=42,**ad_param)

models=[svm,knn,rf,ad]

# The target column is binary (0 / 1), so the second class is 'class 1'
# (it was previously mislabelled 'class 2' in the report).
target=['class 0', 'class 1']

# Train the tuned models on the whole scaled training set and report on the hold-out set.
for model in models:
    model.fit(x_transformed,y)
    print('for {} the results are: '.format(model))
    print('\n')
    print(classification_report(final_y_test,model.predict(transformed_final_X_test),target_names=target))
    print('\n')
    print('\n')


## Shown above are the classification reports of each classifier trained on the transformed train set and tested on the test set.

## Here I am again running tests on the train set with 10-fold validation to cross-check the results.

In [None]:

# Tuned models evaluated with 15 repetitions of 10-fold cross-validation on the training set.
# Fixed seed so the splits - and therefore the reported accuracies - are reproducible.
rkf=RepeatedKFold(n_splits=10,n_repeats=15,random_state=42)
result=[]
for train_index,test_index in rkf.split(xcopy):
    # `.iloc` selects labels by position, matching the positional indices the splitter yields.
    X_TRAIN,y_TRAIN=xcopy[train_index],y.iloc[train_index]
    X_TEST,y_TEST=xcopy[test_index],y.iloc[test_index]
    # Scaler statistics come from the training fold only.
    X_TRAIN=ss.fit_transform(X_TRAIN)
    X_TEST=ss.transform(X_TEST)
    # Estimator.fit returns the fitted estimator, so fit and predict can be chained.
    result.append([accuracy_score(y_TEST,model.fit(X_TRAIN,y_TRAIN).predict(X_TEST)) for model in models])
# One row per fold, one column per model (order follows `models`).
result_frame=pd.DataFrame(result)

In [None]:
# Average every consecutive group of 10 rows (one full pass over the 10 folds),
# then average those per-repetition means to get a single accuracy per model.
per_repeat=[result_frame.iloc[start:start+10,:].mean() for start in range(0,result_frame.shape[0],10)]
result=pd.DataFrame(per_repeat)
result.columns=['svm','knn','rf','ad']
result.mean()

## Finally, we can see that in the test-set evaluation the accuracy of all classifiers besides SVM increased, but in the 10-fold validation on the training data only the accuracy of the AdaBoost classifier increased.

## Just vary the random state value while splitting the dataset into train/test set and see the results before and after tuning.