# Name: Jay Shah
# Date: 09-07-2021
# Heart Disease Analysis

###  Importing the basic libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
from matplotlib.cm import rainbow

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import classification_report

### Reading the data from CSV file

In [None]:
# Load the UCI heart-disease table from the Kaggle input directory and
# display it as the cell output.
heart_csv_path = '/kaggle/input/heart-disease-uci/heart.csv'
data = pd.read_csv(heart_csv_path)
data

### Describing the data

In [None]:
# Show pandas' summary statistics for the columns of the loaded table.
data.describe()

In [None]:
# Print pandas' structural overview of the table (columns, dtypes, non-null counts).
data.info()

### Visualizing the correlation among columns present in the dataset

In [None]:
# Draw the pairwise column correlations as a labelled heat-map.
rcParams['figure.figsize'] = (20, 14)
correlation_matrix = data.corr()
tick_positions = np.arange(data.shape[1])
plt.matshow(correlation_matrix)
# One tick per column on both axes, labelled with the column name.
plt.xticks(tick_positions, data.columns)
plt.yticks(tick_positions, data.columns)
plt.colorbar()

### Visualizing the count of each target class in order to check whether the dataset is imbalanced or not

In [None]:
# Bar chart of how many rows belong to each target class.
rcParams['figure.figsize'] = 6, 4
# Take the bar positions and the bar heights from the same value_counts()
# Series. unique() lists classes in order of first appearance while
# value_counts() sorts by frequency, so pairing the two can attach a count
# to the wrong class.
target_counts = data['target'].value_counts()
plt.bar(target_counts.index, target_counts.values, color=['blue', 'black'])
plt.xticks([0, 1])
plt.xlabel('Target Classes')
plt.ylabel('Count')
plt.title('Count of each Target Class')

### Calculation of unique values in each column present in the dataset.

In [None]:
# Report the number of distinct values held by every column.
for column_name in data.columns:
    distinct_count = data[column_name].nunique()
    print("Total unique values in ", column_name, " column are :", distinct_count)

### Converting the categorical columns to dummy variables

In [None]:
# Replace each categorical column with one indicator column per category.
categorical_columns = [
    'sex', 'cp', 'fbs', 'restecg',
    'exang', 'slope', 'ca', 'thal',
]
data = pd.get_dummies(data, columns=categorical_columns)
data

### Standardizing the columns whose values are large, so that they are rescaled to zero mean and unit variance and do not dominate the other features.

In [None]:
# Standardize the continuous columns in place; the indicator columns are left
# untouched.
# NOTE(review): the scaler is fitted on the full table, i.e. before the
# train/test split below, so test rows influence the scaling statistics.
cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data[cols])
data[cols] = standardized_values
data

### Splitting into training and testing set

In [None]:
# Separate the label from the features, then hold out 33% of the rows for
# testing with a fixed seed so the split is repeatable.
y = data['target']
X = data.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Report how many rows ended up in each split.
n_train_rows = len(X_train)
n_test_rows = len(X_test)
print("Total training examples are: ", n_train_rows)
print("Total testing examples are: ", n_test_rows)

# Support Vector Classifier
### In the block below, SVC is trained with each of the kernels 'rbf', 'linear', 'poly' and 'sigmoid', and the score of every kernel is calculated. The best-scoring kernel is selected for the classification report that follows.

In [None]:
# Train one SVC per kernel and record its accuracy (in percent) on the
# held-out split.
# NOTE(review): the kernel is chosen using the test split, so the report
# printed for the chosen kernel is an optimistic estimate.
svc_scores = []
kernel = ['rbf', 'linear', 'poly', 'sigmoid']

for kernel_name in kernel:
    model_svc = SVC(kernel=kernel_name)
    model_svc.fit(X_train, y_train)
    svc_scores.append(model_svc.score(X_test, y_test) * 100)

# The maximum only needs to be computed once, after every kernel is scored.
ans = max(svc_scores)
print("Maximum Score by using various kernels of SVM model in percentage is:", ans)

# list.index returns the first position holding the maximum. The previous
# hand-rolled counter stopped advancing after a match, so tied best scores
# could select a kernel that did not achieve the maximum.
main_kernel = kernel[svc_scores.index(ans)]

In [None]:
# Bar chart of the accuracy reached by each kernel, with the value written
# at the top of every bar.
colors = rainbow(np.linspace(0, 1, len(kernel)))
plt.bar(kernel, svc_scores, color=colors)
for bar_position, bar_score in enumerate(svc_scores):
    plt.text(bar_position, bar_score, bar_score)
plt.ylabel('Scores')
plt.xlabel('Kernels')
plt.title('Support Vector Classifier scores for different kernels')

In [None]:
# Refit an SVC with the selected kernel and print its per-class metrics on
# the test split.
target_names = ['class-0', 'class-1']
model_svc = SVC(kernel=main_kernel)
model_svc.fit(X_train, y_train)
y_pred = model_svc.predict(X_test)
svc_report = classification_report(y_test, y_pred, target_names=target_names)
print(svc_report)

# Decision Tree Classifier
### In the block of code below, the Decision Tree Classifier is trained with a range of values for some of its hyperparameters in order to obtain a good score. The best hyperparameters are selected and then used for the classification report.

In [None]:
# Train one decision tree per candidate max_features value (1 .. number of
# feature columns) and record its accuracy (in percent) on the held-out split.
# NOTE(review): max_features is chosen using the test split, so the report
# printed for the chosen value is an optimistic estimate.
dtc_scores = []
for n_features in range(1, len(X.columns) + 1):
    dtc_model = dtc(splitter='random', max_depth=8, max_features=n_features, random_state=0)
    dtc_model.fit(X_train, y_train)
    dtc_scores.append(dtc_model.score(X_test, y_test) * 100)

# The maximum only needs to be computed once, after every candidate is scored.
ans = max(dtc_scores)
print("Maximum Score by using various features of Decision Tree model in percentage is:", ans)

# Scores are stored for max_features = 1, 2, ..., so position p holds the
# score of max_features = p + 1. list.index picks the first maximum; the
# previous hand-rolled counter stopped advancing after a match and could
# return a max_features value whose score was not the maximum when ties
# occurred.
max_features = dtc_scores.index(ans) + 1

In [None]:
# Line plot of accuracy against max_features, annotating every point with
# its (max_features, score) pair.
feature_counts = list(range(1, len(X.columns) + 1))
plt.plot(feature_counts, dtc_scores, color='red')
for n_features, tree_score in zip(feature_counts, dtc_scores):
    plt.text(n_features, tree_score, (n_features, tree_score))
plt.xticks(feature_counts)
plt.ylabel('Scores')
plt.xlabel('Maximum features')
plt.title('Decision Tree Classifier scores for different number of maximum features')

In [None]:
# Refit the decision tree with the selected max_features and print its
# per-class metrics on the test split.
target_names = ['class-0', 'class-1']
dtc_model = dtc(
    splitter='random',
    max_depth=8,
    max_features=max_features,
    random_state=0,
)
dtc_model.fit(X_train, y_train)
y_pred = dtc_model.predict(X_test)
dtc_report = classification_report(y_test, y_pred, target_names=target_names)
print(dtc_report)

In [None]:
# Grid Search CV for finding best hyper-parameters for Decision Tree Classifier
# The tree is seeded like every other tree in this notebook so that the
# selected hyper-parameters and the reported score are reproducible.
dt = dtc(random_state=0)
pipe = Pipeline(steps=[('decisiontree', dt)])
criterion = ['gini', 'entropy']
splitter = ['best', 'random']
max_depth = [4, 6, 8, 12]
# Named *_grid on purpose: `max_features` already holds the single best value
# picked by the sweep above, and overwriting it with a list would break a
# re-run of the cell that builds the tree from it.
max_features_grid = list(range(1, len(X.columns) + 1))
params = dict(decisiontree__criterion=criterion,
              decisiontree__splitter=splitter,
              decisiontree__max_depth=max_depth,
              decisiontree__max_features=max_features_grid)
clf = GridSearchCV(pipe, params)
clf.fit(X_train, y_train)

# best_params_ holds the winning value for every key of the searched grid.
best_params = clf.best_params_
print('Best Criterion:', best_params['decisiontree__criterion'])
print('Best Splitter:', best_params['decisiontree__splitter'])
print('Best Depth:', best_params['decisiontree__max_depth'])
print('Best Maximum Features:', best_params['decisiontree__max_features'])
print(clf.best_estimator_.get_params()['decisiontree'])

# Accuracy (in percent) of the best pipeline on the held-out split.
ans = clf.score(X_test, y_test) * 100
print("Best score using GridSearchCV on Decision Tree Model is:", ans)

# Random Forest Classifier
### In the block of code below, the Random Forest Classifier is trained with various numbers of estimators. Each setting is scored, and the number of estimators that gives the best score is selected for the classification report.

In [None]:
# Train one random forest per candidate forest size and record its accuracy
# (in percent) on the held-out split.
n_estimators = [10, 50, 100, 150, 200, 300, 400, 450, 500, 950, 1000]
rfc_scores = []
for tree_count in n_estimators:
    model_rfc = rfc(n_estimators=tree_count, random_state=0)
    model_rfc.fit(X_train, y_train)
    rfc_scores.append(model_rfc.score(X_test, y_test) * 100)

ans = max(rfc_scores)
print("Maximum Score by using various number of estimators of Random Forest model in percentage is: ",ans)

# Keep the first (smallest) forest size that reaches the best score.
best_estimator = n_estimators[rfc_scores.index(ans)]

In [None]:
# Bar chart of accuracy per forest size. Bars are placed at evenly spaced
# positions and labelled with the corresponding number of estimators.
bar_positions = list(range(len(n_estimators)))
colors = rainbow(np.linspace(0, 1, len(n_estimators)))
plt.bar(bar_positions, rfc_scores, color=colors, width=0.8)
for bar_position in bar_positions:
    forest_score = rfc_scores[bar_position]
    plt.text(bar_position, forest_score, forest_score)
plt.xticks(ticks=bar_positions, labels=list(map(str, n_estimators)))
plt.ylabel('Scores')
plt.xlabel('Number of estimators')
plt.title('Random Forest Classifier scores for different number of estimators')

In [None]:
# Refit the random forest with the selected number of estimators and print
# its per-class metrics on the test split.
target_names = ['class-0', 'class-1']
model_rfc = rfc(n_estimators=best_estimator, random_state=0)
model_rfc.fit(X_train, y_train)
y_pred = model_rfc.predict(X_test)
rfc_report = classification_report(y_test, y_pred, target_names=target_names)
print(rfc_report)

# Grid Search CV for SVM and Random Forest
### GridSearchCV is used here to select the best parameters for both the SVM and Random Forest models, and the score on the test set is then calculated with those best parameters.

In [None]:
# Search space for the grid search below: one entry per algorithm, holding
# the estimator to tune and its parameter grid. Grid keys are prefixed with
# the step names that make_pipeline derives from the estimator class names.
model_params = {
    'svm': {
        'model': SVC(gamma='auto', probability=True),
        'params': {
            'svc__C': [1, 10, 100, 1000],
            'svc__kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
        },
    },
    'random_forest': {
        # Seeded like every other forest in this notebook; an unseeded forest
        # makes the cross-validation table and the final test score change
        # from run to run.
        'model': rfc(random_state=0),
        'params': {
            'randomforestclassifier__n_estimators': [10, 50, 100, 150, 200, 300, 400, 450, 500, 950, 1000],
        },
    },
}

In [None]:
# Run a 5-fold grid search for every configured model on the training split,
# recording its best cross-validation score and parameters and keeping the
# best pipeline for later evaluation.
best_estimators = {}
scores = []
for algo in model_params:
    mp = model_params[algo]
    pipe = make_pipeline(StandardScaler(), mp['model'])
    clf = GridSearchCV(pipe, mp['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    best_estimators[algo] = clf.best_estimator_
    summary_row = {
        'model': algo,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(summary_row)

# Tabulate the per-model results and display them as the cell output.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

In [None]:
# Report the test-split accuracy (in percent) of each tuned pipeline.
tuned_model_labels = (
    ("Scores after applying GridSearchCV on SVM model: ", 'svm'),
    ("Scores after applying GridSearchCV on Random Forest model: ", 'random_forest'),
)
for score_label, model_key in tuned_model_labels:
    print(score_label, best_estimators[model_key].score(X_test, y_test) * 100)