In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime as dt
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Column labels for the CSV, in file order. Supplying `names` means pandas
# reads the first line of the file as data rather than as a header row.
colnames = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]
df = pd.read_csv('heart_disease.csv', names=colnames, header=None)

In [None]:
# Preview the first rows of the raw frame.
df.head()

We can see here that there are some '?' characters. We need to eliminate them.

In [None]:
# Show each column's dtype and non-null count for the raw frame.
df.info()

There are 200 rows and no null values (missing entries are encoded as '?', so they are not counted as null). The object-type columns need to be converted to int or float.

In [None]:
# Number of cells per column that hold the '?' placeholder.
(df == '?').sum()

In [None]:
# Summary statistics of the frame (pandas reports only the numeric columns
# by default when the frame has mixed dtypes).
df.describe()

# Data Visualization 

In [None]:
# Bar chart of the number of records in each `sex` category.
gender_ax = sns.countplot(data=df, x='sex')
gender_ax.set_title("Gender Ratio")


In [None]:
# Histogram with a density curve for the `age` column.
age_ax = sns.distplot(df['age'])
age_ax.set_title("Age Distribution")

In [None]:
# Bar chart of the number of records for each chest-pain code in `cp`.
chest_pain_ax = sns.countplot(data=df, x='cp')
chest_pain_ax.set_title("Chest Pain type")

In [None]:
# Heatmap of the boolean null mask (True where a cell is NaN).
# NOTE(review): '?' placeholders are strings, not NaN, so they are not
# flagged by isnull() and will not appear in this plot.
sns.heatmap(df.isnull())

# Data Preprocessing

In [None]:
# Remove the three columns judged to contain too many '?' placeholders.
df.drop(columns=['slope', 'ca', 'thal'], inplace=True)


In [None]:
# Discard every row that has a '?' placeholder in any of these columns.
placeholder_columns = ['chol', 'fbs', 'trestbps', 'oldpeak']
has_placeholder = df[placeholder_columns].eq('?').any(axis=1)
df.drop(index=df.index[has_placeholder], inplace=True)

In [None]:
# Cast the listed columns to float in a single astype call.
convert_dtype = ['trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak']
df = df.astype(dict.fromkeys(convert_dtype, 'float'))

In [None]:
# Re-check dtypes and non-null counts after the cleaning steps above.
df.info()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
plt.figure(figsize=(10, 5))
corr = df.corr()
sns.heatmap(corr, annot=True)

  Observation :   
     
      1. We can see that the age and oldpeak columns are highly correlated with the target variable num.
      2. The restecg and thalach columns have a negative correlation with the target variable.

In [None]:
# Pairwise scatter/distribution grid of every column, with points coloured
# by the value of `sex`.
sns.pairplot(df,height=1.5,hue='sex')

In [None]:
# Draw one distribution plot per column on a 10x5 grid to inspect skewness.
col = df.columns.values
plt.figure(figsize=(20, 35))
for position, column_name in enumerate(col, start=1):
    plt.subplot(10, 5, position)
    sns.distplot(df[column_name], color='crimson')
plt.show()

       Note : The sex and cp columns are left-skewed.

In [None]:
# Check for outliers: one vertical box plot per column on a 10x5 grid.
# The columns are read from `df` directly so this cell does not depend on a
# variable left behind by an earlier cell.
plt.figure(figsize=(8, 20))
for position, column_name in enumerate(df.columns.values, start=1):
    plt.subplot(10, 5, position)
    sns.boxplot(df[column_name], palette='rocket', orient='v')
# Lay the figure out once, after every subplot exists, instead of on each
# loop iteration.
plt.tight_layout()


      Obs : The age, sex, cp and trestbps columns have some outliers.

In [None]:
# Absolute z-score of every cell; report the positions that exceed 3.
from scipy.stats import zscore

z = np.abs(zscore(df))
outlier_rows, outlier_cols = np.where(z > 3)
print((outlier_rows, outlier_cols))

In [None]:
# Keep only the rows in which every column's |z-score| is below 3.
within_three_sigma = (z < 3).all(axis=1)
df_new = df[within_three_sigma]

In [None]:
#Feature Selection :
from sklearn.model_selection import train_test_split

# Features: every column except the target `num`.
X = df_new.drop('num', axis=1)
# Target as a 1-D Series. Slicing with iloc[:, -1:] produced an (n, 1)
# DataFrame, which scikit-learn estimators treat as a column-vector `y`.
y = df_new['num']

In [None]:
# Per-column skewness of the feature frame. The notebook's rule of thumb:
# a skewness above 0.50 should be corrected.
X.skew()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
# Keep the original column names and row index so X stays aligned with y
# and the features remain identifiable after scaling.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Model Validation and Prediction

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
# algorithm to calculate the random state where models give better score :
def calBestRandomStateOf(model):
    """Return the split seed in 40..199 that gives `model` its best test accuracy.

    For each candidate seed the notebook-level `X` and `y` are split 80/20
    with `train_test_split`, `model` is fitted on the training part and
    scored with `accuracy_score` on the test part. `model` is refitted on
    every iteration, so it is left fitted on the last split tried.

    Ties are resolved in favour of the smallest seed. A seed is always
    returned, even when every split scores 0.

    NOTE: the seed is chosen by looking at test-set accuracy, so a score
    reported on the chosen split is an optimistic estimate.
    """
    # Start below any reachable accuracy so the first seed is always recorded;
    # otherwise `final_state` would be unbound if no split scored above 0.
    max_score = -1.0
    final_state = None
    for seed in range(40, 200):
        x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=.20, random_state=seed
        )
        model.fit(x_train, y_train)
        score = accuracy_score(y_test, model.predict(x_test))
        if score > max_score:
            max_score = score
            final_state = seed

    return final_state


In [None]:
# Fit each baseline classifier on the split chosen by calBestRandomStateOf
# and print its training score, test accuracy, classification report and
# confusion matrix.
model = [DecisionTreeClassifier(), KNeighborsClassifier(), SVC()]
banner = "*************************************************************************************"

for clf in model:
    state = calBestRandomStateOf(clf)
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=.20, random_state=state
    )
    clf.fit(x_train, y_train)
    score = clf.score(x_train, y_train)  # accuracy on the training split
    y_pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    classificationReport = classification_report(y_test, y_pred)
    confusionMatrix = confusion_matrix(y_test, y_pred)

    print(banner)
    print("Random State : ", state)
    print("Score of ", clf, " is : ", score)
    print("Accuracy : ", accuracy*100, "% ")
    print("Classification Report : \n")
    print(classificationReport)
    print("Confusion Matrix :\n ")
    print(confusionMatrix)
    print(banner)



    We can say that SVC scores better than the rest of the models, with 68% accuracy.

In [None]:
# 4-fold cross-validated accuracy for each baseline classifier.
from sklearn.model_selection import cross_val_score

model = [DecisionTreeClassifier(), KNeighborsClassifier(), SVC()]
banner = "*************************************************************************************"
for clf in model:
    cv_score = cross_val_score(clf, X, y, cv=4, scoring='accuracy')
    print(banner)
    print("Score for ", clf, " : ")
    print("Score : ", cv_score)
    print("Mean : ", cv_score.mean())
    print("Standard Deviation : ", cv_score.std())
    print(banner)
    print("")


In [None]:
#GridSearchCV
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for SVC. Only kernel names that SVC accepts are
# listed: 'radial' is not one of them ('rbf' is already the radial basis
# function kernel), so candidates using it could never be fitted.
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [10, 1, 0.1, 0.01, 0.001],
              'kernel': ['linear', 'rbf']
              }
gridsearch = GridSearchCV(SVC(), param_grid, n_jobs=-1, pre_dispatch=2)
gridsearch.fit(X, y)
# Best parameter combination found by the search.
gridsearch.best_params_
                          

In [None]:
# Build an SVC with fixed hyper-parameters and evaluate it on the split
# chosen by calBestRandomStateOf.
# NOTE(review): C, gamma and kernel are hard-coded; presumably they were
# copied from the grid-search output - confirm against gridsearch.best_params_.

from sklearn.metrics import mean_absolute_error, mean_squared_error

svc = SVC(C=100, gamma=0.1, kernel='rbf')
state = calBestRandomStateOf(svc)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=.20, random_state=state
)

svc.fit(x_train, y_train)
score = svc.score(x_train, y_train)  # accuracy on the training split
y_pred = svc.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("Score for SVC : ", score)
print("Accuracy Score for SVC : ", accuracy)
print("Mean Squared Error : ", mse)
print("Mean Absolute Error : ", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error : ", np.sqrt(mse))

In [None]:
# 4-fold cross-validated accuracy of the SVC built above.
banner = "*************************************************************************************"
cv_score = cross_val_score(svc, X, y, cv=4, scoring='accuracy')
print(banner)
print("Score for ", svc, " : ")
print("Score : ", cv_score)
print("Mean : ", cv_score.mean())
print("Standard Deviation : ", cv_score.std())
print(banner)
print("")

In [None]:
# Try three ensemble classifiers (AdaBoost, gradient boosting, random forest)
# on the current train/test split and print each one's test accuracy.
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier as RFC

ada = ABC(n_estimators=20, random_state=41)
gradient = GBC(n_estimators=20, random_state=162)
rfc = RFC(n_estimators=20, random_state=115)

boosting_model = [ada, gradient, rfc]
separator = "-----------------------------------------------------------"

for boost in boosting_model:
    boost.fit(x_train, y_train)
    pred = boost.predict(x_test)
    accuracyScore = accuracy_score(y_test, pred)
    print(separator)
    print(boost)
    print(separator)
    print("Accuracy Score : ", accuracyScore)
    print("\n")

    Out of all the models, GradientBoostingClassifier performed best. Hence we will select it as our final model.

In [None]:
#Final Model :
# Refit the selected gradient-boosting classifier on the current training
# split and report its metrics on the matching test split.
gradient = GBC(n_estimators=20, random_state=162)
gradient.fit(x_train, y_train)
y_pred = gradient.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# The label names the model actually evaluated here; it previously said
# "SVC" although the predictions come from the gradient-boosting model.
print("Accuracy Score for GradientBoostingClassifier : ", accuracy)
print("Mean Absolute Error : ", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error : ", mse)
print("Root Mean Squared Error : ", np.sqrt(mse))

In [None]:
#save the model

# `sklearn.externals.joblib` has been removed from scikit-learn; the
# standalone `joblib` package (a scikit-learn dependency) offers the same
# dump/load API.
import joblib

joblib.dump(gradient, 'gradientmodel.obj')

# Reload the file that was just written and predict with the restored model.
# NOTE: joblib.load unpickles its input, so only load files you trust.
gradient_from_joblib = joblib.load('gradientmodel.obj')

final_output = gradient_from_joblib.predict(x_test)

In [None]:
# Save the final predictions to a CSV file (written to the working directory
# with pandas' default index and column label).
pd.DataFrame(final_output).to_csv("HeartDisease_Output.csv")