In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from sklearn.naive_bayes import GaussianNB

In [58]:
df=pd.read_csv("Dataset3.csv")
# filling missing data
column_names=df.columns 
for a in column_names:
    df[a]=df[a].replace('?', np.NaN)    
df= df.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [59]:
num_data = df.select_dtypes(include=['int64'])
cat_data = df.select_dtypes(include=['object'])

num_columns = num_data.columns
cat_columns = cat_data.columns

print("Numerical Vaiables: ", num_columns)
print("Categorical Vaiables: ", cat_columns)

Numerical Vaiables:  Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'slope', 'ca', 'thal', 'disease'],
      dtype='object')
Categorical Vaiables:  Index([], dtype='object')


In [60]:
# for c in df.columns:
#     print ("---- %s ---" % c)
#     print (df[c].value_counts())

In [61]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
count,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0
mean,54.797521,0.681818,1.016529,131.921488,247.334711,0.136364,0.53719,150.628099,0.322314,0.997934,1.421488,0.739669,2.322314,0.570248
std,8.938489,0.466736,1.026483,17.557199,54.119351,0.343886,0.53183,22.195709,0.468331,1.178629,0.62123,1.027827,0.586358,0.496067
min,34.0,0.0,0.0,94.0,126.0,0.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,139.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,243.0,0.0,1.0,154.5,0.0,0.6,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.75,0.0,1.0,167.75,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,192.0,564.0,1.0,2.0,195.0,1.0,6.2,2.0,4.0,3.0,1.0


In [62]:
def oldpeak_group(x):
    x = int(x)
    if( x == 0 ):
        return "0"
    elif(  x ==1 ):
        return "1"
    elif( x==2 ):
        return "2"
    elif(x ==3):
        return "3"
    if( x == 4 ):
        return "4"
    elif(  x ==5 ):
        return "5"
    else:
        return "6"

In [63]:
def thalach_group(x):
    x = int(x)
    if( 70 < x < 101 ):
        return "71-100"
    elif( 100 < x < 126 ):
        return "101-125"
    elif( 125< x < 151 ):
        return "126-150"
    elif( 150 < x < 176):
        return "151-175"
    else:
        return ">175"

In [64]:
def trestbps_group(x):
    x = int(x)
    if( 90 < x < 111 ):
        return "91-110"
    elif( 110 < x < 131 ):
        return "111-130"
    elif( 130< x < 151 ):
        return "131-150"
    elif( 150 < x < 171):
        return "151-170"
    else:
        return ">170"

In [65]:
def age_group(x):
    x = int(x)
    if( 30 < x < 41 ):
        return "31-40"
    elif( 40 < x < 51 ):
        return "41-50"
    elif( 50 < x < 56 ):
        return "51-55"
    elif( 55 < x < 61 ):
        return "56-60"
    elif( 60 < x < 71 ):
        return "61-70"
    else:
        return ">70"

In [66]:
def chol_group(x):
    x = int(x)
    if( 100 < x < 201 ):
        return "101-200"
    elif( 200 < x < 301 ):
        return "201-300"
    elif( 300< x < 401 ):
        return "301-400"
    elif( 400 < x < 501):
        return "401-500"
    else:
        return ">500"

In [67]:
def pre_processing (df):
    df['chol-group'] = df['chol'].apply(chol_group)
    df['age-group'] = df['age'].apply(age_group)    
    df['trestbps-group'] = df['trestbps'].apply(trestbps_group) 
    df['thalach-group'] = df['thalach'].apply(thalach_group) 
    df['oldpeak-group'] = df['oldpeak'].apply(oldpeak_group) 
    
    df = df.drop(['chol'], axis = 1)
    df = df.drop(['age'], axis = 1)
    df = df.drop(['trestbps'], axis = 1)
    df = df.drop(['thalach'], axis = 1)
    df = df.drop(['oldpeak'], axis = 1)

    return df

In [68]:
def mapping (df, flag):
    df['sex'] = df['sex'].astype(int)
    df['cp'] = df['cp'].astype(int)
    df['fbs'] = df['fbs'].astype(int)
    df['restecg'] = df['restecg'].astype(int)
    df['exang'] = df['exang'].astype(int)
    df['slope'] = df['slope'].astype(int)
    df['ca'] = df['ca'].astype(int)
    df['thal'] = df['thal'].astype(int)
    if flag == 0:
        df['disease'] = df['disease'].astype(int)
    df['oldpeak-group'] = df['oldpeak-group'].astype(int)
    df['age-group'] = df['age-group'].map({'31-40': 0, '41-50': 1, '51-55': 2, '56-60': 3, '61-70': 4, '>70': 5}).astype(int)
    df['chol-group'] = df['chol-group'].map({'101-200': 0, '201-300': 1,'301-400': 2, '401-500': 3, '>500': 4}).astype(int)
    df['trestbps-group'] = df['trestbps-group'].map({'91-110': 0, '111-130': 1,'131-150': 2, '151-170': 3, '>170': 4}).astype(int)
    df['thalach-group'] = df['thalach-group'].map({'71-100': 0, '101-125': 1,'126-150': 2, '151-175': 3, '>175': 4}).astype(int)

    return df

In [69]:
df = pre_processing(df)
df = mapping (df , 0)


In [70]:
X = df.drop('disease', axis=1)
y = df['disease']
# Splitting to training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [71]:
data_clf = GaussianNB()
data_clf.fit(X_train,y_train)
y_pred = data_clf.predict(X_test)
print("Accuracy is ", accuracy_score(y_test,y_pred)*100)

Accuracy is  81.63265306122449


In [72]:
df_test=pd.read_csv("Dataset3_Unknown.csv")
# filling missing data
column_names=df_test.columns 
for a in column_names:
    df_test[a]=df_test[a].replace('?', np.NaN)
    
df_test= df_test.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [73]:
df_test = pre_processing(df_test)
df_test = mapping(df_test ,1 )

In [74]:
y_pred_test=data_clf.predict(df_test)

In [75]:
np.savetxt("bayes_predict.csv", y_pred_test, delimiter=",")
