
* Age
* Gender
* Myocardial:  Quantity of myocardial infarctions in the anamnesis – Ordinal
* Exertional angina: Exertional angina pectoris in the anamnesis
* FC: Functional class (FC) of angina pectoris in the last year – Ordinal
* Heart Disease: Coronary heart disease (CHD) in recent weeks, days before admission to hospital
* Heredity: Heredity on CHD
* Hypertension: Presence of an essential hypertension
* Symptomatic hypertension
* Duration: Duration of arterial hypertension
* Arrhythmia: Observing of arrhythmia in the anamnesis
* Systolic_emergency: Systolic blood pressure according to Emergency Cardiology Team
* Diastolic_emergency: Diastolic blood pressure according to Emergency Cardiology Team
* Systolic_intensive_care: Systolic blood pressure according to intensive care unit
* Diastolic_intensive_care: Diastolic blood pressure according to intensive care unit
* Potassium: Serum potassium content
* Sodium: Serum sodium content
* AlAT: Serum AlAT content
* AsTK: Serum AsTK content
* WBC: White Blood Cell Count
* ESR: Erythrocyte sedimentation rate
* Time: Time elapsed from the beginning of the attack of CHD to the hospital
* Outcome: target column

# Preprocessing ``train.csv``

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

df = pd.read_csv('train.csv')


df.replace('?', np.nan, inplace = True)

df.drop(['Exertional angina','Heredity','Duration','Systolic_emergency','Diastolic_emergency'],axis=1,inplace=True)



df['Systolic_intensive_care']=df['Systolic_intensive_care'].astype(float)
df['Systolic_intensive_care']=df['Systolic_intensive_care'].fillna(df['Systolic_intensive_care'].mean())

df['Diastolic_intensive_care']=df['Diastolic_intensive_care'].astype(float)
df['Diastolic_intensive_care']=df['Diastolic_intensive_care'].fillna(df['Diastolic_intensive_care'].mean())

df['Potassium']=df['Potassium'].astype(float)
df['Potassium']=df['Potassium'].fillna(df['Potassium'].mean())

df['AlAT']=df['AlAT'].astype(float)
df['AlAT']=df['AlAT'].fillna(df['AlAT'].mean())

df['AsAT']=df['AsAT'].astype(float)
df['AsAT']=df['AsAT'].fillna(df['AsAT'].mean())

df['ESR']=df['ESR'].astype(float)
df['ESR']=df['ESR'].fillna(df['ESR'].mean())

df['Time']=df['Time'].astype(float)
df['Time']=df['Time'].fillna(df['Time'].mean())

df['myocardial']=df['myocardial'].astype(float)
df['myocardial']=df['myocardial'].fillna(df['myocardial'].mean())

df['Age']=df['Age'].astype(float)
df['Age']=df['Age'].fillna(df['Age'].mean())

df['Sodium']=df['Sodium'].astype(float)
df['Sodium']=df['Sodium'].fillna(df['Sodium'].mean())

df['WBC']=df['WBC'].astype(float)
df['WBC']=df['WBC'].fillna(df['WBC'].mean())

df['Hypertension']=df['Hypertension'].fillna(df['Hypertension'].mode()[0])
df['FC']=df['FC'].fillna(df['FC'].mode()[0])
df['Heart Disease']=df['Heart Disease'].fillna(df['Heart Disease'].mode()[0])
df['Symptomatic hypertension']=df['Symptomatic hypertension'].fillna(df['Symptomatic hypertension'].mode()[0])
df['Arrhythmia']=df['Arrhythmia'].fillna(df['Arrhythmia'].mode()[0])


cols = pd.get_dummies(df['Hypertension'])
df[cols.columns] = cols
df.drop('Hypertension', axis = 1, inplace = True)

cols = pd.get_dummies(df['FC'])
df[cols.columns] = cols
df.drop('FC', axis = 1, inplace = True)

cols = pd.get_dummies(df['Heart Disease'])
df[cols.columns] = cols
df.drop('Heart Disease', axis = 1, inplace = True)

cols = pd.get_dummies(df['Symptomatic hypertension'])
df[cols.columns] = cols
df.drop('Symptomatic hypertension', axis = 1, inplace = True)

cols = pd.get_dummies(df['Arrhythmia'])
df[cols.columns] = cols
df.drop('Arrhythmia', axis = 1, inplace = True)

cols = pd.get_dummies(df['Gender'])
df[cols.columns] = cols
df.drop('Gender', axis = 1, inplace = True)


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

y = df['Outcome']
X = df.drop('Outcome', axis =1)


x_train_orginal, x_test_orginal, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)
scaler = MinMaxScaler()

x_train = scaler.fit_transform(x_train_orginal)
x_test = scaler.fit_transform(x_test_orginal)




# Machine learning models 

In [2]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=17) 

#TRAINING THE MODEL
model.fit(x_train,y_train)

#TESTING THE MODEL
confidence = model.score(x_test,y_test)


print(confidence)



0.7803921568627451


In [3]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier() 

#TRAINING THE MODEL
model.fit(x_train,y_train)

#TESTING THE MODEL
confidence = model.score(x_test,y_test)


print(confidence)



0.7176470588235294


In [69]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid = {'C': [10,100,1000,100000],
              'gamma': [0.1, 1, 10, ]}

grid_search = GridSearchCV(SVC(kernel = 'rbf'), param_grid, cv=3, return_train_score=True, n_jobs = -1)
grid_search.fit(x_train, y_train)
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

svc_rbf = SVC(kernel = 'rbf', C = 100, gamma = 1)
svc_rbf.fit(x_train, y_train)
print('Train  score: %.4f'%precision_score(y_train, svc_rbf.predict(x_train), average = 'macro'))
print('Test score: %.4f'%precision_score(y_test, svc_rbf.predict(x_test), average = 'macro'))




Best parameters: {'C': 10, 'gamma': 0.1}
Best cross-validation score: 0.86
Train  score: 0.9960
Test score: 0.5764


In [68]:
svc_linear = SVC(kernel = 'linear', C = 10, gamma = 0.1)
svc_linear.fit(x_train, y_train)
print('Train score: %.4f'%precision_score(y_train, svc_linear.predict(x_train), average = 'macro'))
print('Test score: %.4f'%precision_score(y_test, svc_linear.predict(x_test), average = 'macro'))

Train score: 0.8990
Test score: 0.8953


In [60]:
#Bagging - Logistic 
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression


log_clf = LogisticRegression() 

bag_clf = BaggingClassifier(log_clf, n_estimators=500, max_samples=100, bootstrap=True, random_state=42, oob_score = True)
bag_clf.fit(x_train, y_train)
y_pred = bag_clf.predict(x_test)

print('Train  score: %.4f'%precision_score(y_train, bag_clf.predict(x_train), average = 'macro'))
print('Test score: %.4f'%precision_score(y_test, bag_clf.predict(x_test), average = 'macro'))
print('Out-of-bag score: %.4f'%bag_clf.oob_score_)

Train  score: 0.9269
Test score: 0.3922
Out-of-bag score: 0.8539


  _warn_prf(average, modifier, msg_start, len(result))


In [59]:

#pasting
bag_clf = BaggingClassifier(log_clf, n_estimators=500, max_samples=100, bootstrap=False, random_state=42)
bag_clf.fit(x_train, y_train)
y_pred = bag_clf.predict(x_test)

print('Train score: %.4f'%precision_score(y_train, bag_clf.predict(x_train), average = 'macro'))
print('Test score: %.4f'%precision_score(y_test, bag_clf.predict(x_test), average = 'macro'))

Train score: 0.9269
Test score: 0.8937


In [58]:
from sklearn.ensemble import AdaBoostClassifier
dtree_clf = DecisionTreeClassifier(random_state=42,max_depth=10)
ab =AdaBoostClassifier(n_estimators=50, base_estimator=dtree_clf,learning_rate=1, random_state=42)
ab_model = ab.fit(x_train, y_train)
y_pred = ab_model.predict(x_test)

print('Train score: %.4f'%precision_score(y_train, ab_model.predict(x_train), average = 'macro'))
print('Test score: %.4f'%precision_score(y_test, ab_model.predict(x_test), average = 'macro'))

Train score: 1.0000
Test score: 0.6081


In [67]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(max_depth=2, n_estimators=100, learning_rate=0.1, random_state=42)
clf.fit(x_train, y_train)
print('Train score: %.4f'%precision_score(y_train, clf.predict(x_train), average = 'macro'))
print('Test score: %.4f'%precision_score(y_test, clf.predict(x_test), average = 'macro'))

Train score: 0.9077
Test score: 0.6678


In [44]:
from sklearn.linear_model import LogisticRegression


model = LogisticRegression() 

#TRAINING THE MODEL
model.fit(x_train,y_train)

#TESTING THE MODEL
confidence = model.score(x_test,y_test)


print(confidence)




0.8117647058823529


# Preprocessing ``test.csv`` 

In [66]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

df = pd.read_csv('test.csv')



df.replace('?', np.nan, inplace = True)

df.drop(['Exertional angina','Heredity','Duration','Systolic_emergency','Diastolic_emergency'],axis=1,inplace=True)



df['Systolic_intensive_care']=df['Systolic_intensive_care'].astype(float)
df['Systolic_intensive_care']=df['Systolic_intensive_care'].fillna(df['Systolic_intensive_care'].mean())

df['Diastolic_intensive_care']=df['Diastolic_intensive_care'].astype(float)
df['Diastolic_intensive_care']=df['Diastolic_intensive_care'].fillna(df['Diastolic_intensive_care'].mean())

df['Potassium']=df['Potassium'].astype(float)
df['Potassium']=df['Potassium'].fillna(df['Potassium'].mean())

df['AlAT']=df['AlAT'].astype(float)
df['AlAT']=df['AlAT'].fillna(df['AlAT'].mean())

df['AsAT']=df['AsAT'].astype(float)
df['AsAT']=df['AsAT'].fillna(df['AsAT'].mean())

df['ESR']=df['ESR'].astype(float)
df['ESR']=df['ESR'].fillna(df['ESR'].mean())

df['Time']=df['Time'].astype(float)
df['Time']=df['Time'].fillna(df['Time'].mean())

df['myocardial']=df['myocardial'].astype(float)
df['myocardial']=df['myocardial'].fillna(df['myocardial'].mean())

df['Age']=df['Age'].astype(float)
df['Age']=df['Age'].fillna(df['Age'].mean())

df['Sodium']=df['Sodium'].astype(float)
df['Sodium']=df['Sodium'].fillna(df['Sodium'].mean())

df['WBC']=df['WBC'].astype(float)
df['WBC']=df['WBC'].fillna(df['WBC'].mean())

df['Hypertension']=df['Hypertension'].fillna(df['Hypertension'].mode()[0])
df['FC']=df['FC'].fillna(df['FC'].mode()[0])
df['Heart Disease']=df['Heart Disease'].fillna(df['Heart Disease'].mode()[0])
df['Symptomatic hypertension']=df['Symptomatic hypertension'].fillna(df['Symptomatic hypertension'].mode()[0])
df['Arrhythmia']=df['Arrhythmia'].fillna(df['Arrhythmia'].mode()[0])


cols = pd.get_dummies(df['Hypertension'])
df[cols.columns] = cols
df.drop('Hypertension', axis = 1, inplace = True)

cols = pd.get_dummies(df['FC'])
df[cols.columns] = cols
df.drop('FC', axis = 1, inplace = True)

cols = pd.get_dummies(df['Heart Disease'])
df[cols.columns] = cols
df.drop('Heart Disease', axis = 1, inplace = True)

cols = pd.get_dummies(df['Symptomatic hypertension'])
df[cols.columns] = cols
df.drop('Symptomatic hypertension', axis = 1, inplace = True)

cols = pd.get_dummies(df['Arrhythmia'])
df[cols.columns] = cols
df.drop('Arrhythmia', axis = 1, inplace = True)

cols = pd.get_dummies(df['Gender'])
df[cols.columns] = cols
df.drop('Gender', axis = 1, inplace = True)

df_test = scaler.transform(df)


# Best model

In [74]:
#logistic regression with pasting
#in dataset our desired output is binary since logistic regression is good at binary classification with pasting classifier has been optimzed

bag_clf1 = BaggingClassifier(log_clf, n_estimators=500, max_samples=100, bootstrap=False, random_state=42)
bag_clf1.fit(x_train, y_train)
y_pred = bag_clf.predict(x_test)

print('Train score: %.4f'%precision_score(y_train, bag_clf.predict(x_train), average = 'macro'))
print('Test score: %.4f'%precision_score(y_test, bag_clf.predict(x_test), average = 'macro'))

#support vector classifier 
#performed as good as logistic regression
svc_linear = SVC(kernel = 'linear', C = 10, gamma = 0.1)
svc_linear.fit(x_train, y_train)
print('Train score: %.4f'%precision_score(y_train, svc_linear.predict(x_train), average = 'macro'))
print('Test score: %.4f'%precision_score(y_test, svc_linear.predict(x_test), average = 'macro'))

Train score: 0.9269
Test score: 0.8937
Train score: 0.8990
Test score: 0.8953


# test.csv prediction

In [79]:
final_test_prediction = svc_linear.predict(df_test)
final_test_prediction

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,