In [138]:
# pip install xgboost

In [139]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import confusion_matrix

In [166]:
df = pd.read_csv('heart.csv')

In [167]:
df.sample(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
726,41,M,ATA,110,235,0,Normal,153,N,0.0,Up,0
742,52,F,NAP,136,196,0,LVH,169,N,0.1,Flat,0
323,62,M,ASY,115,0,1,Normal,128,Y,2.5,Down,1
171,40,M,NAP,140,235,0,Normal,188,N,0.0,Up,0
507,40,M,NAP,106,240,0,Normal,80,Y,0.0,Up,0


In [168]:
df.shape

(918, 12)

In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [170]:
df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [171]:
cat_cols = ['Sex','ChestPainType','RestingECG','ExerciseAngina','ST_Slope']

In [172]:
encoder = LabelEncoder()
for column in cat_cols:
    df[column] = encoder.fit_transform(df[column])

In [173]:
df.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [174]:
df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [175]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    int32  
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    int32  
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    int32  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int32  
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int32(5), int64(6)
memory usage: 68.3 KB


In [178]:
df.to_csv('Encoded_dataset.csv')

In [176]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns = ['HeartDisease'] , axis = 1) , df['HeartDisease'] , test_size = 0.2 , random_state=10)
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm = np.array(cm)

# Calculate accuracy
accuracy = (cm[0][0] + cm[1][1]) / np.sum(cm)

# Calculate precision
precision = cm[0][0] / (cm[0][0] + cm[0][1])

# Calculate recall
recall = cm[0][0] / (cm[0][0] + cm[1][0])

# Calculate F1 score
f1_score = 2 * (precision * recall) / (precision + recall)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1_score)

print(cm)

Accuracy: 0.8478260869565217
Precision: 0.813953488372093
Recall: 0.8536585365853658
F1 score: 0.8333333333333333
[[70 16]
 [12 86]]


In [151]:
acc = []
pre = []
re = []
f1 = []

for i in range(0,1000):
    x_train,x_test,y_train,y_test = train_test_split(df.drop(columns = ['HeartDisease'] , axis = 1) , df['HeartDisease'] , test_size = 0.2 , random_state=i)
    clf = xgb.XGBClassifier()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    cm = np.array(cm)

    # Calculate accuracy
    accuracy = (cm[0][0] + cm[1][1]) / np.sum(cm)
    
    # Calculate precision
    precision = cm[0][0] / (cm[0][0] + cm[0][1])

    # Calculate recall
    recall = cm[0][0] / (cm[0][0] + cm[1][0])

    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    acc.append(accuracy)
    pre.append(precision)
    re.append(recall)
    f1.append(f1_score)

print('Max Accuracy',max(acc), 'and index in list is ',acc.index(max(acc)))
print('Max Precision',max(pre), 'and index in list is ',pre.index(max(pre)))
print('Max Recall',max(re), 'and index in list is ',re.index(max(re)))
print('Max f1 score',max(f1), 'and index in list is ',f1.index(max(f1)))

Max Accuracy 0.9402173913043478 and index in list is  67
Max Precision 0.9444444444444444 and index in list is  390
Max Recall 0.9743589743589743 and index in list is  67
Max f1 score 0.9325153374233129 and index in list is  67


In [177]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns = ['HeartDisease'] , axis = 1) , df['HeartDisease'] , test_size = 0.2 , random_state=390)
tf1 = ColumnTransformer([('Standard Scaler', StandardScaler(), slice(0,11))])
clf = xgb.XGBClassifier()
pipe = Pipeline([('StandardScaler',tf1),('XgBoost Classifier Model',clf)])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm = np.array(cm)

# Calculate accuracy
accuracy = (cm[0][0] + cm[1][1]) / np.sum(cm)

# Calculate precision
precision = cm[0][0] / (cm[0][0] + cm[0][1])

# Calculate recall
recall = cm[0][0] / (cm[0][0] + cm[1][0])

# Calculate F1 score
f1_score = 2 * (precision * recall) / (precision + recall)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1_score)

print(cm)

Accuracy: 0.8967391304347826
Precision: 0.9444444444444444
Recall: 0.8192771084337349
F1 score: 0.8774193548387097
[[68  4]
 [15 97]]


In [164]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns = ['HeartDisease'] , axis = 1) , df['HeartDisease'] , test_size = 0.2 , random_state=390)
clf = xgb.XGBClassifier()
pipe = Pipeline([('XgBoost Classifier Model',clf)])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm = np.array(cm)

# Calculate accuracy
accuracy = (cm[0][0] + cm[1][1]) / np.sum(cm)

# Calculate precision
precision = cm[0][0] / (cm[0][0] + cm[0][1])

# Calculate recall
recall = cm[0][0] / (cm[0][0] + cm[1][0])

# Calculate F1 score
f1_score = 2 * (precision * recall) / (precision + recall)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1_score)

print(cm)

Accuracy: 0.8967391304347826
Precision: 0.9444444444444444
Recall: 0.8192771084337349
F1 score: 0.8774193548387097
[[68  4]
 [15 97]]


# -----------------------------------------------------------------------------------------------------------

In [237]:
def Model(cols):

    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    import xgboost as xgb
    from sklearn.metrics import confusion_matrix

    df = pd.read_csv('Encoded_dataset.csv')
    df.drop(['Unnamed: 0'],axis=1,inplace=True)
    
    cols_num = []
    for i in range (len(cols)):
        if cols[i] == 1:
            cols_num.append(i)
    cols_num.append(11)
        
    df = df.iloc[:,cols_num]
    
    x_train,x_test,y_train,y_test = train_test_split(df.drop(columns = ['HeartDisease'] , axis = 1) , df['HeartDisease'] , test_size = 0.2 , random_state=390)
    clf = xgb.XGBClassifier()
    pipe = Pipeline([('XgBoost Classifier Model',clf)])
    pipe.fit(x_train, y_train)i
    y_pred = pipe.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    cm = np.array(cm)

    # Calculate accuracy
    accuracy = (cm[0][0] + cm[1][1]) / np.sum(cm)

    # Calculate precision
    precision = cm[0][0] / (cm[0][0] + cm[0][1])

    # Calculate recall
    recall = cm[0][0] / (cm[0][0] + cm[1][0])

    # Calculate F1 score
    f1_score = 2 * (precision * recall) / (precision + recall)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 score:", f1_score)

    print(cm)
    return precision

In [240]:
cols = [0,1,1,1,0,1,0,1,0,1,1]
x = Model(cols)
print(x)

Accuracy: 0.8804347826086957
Precision: 0.9166666666666666
Recall: 0.8048780487804879
F1 score: 0.8571428571428571
[[66  6]
 [16 96]]
0.9166666666666666
