In [29]:
import pandas as pd
import numpy as np
import matplotlib as plt
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image

In [3]:
from google.colab import drive

# Mount Google Drive inside the Colab VM and make "My Drive" the working
# directory, so the relative CSV path below (and any files written later)
# resolve against it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
os.chdir(DRIVE_MOUNT_POINT + '/My Drive/')

# Raw dataset exactly as stored on disk; later cells work on a copy.
base_data = pd.read_csv("DSP_2.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Per-column flag: True when the column contains at least one missing value.
base_data.isna().any()

Age               False
Sex               False
ChestPainType     False
RestingBP         False
Cholesterol       False
FastingBS         False
RestingECG        False
MaxHR             False
ExerciseAngina    False
Oldpeak           False
ST_Slope          False
HeartDisease      False
dtype: bool

In [None]:
# Columns kept for modelling: eleven predictors followed by HeartDisease,
# which is placed last on purpose (later cells rely on that position).
cols = [
    "Age",
    "Sex",
    "ChestPainType",
    "RestingBP",
    "Cholesterol",
    "FastingBS",
    "RestingECG",
    "MaxHR",
    "ExerciseAngina",
    "Oldpeak",
    "ST_Slope",
    "HeartDisease",
]

# Independent copy, so edits to `data` never write back into base_data.
data = base_data.loc[:, cols].copy()

# Same missing-value check as before, now on the working copy.
data.isna().any()

In [9]:
# Render the working copy for a visual check of the selected columns.
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [10]:
# Text-valued columns that have to become integer codes before modelling.
categorical_cols = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]

# One fitted encoder per column. Re-fitting a single shared LabelEncoder would
# overwrite its classes_ each time, leaving no way to map the codes of the
# earlier columns back to their original categories.
encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    # Replace the whole column (rather than writing into it with .loc) so the
    # result takes the integer dtype of the codes instead of staying object.
    data[col] = encoder.fit_transform(data[col])
    encoders[col] = encoder
# After the loop `encoder` is the ST_Slope encoder, the same object the
# previous version of this cell left behind under that name.

In [20]:
# HeartDisease is the variable we want to predict. Select it as a 1-D Series:
# a one-column DataFrame makes scikit-learn emit DataConversionWarning
# ("A column-vector y was passed when a 1d array was expected") on fit().
y = data["HeartDisease"]

# Every other column is a predictor.
x = data.drop(columns=["HeartDisease"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [22]:
def model(X_train, y_train, random_state=0):
    """Fit three classifiers on the training data and print their training accuracy.

    The models are a 20-tree random forest, a logistic regression
    (``max_iter=1000``) and a single decision tree.

    Note that the printed scores are computed on the same data the models
    were fitted on, so they measure fit, not generalisation. Use a held-out
    set (e.g. ``accuracy_score`` on test predictions) to compare the models.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,) or (n_samples, 1)
        Training labels. A single-column 2-D input is flattened to 1-D.
    random_state : int, default 0
        Seed passed to the random forest and the decision tree so repeated
        runs build identical models.

    Returns
    -------
    tuple
        ``(forest, lreg, tree)`` - the three fitted estimators.
    """
    # scikit-learn expects a 1-D target. Flatten a one-column frame/array so
    # that fit() does not raise DataConversionWarning; leave anything else as is.
    y_fit = np.asarray(y_train)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()

    forest = RandomForestClassifier(n_estimators=20, random_state=random_state)
    forest.fit(X_train, y_fit)
    print("Random Forest: {:.2f}".format(forest.score(X_train, y_fit)))

    # Logistic regression; max_iter is raised to 1000 (well above the library default).
    lreg = LogisticRegression(max_iter=1000)
    lreg.fit(X_train, y_fit)
    print("Regresja logistyczna: {:.2f}".format(lreg.score(X_train, y_fit)))

    # Seeded as well: an unseeded tree can differ between runs, which makes
    # the reported metrics impossible to reproduce.
    tree = DecisionTreeClassifier(random_state=random_state)
    tree.fit(X_train, y_fit)
    print("Decision Tree: {:.2f}".format(tree.score(X_train, y_fit)))

    return forest, lreg, tree

forest, lreg, tree = model(X_train,y_train)

  after removing the cwd from sys.path.
  y = column_or_1d(y, warn=True)


Random Forest: 1.00
Regresja logistyczna: 0.86
Decision Tree: 1.00


In [25]:
# Predictions of each fitted model on the held-out test set. The names are
# kept because the classification-report cell reads them.
y1_predict = forest.predict(X_test)
y2_predict = lreg.predict(X_test)
y3_predict = tree.predict(X_test)

# Share of test rows each model labels correctly.
print(f"Random Forest {accuracy_score(y_test, y1_predict):.5f}")
print(f"Logistic Regresion {accuracy_score(y_test, y2_predict):.5f}")
print(f"Decision Tree {accuracy_score(y_test, y3_predict):.5f}")

Random Forest 0.85870
Logistic Regresion 0.83696
Decision Tree 0.75543


In [26]:
# Heading -> test-set predictions, in the order the reports are printed.
reports = [
    ("Ocena modelu 1. Random Forest", y1_predict),
    ("Ocena modelu 2. Regresja logistyczna", y2_predict),
    ("Ocena modelu 3. Decision Tree", y3_predict),
]

# Per-class precision / recall / F1 for every model against the true test labels.
for heading, predicted in reports:
    print(heading)
    print(classification_report(y_test, predicted))

Ocena modelu 1. Random Forest
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        77
           1       0.87      0.89      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.85      0.85       184
weighted avg       0.86      0.86      0.86       184

Ocena modelu 2. Regresja logistyczna
              precision    recall  f1-score   support

           0       0.83      0.77      0.80        77
           1       0.84      0.89      0.86       107

    accuracy                           0.84       184
   macro avg       0.84      0.83      0.83       184
weighted avg       0.84      0.84      0.84       184

Ocena modelu 3. Decision Tree
              precision    recall  f1-score   support

           0       0.67      0.82      0.74        77
           1       0.84      0.71      0.77       107

    accuracy                           0.76       184
   macro avg       0.76      0.

In [38]:
# Pick a single tree out of the fitted forest to visualise.
estimator = forest.estimators_[5]

# Predictor names, in the same order as the training columns.
cols_2 = ["Age", "Sex", "ChestPainType", "RestingBP", "Cholesterol", "FastingBS", "RestingECG", "MaxHR","ExerciseAngina","Oldpeak", "ST_Slope"]

# class_names must hold one label per class in ascending class order. A bare
# string is indexed character by character, so "HeartDisease" labelled the
# two classes "H" and "e". Build the labels from the classes the forest saw.
class_labels = ["HeartDisease = {}".format(label) for label in forest.classes_]

# Write the tree as a Graphviz .dot file in the current working directory.
export_graphviz(estimator, out_file='tree.dot',
                feature_names = cols_2,
                class_names = class_labels,
                rounded = True, proportion = False,
                precision = 2, filled = True)

In [39]:
# Render tree.dot to tree.png with the Graphviz `dot` executable at 600 dpi.
# The cell value is dot's exit status (0 means success).
dot_command = ['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600']
call(dot_command)

0

In [37]:
# Display the PNG written by the `dot` call in the previous cell.
Image(filename = 'tree.png')


Output hidden; open in https://colab.research.google.com to view.