In this kernel, we have implemented a multi-class decision tree algorithm to classify diseases using the annotation files. These contain information on the number of crackles and wheezes in each recording. 

We first start with rearding the patient diagnosis and demographic info files.

In [2]:
# Import libraries
import numpy as np
import pandas as pd

In [3]:
# Reading the data
df = pd.read_csv('la2_rankIIIonly_extended.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11895 entries, 0 to 11894
Data columns (total 48 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gameId                       11895 non-null  int64 
 1   region                       11895 non-null  object
 2   summonerName                 11895 non-null  object
 3   tier                         11895 non-null  object
 4   rank                         11895 non-null  object
 5   tierRank                     11895 non-null  object
 6   wins                         11895 non-null  int64 
 7   losses                       11895 non-null  int64 
 8   win                          11895 non-null  bool  
 9   lane                         11895 non-null  object
 10  role                         11895 non-null  object
 11  championId                   11895 non-null  int64 
 12  spell1Id                     11895 non-null  int64 
 13  spell2Id                     11

# Handling Missing Data for Decision Tree Analysis

In [4]:
# Review missing data
print(df.isnull().sum())

gameId                         0
region                         0
summonerName                   0
tier                           0
rank                           0
tierRank                       0
wins                           0
losses                         0
win                            0
lane                           0
role                           0
championId                     0
spell1Id                       0
spell2Id                       0
kills                          0
deaths                         0
assists                        0
largestKillingSpree            0
largestMultiKill               0
killingSprees                  0
longestTimeSpentLiving         0
doubleKills                    0
tripleKills                    0
quadraKills                    0
pentaKills                     0
totalDamageDealt               0
totalDamageDealtToChampions    0
totalHeal                      0
totalUnitsHealed               0
damageDealtToObjectives        0
timeCCingO

In [5]:
# Eliminate rows with missing values
df.dropna(inplace=True)

# Multi-class Decision Tree

In [6]:
print(df.head())

       gameId region summonerName  tier rank  tierRank  wins  losses    win  \
0  1008260800    la2  starboy2003  IRON  III  IRON-III     3       8   True   
1  1008218594    la2  starboy2003  IRON  III  IRON-III     3       8  False   
2  1007865682    la2  starboy2003  IRON  III  IRON-III     3       8  False   
3  1007660020    la2  starboy2003  IRON  III  IRON-III     3       8  False   
4  1007272625    la2  starboy2003  IRON  III  IRON-III     3       8  False   

     lane  ... team-firstInhibitor  team-firstBaron  team-firstDragon  \
0  BOTTOM  ...                True             True             False   
1  BOTTOM  ...               False            False              True   
2  BOTTOM  ...               False            False              True   
3  BOTTOM  ...               False            False             False   
4  BOTTOM  ...               False            False             False   

   team-firstRiftHerald  team-towerKills  team-inhibitorKills  \
0                  Tr

In [15]:
X = df.drop(['tier', 'rank', 'tierRank', 'region', 'summonerName', 'lane', 'role'], axis=1).values
X_columns = df.drop(['tier', 'rank', 'tierRank', 'region', 'summonerName', 'lane', 'role'], axis=1).columns
y = df[['tier']].to_numpy().ravel()

In [17]:
# encoding categorical data e.g. tier as a dummy variable
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:,1] = labelencoder_X.fit_transform(X[:,1])

# encoding categorical data e.g. tier as a dummy variable
y,class_names = pd.factorize(y)

In [18]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state = 42)

In [19]:
# Fitting Classifier to the Training Set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='gini', max_depth=27, random_state=42)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=27, random_state=42)

In [22]:
# Model performance on training set
y_pred_train =classifier.predict(X_train)

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

accuracy = metrics.accuracy_score(y_train, y_pred_train)
print("Accuracy: {:.2f}".format(accuracy))
cm=confusion_matrix(y_train,y_pred_train)
print('Confusion Matrix: \n', cm)
print(classification_report(y_train, y_pred_train, target_names=class_names))

Accuracy: 1.00
Confusion Matrix: 
 [[1491    0    0    0    0    0]
 [   0 1500    0    0    0    0]
 [   0    0 1483    0    0    0]
 [   0    0    0 1478    0    0]
 [   0    0    0    0 1484    0]
 [   0    0    0    0    0 1485]]
              precision    recall  f1-score   support

        IRON       1.00      1.00      1.00      1491
      BRONZE       1.00      1.00      1.00      1500
      SILVER       1.00      1.00      1.00      1483
        GOLD       1.00      1.00      1.00      1478
    PLATINUM       1.00      1.00      1.00      1484
     DIAMOND       1.00      1.00      1.00      1485

    accuracy                           1.00      8921
   macro avg       1.00      1.00      1.00      8921
weighted avg       1.00      1.00      1.00      8921



In [23]:
# Predicting the test results
y_pred=classifier.predict(X_test)

# Classification results on test set
from sklearn import metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}".format(accuracy))

from sklearn.metrics import confusion_matrix, classification_report
cm=confusion_matrix(y_test,y_pred)
print('Confusion Matrix: \n', cm)
print(classification_report(y_test, y_pred, target_names=class_names))

Accuracy: 0.79
Confusion Matrix: 
 [[410  48  25   8   3   3]
 [ 39 389  35   9  20   8]
 [ 16  37 395  21  17   9]
 [ 16  22  16 381  25  32]
 [ 11  24  18  22 385  35]
 [  1   9  15  20  52 398]]
              precision    recall  f1-score   support

        IRON       0.83      0.82      0.83       497
      BRONZE       0.74      0.78      0.76       500
      SILVER       0.78      0.80      0.79       495
        GOLD       0.83      0.77      0.80       492
    PLATINUM       0.77      0.78      0.77       495
     DIAMOND       0.82      0.80      0.81       495

    accuracy                           0.79      2974
   macro avg       0.79      0.79      0.79      2974
weighted avg       0.79      0.79      0.79      2974



In [26]:
# Visualize the tree by graphiz
import graphviz
from sklearn import tree
import os
os.environ["PATH"] += os.pathsep + 'C:/Users/matia/anaconda3/Library/bin/graphviz/'
feature_names = X_columns
dot_data = tree.export_graphviz(classifier, out_file=None, filled=True, rounded = True, feature_names=feature_names, class_names=class_names)
graph = graphviz.Source(dot_data)
graph

ModuleNotFoundError: No module named 'graphviz'