In [1]:
### Experiment 2 Class Weights random forest (CWsRF)


## the code references 

## Sullivan W.,2017. Python machine learning illustrated guide for beginners. Healthy pragmatic solutions Inc.
##
## Liu Y.(Hayden), 2017. Python machine learning by example. Birmingham-Mumbai: Packt.
##
## Scikit-learn, sklearn.utils.class_weight, available online 
##https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html. Last accessed 22/04/2019
##
##Stackoverflow, Scikit-learn, get accuracy scores for each class, 
##available online https://stackoverflow.com/questions/39770376/scikit-learn-get-accuracy-scores-for-each-class. 
##Last accessed 22/04/2019
## 
##Medium, AUC ROC Curve Scoring Function for Multi-class Classification, 
##available online https://medium.com/@plog397/auc-roc-curve-scoring-function-for-multi-class-classification-9822871a6659. 
##Last accessed 22/04/2019
## 


## This program implements a Random Forest with class weights and splits the data into 70% for training and 30% for testing


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_class_weight


## Load the yeast dataset: eight numeric feature columns followed by the class label
yeast_data = pd.read_csv('yeast_data.txt', names= ['mcg','gvh','alm','mit','erl', 'pox','vac','nuc','target'])

## Show how many samples each class has (the classes are heavily imbalanced)
print(yeast_data.groupby('target').size())

## split the data to training and testing datasets (Sullivan,2017)
## columns 0-7 are the features, column 8 is the target label
features = yeast_data.iloc[:, 0:8].values
labels = yeast_data.iloc[:, 8].values
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.3, random_state=0)


## weights of the classes (Scikit-learn)
## 'balanced' gives each class the weight n_samples / (n_classes * class_count),
## returned in the order of np.unique(train_labels).
## The function is called directly with keyword arguments: recent scikit-learn
## releases reject positional arguments here, and calling it through the
## `class_weight` module name would fail on a re-run because that name is
## rebound to the weights array below (name kept for compatibility).
class_weight = compute_class_weight(class_weight='balanced',
                                    classes=np.unique(train_labels),
                                    y=train_labels)

print(class_weight)

         


target
CYT    463
ERL      5
EXC     35
ME1     44
ME2     51
ME3    163
MIT    244
NUC    429
POX     20
VAC     30
dtype: int64
[ 0.32136223 25.95        4.325       4.152       2.66153846  0.87966102
  0.62155689  0.3448505   6.92        4.71818182]


In [2]:
  

## Map every class label to its 'balanced' weight (Scikit-learn).
## The weights are derived from train_labels instead of being copied by hand
## from printed output, so they stay correct if the data, the seed or the
## split ratio change. np.unique returns the labels sorted, which is the
## order compute_class_weight uses for the returned weights.
class_labels = np.unique(train_labels)
class_weight = dict(zip(class_labels,
                        compute_class_weight(class_weight='balanced',
                                             classes=class_labels,
                                             y=train_labels)))


## Base estimator; n_estimators is overridden by the grid below
rf_clf1 = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=10, class_weight = class_weight)


## Hyper parameters using grid search(Liu Y., 2017)
parameters = {'n_estimators' : [100,300,500,700,900],
              'max_features' : ["sqrt", "log2", None],
              'max_depth'    : [10, 20, None],
              'min_samples_split': [10,30,50]}


## 5-fold cross-validated search over the grid, scored by accuracy,
## using all available CPU cores
gd_sr = GridSearchCV(estimator= rf_clf1,
                     param_grid=parameters,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)


gd_sr.fit( train_features, train_labels)
best_parameters = gd_sr.best_params_
print(best_parameters)


## mean cross-validated accuracy of the best parameter combination
best_result = gd_sr.best_score_
print(best_result)

## predict the test set using best result
predictions = gd_sr.best_estimator_.predict(test_features)





{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 500}
0.6107899807321773


In [3]:

#comparison = pd.DataFrame({'Real':test_labels, 'Predictions': predictions})
#print(comparison)

## Evaluate the test-set predictions: confusion matrix, per-class report
## and overall accuracy (Sullivan,2017)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## raw counts: rows are the true classes, columns the predicted classes
cm_counts = confusion_matrix(test_labels, predictions)
print(cm_counts)
print(classification_report(test_labels, predictions))
print("accuracy_score" , accuracy_score(test_labels, predictions))
print( )


### Per-class accuracy (Stackoverflow): divide every row by its total so the
### diagonal holds the fraction of each true class that was predicted correctly
row_totals = cm_counts.sum(axis=1, keepdims=True)
cm = cm_counts.astype('float') / row_totals
print ("accuracy of each class")
print()
print(cm.diagonal())

### calculate the average ROC AUC across the classes (Medium)
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer
def multiclass_roc_auc_score(truth, pred, average="macro"):
    """Return the ROC AUC of multi-class label predictions.

    Both label sequences are binarized with a LabelBinarizer fitted on
    ``truth`` and the resulting indicator arrays are scored with
    ``roc_auc_score``.

    truth   : true class labels
    pred    : predicted class labels (hard labels, not probabilities)
    average : averaging mode forwarded to ``roc_auc_score``
    """
    binarizer = LabelBinarizer()

    # fit on the true labels only, so both arrays share the same columns
    truth_binarized = binarizer.fit_transform(truth)
    pred_binarized = binarizer.transform(pred)

    return roc_auc_score(truth_binarized, pred_binarized, average=average)

## Report the ROC AUC of the test-set predictions (default macro average);
## the bare last expression is displayed as the cell output
print("Area under curve ROC is:")
roc_auc_macro = multiclass_roc_auc_score(truth=test_labels, pred=predictions)
roc_auc_macro



[[87  0  0  0  2  4 13 30  1  3]
 [ 0  1  0  0  0  0  0  0  0  0]
 [ 1  0  7  0  1  0  1  0  0  1]
 [ 0  0  1 17  0  1  0  0  0  0]
 [ 0  1  1  4  4  0  0  1  0  1]
 [ 2  0  0  0  0 41  0  2  0  0]
 [14  0  1  0  2  5 45  9  1  0]
 [35  0  0  0  3  7 10 73  0  0]
 [ 2  0  0  0  0  0  0  0  3  0]
 [ 5  0  1  0  0  2  0  0  0  0]]
              precision    recall  f1-score   support

         CYT       0.60      0.62      0.61       140
         ERL       0.50      1.00      0.67         1
         EXC       0.64      0.64      0.64        11
         ME1       0.81      0.89      0.85        19
         ME2       0.33      0.33      0.33        12
         ME3       0.68      0.91      0.78        45
         MIT       0.65      0.58      0.62        77
         NUC       0.63      0.57      0.60       128
         POX       0.60      0.60      0.60         5
         VAC       0.00      0.00      0.00         8

   micro avg       0.62      0.62      0.62       446
   macro avg       

0.7829599662981508