In [1]:
# Experiment 1 Random Forest (RF)

## the code references 

## Sullivan W.,2017. Python machine learning illustrated guide for beginners. Healthy pragmatic solutions Inc.
##
## Liu Y.(Hayden), 2017. Python machine learning by example. Birmingham-Mumbai: Packt.
##
##Stackoverflow, Scikit-learn, get accuracy scores for each class, 
##available online https://stackoverflow.com/questions/39770376/scikit-learn-get-accuracy-scores-for-each-class. 
##Last accessed 22/04/2019
## 
##Medium, AUC ROC Curve Scoring Function for Multi-class Classification, 
##available online https://medium.com/@plog397/auc-roc-curve-scoring-function-for-multi-class-classification-9822871a6659. 
##Last accessed 22/04/2019
## 



## This program implements a Random Forest on the imbalanced data, uses grid search to tune the hyperparameters, and then evaluates the prediction accuracy.
## The data is split into a 70% training set and a 30% test set.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier


## Split the data into training and testing datasets (Sullivan, 2017)

# Read the yeast table. Column names are supplied explicitly, so every row of
# the file is treated as data rather than as a header.
yeast_data = pd.read_csv(
    'yeast_data.txt',
    names=['mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'target'],
)

# The last of the nine columns ('target') is the class label; the eight
# columns before it are the features.
features = yeast_data.iloc[:, :-1].values
labels = yeast_data.iloc[:, -1].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
(train_features,
 test_features,
 train_labels,
 test_labels) = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=0,
)

###########################################################################################

from sklearn.model_selection import GridSearchCV

# Base forest handed to the search. Its n_estimators is only a starting value:
# the grid below sets n_estimators for every candidate that is fitted.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    random_state=10,
)

## Hyperparameter tuning using grid search (Liu, 2017)

parameters = dict(
    n_estimators=[100, 300, 500, 700, 900],
    max_features=["sqrt", "log2", None],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated search over the grid, scored by accuracy,
# using all available cores.
gd_sr = GridSearchCV(
    rf_clf,
    parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
gd_sr.fit(train_features, train_labels)

# Best parameter combination and its mean cross-validated accuracy.
best_parameters = gd_sr.best_params_
best_result = gd_sr.best_score_

print(best_parameters)
print(best_result)

###########################################################################

## Predict the test set using the best estimator found by the grid search

predictions = gd_sr.best_estimator_.predict(test_features)

#comparison = pd.DataFrame({'Real':test_labels, 'Predictions': predictions})
#print(comparison)


## Find accuracy and confusion matrix (Sullivan, 2017)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Build the confusion matrix once and reuse it for both the raw printout and
# the per-class figures below.
cm_counts = confusion_matrix(test_labels, predictions)

print(cm_counts)
print(classification_report(test_labels, predictions))
print("accuracy_score" , accuracy_score(test_labels, predictions))
print( )


### Find the accuracy of each class (Stackoverflow)

# Each row of the confusion matrix is one true class; dividing a row by its
# total and reading the diagonal gives the fraction of that class predicted
# correctly. A class that appears only in the predictions has a row total of
# zero, so the division is masked and such rows are reported as 0.0 instead of
# producing NaN and a divide-by-zero warning.
row_totals = cm_counts.sum(axis=1, keepdims=True)
cm = np.divide(
    cm_counts.astype('float'),
    row_totals,
    out=np.zeros(cm_counts.shape, dtype=float),
    where=row_totals != 0,
)
print ("accuracy of each class")
print()
print(cm.diagonal())

 
### Calculate the average ROC AUC over the classes (Medium)

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer
def multiclass_roc_auc_score(truth, pred, average="macro"):
    """Return the ROC AUC for multiclass label predictions.

    Both label sequences are converted to binary indicator columns by a
    LabelBinarizer fitted on ``truth`` only, and the two indicator arrays are
    passed to ``roc_auc_score``.

    Args:
        truth: True class labels.
        pred: Predicted class labels (hard labels, not probabilities).
        average: Averaging mode forwarded unchanged to ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score`` for the binarized inputs.
    """
    # NOTE: because `pred` is binarized, the scores given to roc_auc_score
    # are 0/1 indicators rather than ranked probabilities.
    binarizer = LabelBinarizer().fit(truth)

    truth_indicator = binarizer.transform(truth)
    pred_indicator = binarizer.transform(pred)

    return roc_auc_score(truth_indicator, pred_indicator, average=average)

print("the avarage Area under curve ROC is:")
# Print the score explicitly: a bare expression is only echoed when it is the
# last expression of an interactive/notebook cell, and is silently discarded
# when this file is run as a script.
print(multiclass_roc_auc_score(test_labels, predictions))




{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 300}
0.6117533718689788
[[101   0   0   0   2   3   6  26   1   1]
 [  0   1   0   0   0   0   0   0   0   0]
 [  3   0   7   0   1   0   0   0   0   0]
 [  0   0   2  14   1   1   0   0   1   0]
 [  3   0   1   2   4   0   2   0   0   0]
 [  2   0   0   0   0  40   0   3   0   0]
 [ 19   0   0   0   2   4  46   6   0   0]
 [ 42   0   0   0   2   6   9  69   0   0]
 [  2   0   0   0   0   0   0   0   3   0]
 [  5   0   1   0   0   2   0   0   0   0]]
              precision    recall  f1-score   support

         CYT       0.57      0.72      0.64       140
         ERL       1.00      1.00      1.00         1
         EXC       0.64      0.64      0.64        11
         ME1       0.88      0.74      0.80        19
         ME2       0.33      0.33      0.33        12
         ME3       0.71      0.89      0.79        45
         MIT       0.73      0.60      0.66        77
         NUC       0.66      0

0.7784895670629237