In [1]:

### Experiment 3: Oversampling Random Forest (OSRF)


## Code references

## Sullivan W.,2017. Python machine learning illustrated guide for beginners. Healthy pragmatic solutions Inc.
##
## Liu Y.(Hayden), 2017. Python machine learning by example. Birmingham-Mumbai: Packt.
##
##Imbalanced-learn, imblearn.over_sampling.SMOTE ,available online 
##https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html, Last accessed 22/04/2019
##
##Stackoverflow, Scikit-learn, get accuracy scores for each class, 
##available online https://stackoverflow.com/questions/39770376/scikit-learn-get-accuracy-scores-for-each-class. 
##Last accessed 22/04/2019
## 
##Medium, AUC ROC Curve Scoring Function for Multi-class Classification, 
##available online https://medium.com/@plog397/auc-roc-curve-scoring-function-for-multi-class-classification-9822871a6659. 
##Last accessed 22/04/2019
## 



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold # import KFold
from sklearn.ensemble import RandomForestClassifier
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE


## load the yeast data; the file has no header row, so the nine column names are given explicitly
column_names = ['mcg','gvh','alm','mit','erl', 'pox','vac','nuc','target']
yeast_data = pd.read_csv('yeast_data.txt', names=column_names)

## show how many rows belong to each target class
class_sizes = yeast_data.groupby('target').size()
print(class_sizes)

## split the data to training and testing datasets (Sullivan,2017)
## the first eight columns are the features, the ninth column is the label
features = yeast_data.iloc[:, :8].values
labels = yeast_data.iloc[:, 8].values

## hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=0,
)

########################################################################

## oversampling by SMOTE algorithm to balance the training data set (Imbalanced-learn)
## only the training split is resampled; test_features / test_labels are left untouched

from collections import Counter

## 'not majority' resamples every class except the largest one.
## k_neighbors=2 is small, presumably because the rarest classes have only a
## handful of training rows -- TODO confirm against the class counts printed above.
smote = SMOTE(sampling_strategy='not majority', random_state=2, k_neighbors=2)

## fit_resample is the supported imbalanced-learn API; the older fit_sample alias
## was deprecated and removed in later releases, where calling it raises AttributeError.
new_train_features, new_train_labels = smote.fit_resample(train_features, train_labels)

## report the per-class counts after resampling
print ("the balance classes")
print()
print(sorted(Counter(new_train_labels).items()))

###################################################################################

## fit the balanced training data to random forest and tune the parameters by grid search (Liu Y., 2017)
from sklearn.model_selection import GridSearchCV

## base estimator; the n_estimators given here is replaced by each value tried in the grid below
rf_clf = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=10)

## candidate values: 5 * 3 * 3 * 3 = 135 combinations, each evaluated with 5-fold cross-validation
parameters = dict(
    n_estimators=[100, 300, 500, 700, 900],
    max_features=["sqrt", "log2", None],
    max_depth=[10, 20, None],
    min_samples_split=[10, 30, 50],
)

## NOTE(review): the folds are drawn from data that was already oversampled, so a
## validation fold can contain synthetic rows interpolated from rows in the training
## folds; the cross-validated accuracy is therefore likely optimistic. Consider
## resampling inside each fold (e.g. imblearn.pipeline.Pipeline) -- to be confirmed.
gd_sr = GridSearchCV(rf_clf, parameters, scoring='accuracy', cv=5, n_jobs=-1)
gd_sr.fit(new_train_features, new_train_labels)

## keep the winning parameter combination and its mean cross-validated accuracy
best_parameters = gd_sr.best_params_
best_result = gd_sr.best_score_

print(best_parameters)
print (" the best result is")
print(best_result)

################################################################################################

## find accuracy and confusion matrix (Sullivan,2017)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## predict the imbalanced test set using best result from grid search
predictions = gd_sr.best_estimator_.predict(test_features)

## raw counts: rows are the true classes, columns are the predicted classes
raw_cm = confusion_matrix(test_labels, predictions)
print(raw_cm)
print(classification_report(test_labels, predictions))
print(accuracy_score(test_labels, predictions))

### find the accuracy of each class (Stackoverflow)
## dividing each row by its total turns counts into fractions of the true class,
## so the diagonal is the share of each true class that was predicted correctly
row_totals = raw_cm.sum(axis=1, keepdims=True)
cm = raw_cm.astype('float') / row_totals
print ("accuracy of each class")
print()
print(cm.diagonal())
  

### calculate the average ROC AUC across the classes (Medium)
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer
def multiclass_roc_auc_score(truth, pred, average="macro"):
    """Compute a ROC AUC score for multi-class labels via label binarization.

    Both label sequences are converted to indicator matrices with a
    LabelBinarizer fitted on ``truth`` only, and the two matrices are then
    passed to ``roc_auc_score``.

    Parameters
    ----------
    truth : array-like
        True class labels; these define the set of classes used for encoding.
    pred : array-like
        Predicted class labels, encoded with the binarizer fitted on ``truth``.
    average : str, default "macro"
        Forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    The value returned by ``roc_auc_score`` for the binarized inputs.
    """
    binarizer = LabelBinarizer()

    # Fit on the true labels and encode them in one step.
    truth_indicator = binarizer.fit_transform(truth)
    # Encode the predictions with the same class-to-column mapping.
    pred_indicator = binarizer.transform(pred)

    return roc_auc_score(truth_indicator, pred_indicator, average=average)

## report the macro-averaged ROC AUC on the test set; the bare last expression
## lets the notebook display the returned value as the cell output
print("Area under curve ROC is:")
multiclass_roc_auc_score(truth=test_labels, pred=predictions, average="macro")



target
CYT    463
ERL      5
EXC     35
ME1     44
ME2     51
ME3    163
MIT    244
NUC    429
POX     20
VAC     30
dtype: int64
the balance classes

[('CYT', 323), ('ERL', 323), ('EXC', 323), ('ME1', 323), ('ME2', 323), ('ME3', 323), ('MIT', 323), ('NUC', 323), ('POX', 323), ('VAC', 323)]
{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 900}
 the best result is
0.8708978328173375
[[81  0  0  0  3  3 13 33  2  5]
 [ 0  1  0  0  0  0  0  0  0  0]
 [ 1  0  7  0  1  0  1  0  0  1]
 [ 0  0  0 15  1  1  0  0  1  1]
 [ 2  0  0  3  5  1  0  0  0  1]
 [ 0  0  1  0  1 39  0  3  1  0]
 [11  0  1  0  2  5 47  9  1  1]
 [35  0  1  0  3  7  9 73  0  0]
 [ 1  0  0  0  0  0  0  1  3  0]
 [ 5  0  1  0  0  2  0  0  0  0]]
              precision    recall  f1-score   support

         CYT       0.60      0.58      0.59       140
         ERL       1.00      1.00      1.00         1
         EXC       0.64      0.64      0.64        11
         ME1       0.83      0.79

0.7780443387028415