In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import plot_tree, export_text
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint

### Dataset

In [None]:
#Load the CTG table from the CSV in the working directory and display it
df = pd.read_csv(filepath_or_buffer='ctg.csv')
df

In [None]:
#Split with ratio
#Train:Test = 0.7:0.3
#random_state pins the shuffle so that a fresh Restart & Run All yields the same
#train/test rows, and therefore the same fitted model and exported test arrays

train, test = train_test_split(df, test_size=0.3, random_state=123)

In [None]:
#Features & Labels
#Every column except the last one is a feature; the last column is the label

X_train, y_train = train.iloc[:, :-1].to_numpy(), train.iloc[:, -1].to_numpy()
X_test, y_test = test.iloc[:, :-1].to_numpy(), test.iloc[:, -1].to_numpy()


### Model

In [None]:
###Construct Model

#Declare hyperparameter options
#Note: 'auto' is deliberately not listed under max_features. For classifiers it
#was only an alias of 'sqrt'; it was deprecated in scikit-learn 1.1 and removed
#in 1.3, so it would either repeat the 'sqrt' fits or make those fits fail.
model_params = {
    'n_estimators': [10, 15, 20, 25],
    'criterion': ['entropy', 'gini'],
    'max_features': ['sqrt', 'log2', 0.25, 0.5, 0.75, 1.0],
    'min_samples_split': [2, 4, 6],
    'max_depth': [4,5,6,7],
}

#Initiate classifier
rf_model = RandomForestClassifier(random_state=123)
#5-fold cross-validated exhaustive search over model_params.
#n_jobs=-1 spreads the candidate fits over all CPU cores; the forest itself is
#seeded, so the selected hyperparameters do not depend on the parallelism.
classifier = GridSearchCV(rf_model, model_params, cv=5, n_jobs=-1)

#Train the grid search to find the best model
model = classifier.fit(X_train, y_train)

#Print best hyperparameter set
pprint(model.best_estimator_.get_params())


### Result

In [None]:
###Evaluation Metrics

#Predict using test data
y_pred = model.predict(X_test)

#Confusion matrix
print('Confusion Matrix \n', confusion_matrix(y_test, y_pred))

#Get evaluation metrics
report = classification_report(y_test, y_pred, digits=5, output_dict=True)

#Overall accuracy is stored as a plain number under the 'accuracy' key.
#Read it by key rather than by integer position from the transposed frame:
#positional [0] lookups on a label-indexed Series are deprecated in pandas.
acc = report['accuracy']
#The accuracy row is dropped from the table below, so show the value here
print(f'Accuracy: {acc:.5f}')

#Generate report in a dataframe (one row per report entry)
ev_metrics = pd.DataFrame(report).transpose()
ev_metrics['support'] = ev_metrics['support'].astype(int)
ev_metrics = ev_metrics.drop(['accuracy', 'macro avg'])
#Assumes exactly three class rows followed by the weighted average remain,
#with the classes ordered Normal, Suspect, Pathologic - a different row count
#makes this assignment raise instead of mislabeling silently
ev_metrics.index = ['Normal', 'Suspect', 'Pathologic', 'Weighted Average']

#Style for weighted average
#Keep this as the last expression so the notebook renders the styled table
ev_metrics.style.set_table_styles({
    'Weighted Average': [{'selector': '',
         'props': [('border-top', '2px solid black')]}]
}, axis=1, overwrite=False)


In [None]:
###Feature Importance

def plot_feature_importance(feature_importance,feature_names,title):
    """Draw a bar chart of the features ranked by decreasing importance.

    feature_importance: one importance score per feature
    feature_names: one label per feature, aligned with feature_importance
    title: text used as the chart title
    """
    #Pair each name with its score, then rank from most to least important
    ranked = pd.DataFrame({
        'feature_names': feature_names,
        'feature_importance': feature_importance,
    }).sort_values(by='feature_importance', ascending=False)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Seaborn bar chart: importance on the x axis, one bar per feature name
    sns.barplot(data=ranked, x='feature_importance', y='feature_names')
    #Add chart labels
    plt.title(title)
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

#Show ranked feature importances of the best forest found by the grid search
plot_feature_importance(
    feature_importance=model.best_estimator_.feature_importances_,
    feature_names=df.columns[:-1],
    title='CTG-RF',
)

### Edge Implementations

In [None]:
###Show all trees in this forest

#Report how many fitted trees the best forest holds, then list them
print(f'Total Trees in this Forest are : {len(model.best_estimator_.estimators_)}')
model.best_estimator_.estimators_

In [None]:
###Show the structure of the first tree only

#This structure is the one we can use for rewriting the model in C++
#Heading first, then the text rendering of tree 0 with the feature column names
print(
    'All if-else statements from the tree\n',
    export_text(
        decision_tree=model.best_estimator_.estimators_[0],
        feature_names=df.columns[:-1].tolist(),
    ),
    sep='\n',
)

In [None]:
###Visualize tree if needed

# fig = plt.figure(figsize=(100, 100))
# plot_tree(model.best_estimator_.estimators_[0], 
#           feature_names=df.columns[:-1],
#           class_names=df.columns[-1], 
#           filled=True, impurity=True, 
#           rounded=True)
# plt.show()


In [None]:
###Parse to C++

#This micromlgen is a useful parser for the model.
#We can also write our own parser, as long as
#the model's tree structure can be exported

from micromlgen import port

#Generate the C++ source of the best model
lib = port(model.best_estimator_, classmap={1:'Normal', 2:'Suspect', 3:'Pathologic'})

#Patch 1: insert the stdint include right before the first 'namespace'
add_header = '#include "stdint.h"\n'
pos = lib.find('namespace')
if pos == -1:
    #find() returns -1 when the marker is missing; slicing with it would
    #silently corrupt the generated code, so stop instead
    raise ValueError("'namespace' not found in generated code; cannot add header")
updated_lib = lib[:pos] + add_header + lib[pos:]

#Patch 2: shift the returned class index by one.
#This must be applied to updated_lib (not lib), otherwise patch 1 is discarded.
add_index = '+1' #micromlgen can't start at 1, always 0.
return_stmt = 'return classIdx'
pos = updated_lib.find(return_stmt)
if pos == -1:
    raise ValueError("'return classIdx' not found in generated code; cannot shift index")
pos += len(return_stmt)
updated_lib = updated_lib[:pos] + add_index + updated_lib[pos:]

#Save the patched model in a file
with open('src/model.h', 'w') as f:
    print(updated_lib, file=f)

In [None]:
#On linux only, checking model size

# !ls -l 'src/model.h'

In [None]:
### Write test file to C++

#Values are written as Python float (float64) literals; the C++ side declares
#the array as float, so the narrowing to float32 happens in the C++ compiler
test = test.reset_index(drop=True)
# test = test.round(6)
x_str = str(test[test.columns[:-1]].to_numpy().tolist())
#Labels go into an int array, so emit integer literals (e.g. 2, not 2.0);
#a floating-point literal inside an int brace-initializer is rejected by C++
y_str = str(test[test.columns[-1]].astype(int).to_numpy().tolist())

row = test.shape[0]
#Number of feature columns only: the last column of test is the label and is
#not part of x_test
col = test.shape[1] - 1

#Python list brackets become C++ brace initializers
x_str = x_str.replace("[", "{").replace("]", "}")
y_str = y_str.replace("[", "{").replace("]", "}")

#Write X_test followed by y_test into the source file (.cc)
with open('src/test_data.cc', 'w') as f:
    print(f'float x_test[{row}][{col}]=', file=f)
    print(x_str, file=f, end='')
    print(';', file=f)
    print('\n', file=f)

    print(f'int y_test[{row}]=', file=f)
    print(y_str, file=f, end='')
    print(';', file=f)

#Now create the header file (.h) with declarations matching the definitions above
with open('src/test_data.h', 'w') as f:
    print(f'extern float x_test[{row}][{col}];', file=f)
    print(f'extern int y_test[{row}];', file=f)