In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
# Training data: read features and ratings, dropping the first column of each
# file because it only holds the row index that was written out with the CSV.
X_train = pd.read_csv("X_train.csv", sep=',').iloc[:, 1:]
Y_train = pd.read_csv("Y_train.csv", sep=',').iloc[:, 1:]

# Keep the training ratings as a single-column DataFrame of categorical dtype
Y_train = Y_train["Rating as Factor"].astype("category").to_frame()

# Test data: exactly the same treatment as the training data
X_test = pd.read_csv("X_test.csv", sep=',').iloc[:, 1:]
Y_test = pd.read_csv("Y_test.csv", sep=',').iloc[:, 1:]

# Keep the test ratings as a single-column DataFrame of categorical dtype
Y_test = Y_test["Rating as Factor"].astype("category").to_frame()


In [3]:
#Define function for RandomForestClassifier
def random_forest(x_train, y_train, x_test, y_test):
    """Fit a RandomForestClassifier and print its accuracy on the test set.

    Labels may be given either as a 1-D sequence or as a single-column
    DataFrame / column vector. Single-column labels are flattened to 1-D
    first, because scikit-learn emits a DataConversionWarning when ``fit``
    receives a column-vector ``y``.

    Args:
        x_train: train feature matrix
        y_train: train class labels (1-D, or 2-D with exactly one column)
        x_test: test feature matrix
        y_test: test class labels (1-D, or 2-D with exactly one column)

    Returns:
        None. The test accuracy is printed with four decimals.
    """
    # Flatten (n_samples, 1) labels to (n_samples,); anything else is passed
    # through untouched so genuinely multi-column targets are not mangled.
    y_train, y_test = (
        np.asarray(y).ravel() if np.ndim(y) == 2 and np.shape(y)[1] == 1 else y
        for y in (y_train, y_test)
    )

    # Create classifier object and fit it to data
    forest = RandomForestClassifier(criterion='gini', random_state=0, n_jobs=-1)
    forest.fit(x_train, y_train)

    # Print test score
    print('Test accuracy: {0: .4f}'.format(forest.score(x_test, y_test)))

In [4]:
# Baseline: train and evaluate the forest on the unscaled features
random_forest(x_train=X_train, y_train=Y_train, x_test=X_test, y_test=Y_test)

  del sys.path[0]


Test accuracy:  0.9731


In [5]:
#Standardize Feature Matrix

# Fit the scaler on the training features only, then apply that same fitted
# scaling to both the training and the test features.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [6]:
# Same model as before, this time trained and evaluated on the standardized features
random_forest(x_train=X_train_std, y_train=Y_train, x_test=X_test_std, y_test=Y_test)

  del sys.path[0]


Test accuracy:  0.9731


In [11]:
# Candidate values for each hyperparameter explored by the GridSearchCV
# param_grid. The two node-size settings are searched over several values;
# the forest size and the depth limit each have a single candidate.

# min_samples_leaf: minimum number of samples required at a leaf/terminal node
minSamplesLeaf = np.array([1, 3, 5, 7])

# min_samples_split: minimum number of samples required to split an internal node
minSamplesNode = np.array([2, 3, 5, 7])

# max_depth: maximum depth of each tree
maxDepth = np.array([25])

# n_estimators: number of trees in the forest
n_estimators = np.array([70])

In [12]:
def random_forest_g(x_train, y_train, x_test, y_test, n_estimators, maxDepth, minSamplesNode, minSamplesLeaf):
    """Apply GridSearchCV with RandomForestClassifier and print the results.

    Runs a 5-fold cross-validated grid search scored by accuracy, then prints
    the best cross-validation accuracy, the test-set score of the search
    object, and the best parameter combination.

    Labels may be given either as a 1-D sequence or as a single-column
    DataFrame / column vector. Single-column labels are flattened to 1-D
    first, because scikit-learn emits a DataConversionWarning when ``fit``
    receives a column-vector ``y``.

    Args:
        x_train: train feature matrix
        y_train: train class labels (1-D, or 2-D with exactly one column)
        x_test: test feature matrix
        y_test: test class labels (1-D, or 2-D with exactly one column)
        n_estimators: array of values which will be tested for variable n_estimators
        maxDepth: array of values which will be tested for variable max_depth
        minSamplesNode: array of values which will be tested for variable min_samples_split
        minSamplesLeaf: array of values which will be tested for variable min_samples_leaf

    Returns:
        None. All results are printed.
    """
    # Flatten (n_samples, 1) labels to (n_samples,); anything else is passed
    # through untouched so genuinely multi-column targets are not mangled.
    y_train, y_test = (
        np.asarray(y).ravel() if np.ndim(y) == 2 and np.shape(y)[1] == 1 else y
        for y in (y_train, y_test)
    )

    # Define the hyperparameter values to be tested.
    # This must be a plain dict: a trailing comma after the closing brace
    # would turn it into a 1-tuple, which is not a documented param_grid type.
    param_grid = {'criterion': ['gini', 'entropy'],
                  'n_estimators': n_estimators,
                  'max_depth': maxDepth,
                  'min_samples_split': minSamplesNode,
                  'min_samples_leaf': minSamplesLeaf}

    # Run brute-force grid search
    gs = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=5, n_jobs=-1)
    gs = gs.fit(x_train, y_train)

    print('Best CV accuracy: {:.2f}'.format(gs.best_score_))
    print('Test score:       {:.2f}'.format(gs.score(x_test, y_test)))
    print('Best parameters: {}'.format(gs.best_params_))

In [13]:
# Grid-search the forest on the unscaled features using the candidate arrays
random_forest_g(
    x_train=X_train,
    y_train=Y_train,
    x_test=X_test,
    y_test=Y_test,
    n_estimators=n_estimators,
    maxDepth=maxDepth,
    minSamplesNode=minSamplesNode,
    minSamplesLeaf=minSamplesLeaf,
)

  self.best_estimator_.fit(X, y, **fit_params)


Best CV accuracy: 0.97
Test score:       0.98
Best parameters: {'criterion': 'gini', 'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 70}
