In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.utils import shuffle
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Random Forest/Descision Tree Parameters

**Bootstrap**: If set to True, then bagging will use sampling with replacements (same training instance can be sampled more than once)

**Number of Trees/Estimators**: In general, the more trees you use the better get the results. However, the improvement decreases as the number of trees increases, i.e. at a certain point the benefit in prediction performance from learning more trees will be lower than the cost in computation time for learning these additional trees. One can also perform an out-of-bag cross-validation approach and plot the results to find the optimal number of trees. If you have a large number of features, then you should also have a large number of trees so that the model isn't underfit.

**Minimum Samples Split**: Each node in a decision tree is split based on the value of a single feature. As you go down the decision tree, the number of training samples that end up along a certain branch decreases. Splitting typically stops when the number of traning samples that end up at a node is less than the set minimum, which turns the node into a leaf node. Too large of a minimum samples split value and the tre will be too short and inaccurate. Too small of a minimum samples split value and the tree will take longer to train and overfit more to the training data.

**Maximum Depth of Tree**: The lower the number, the less accurate your model will be.  The higher the number, the more accurate, but the more you risk overfitting. Trees in a random forest should help overcome the overfitting.

**Maximum Features**: Refers to the random subspace method of choosing a random subset of features from the feature space. sqrt(p) is the optimal value from experience.

**Min Samples Leaf**: Prevents hardly helpful decision branching to take place. For example, if there was a leaf node with one sample attributed to it, then that branch would be of little value.

**Max Leaf Nodes**: Another restriction on tree growth.

## Random Forest on Prog Classification

In [0]:
X = np.load('./drive/My Drive/X_V2.npy')
print(X.shape)

y = np.load('./drive/My Drive/Y_V2.npy')
print(y.shape)

# Shuffle and split the dataset to training and testing
X, y = shuffle(X, y, random_state=0)
X = np.reshape(X, (375,52470))
print(X.shape)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.10, random_state=0)

(375, 1590, 33)
(375, 1)
(375, 52470)


In [0]:
# Training

parameters = {'bootstrap': True,
              'min_samples_leaf': 3,
              'n_estimators': 2000,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 50,
              'max_leaf_nodes': None}

RF_model = RandomForestClassifier(**parameters)
RF_model.fit(train_X, train_y)

RF_predictions = RF_model.predict(test_X)
score = accuracy_score(test_y, RF_predictions)
print(f"Accuracy on 90/10 train split: {score}")
print(confusion_matrix(test_y, RF_predictions))

# Validation

X_validation = np.load('./drive/My Drive/X_Validation_1.npy')
X_validation = X_validation[:,:1590,:33]

y_validation = np.load('./drive/My Drive/Y_Validation_1.npy')

X_validation, y_validation = shuffle(X_validation, y_validation, random_state=0)
X_validation = np.reshape(X_validation, (92,52470))

RF_predictions = RF_model.predict(X_validation)
score = accuracy_score(y_validation, RF_predictions)
print(f"Score on the Validation Set: {score}")
print(confusion_matrix(y_validation, RF_predictions))

  # This is added back by InteractiveShellApp.init_path()


Accuracy on 90/10 train split: 0.8947368421052632
[[34  0]
 [ 4  0]]
Score on the Validation Set: 0.7717391304347826
[[71  0]
 [21  0]]
