# Project 2 - ML
### Run - Random Forests with sub-samples for hyperparameter tuning

In [1]:
from classifiers import *
from helpers import *

In [2]:
# (Re)load the IPython autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to the local .py modules are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Pre-processing of our data and creation of our machine learning entities X and y:

In [3]:
# Path to the cleaned .parquet results; change it to wherever the file lives on your machine.

file = 'data/All_Relative_Results_Cleaned.parquet'
data = pd.read_parquet(file)

# clean_data is a project helper that is not defined in this notebook. It returns
# three objects, unpacked here; judging by the names and by the log it prints, they
# are presumably the feature matrix X, the raw labels y and the encoded labels
# y_encoded — TODO confirm against the helper's definition.
X, y, y_encoded = clean_data(data)

The initial data contains 27187 of NaN values
There is 27187 rows with 100 percents of NaN positional values.
Hence all rows can be removed from our data set.
Label y extracted!
Label y encoded!
Matrix X created!


In this method, we want to see whether the camera has an impact on the prediction. Hence, we create four sub-matrices of `X`, one per camera: Frontal Top (ft), Frontal Low (fl), Side Top (st) and Side Low (sl).

In [6]:
# Split X and y_encoded into one (matrix, labels) pair per camera:
# ft = Frontal Top, fl = Frontal Low, st = Side Top, sl = Side Low.
(
    X_ft, y_encoded_ft,
    X_fl, y_encoded_fl,
    X_st, y_encoded_st,
    X_sl, y_encoded_sl,
) = submatrices_cameras(X, y_encoded)

In [7]:
# Report the shape of each per-camera sub-matrix. The text is assembled as a single
# f-string; the spaces around each shape reproduce print()'s default separator.
print(
    f'The shapes of the 4 sub-matrices are: \n Frontal top camera: {X_ft.shape} '
    f'\n Frontal low camera: {X_fl.shape} '
    f'\n Side top camera: {X_st.shape} '
    f'\n Side low camera: {X_sl.shape}'
)

The shapes of the 4 sub-matrices are: 
 Frontal top camera: (552645, 103) 
 Frontal low camera: (552641, 103) 
 Side top camera: (547570, 103) 
 Side low camera: (530629, 103)


---
# Separation of Train-Test using the first method:

The first method consists of removing the time column from `X` and then randomly
splitting the data, 50% for testing and 50% for training.

We can now start the classification for each matrix and compare the results.

In [None]:
# Names attached to the train / test matrices of the whole data set for method M1.
X_names_M1 = ['X_train_M1', 'X_test_M1']

# Run the project helper on the full matrix X with split method 'M1'; it returns two
# values, unpacked here as the test labels and the accuracy.
# NOTE(review): the meaning of the positional flags (True, False) is defined in
# train_test_clustering_rf, which is not visible in this notebook — confirm there.
test_labels_M1, accuracy_M1 = train_test_clustering_rf(
    X, X_names_M1, y_encoded, 'M1', True, False
)

In [None]:
# Same call as for the whole matrix, repeated on each per-camera sub-matrix (method M1).
# NOTE(review): the positional flags (True, False) are defined by
# train_test_clustering_rf, which is not visible in this notebook — confirm there.

# Frontal Top
X_names_ft_M1 = ['X_Frontal_Top_train_M1', 'X_Frontal_Top_test_M1']
test_labels_ft_M1, accuracy_ft_M1 = train_test_clustering_rf(
    X_ft, X_names_ft_M1, y_encoded_ft, 'M1', True, False
)

# Frontal Low
X_names_fl_M1 = ['X_Frontal_Low_train_M1', 'X_Frontal_Low_test_M1']
test_labels_fl_M1, accuracy_fl_M1 = train_test_clustering_rf(
    X_fl, X_names_fl_M1, y_encoded_fl, 'M1', True, False
)

# Side Top
X_names_st_M1 = ['X_Side_Top_train_M1', 'X_Side_Top_test_M1']
test_labels_st_M1, accuracy_st_M1 = train_test_clustering_rf(
    X_st, X_names_st_M1, y_encoded_st, 'M1', True, False
)

# Side Low
X_names_sl_M1 = ['X_Side_Low_train_M1', 'X_Side_Low_test_M1']
test_labels_sl_M1, accuracy_sl_M1 = train_test_clustering_rf(
    X_sl, X_names_sl_M1, y_encoded_sl, 'M1', True, False
)

In [None]:
# Unweighted mean of the four per-camera accuracies obtained with method M1.
accuracy_cameras_M1 = np.array([
    accuracy_ft_M1,
    accuracy_fl_M1,
    accuracy_st_M1,
    accuracy_sl_M1,
])
avg_accuracy_M1 = np.mean(accuracy_cameras_M1)
print(f'Average accuracy: {100*avg_accuracy_M1}%')

# Gap between the per-camera average and the accuracy on the whole matrix
# (positive when the per-camera runs do better on average).
difference_in_accuracy_M1 = avg_accuracy_M1 - accuracy_M1
print(f'The difference in accuracy is: {difference_in_accuracy_M1*100}%')

---
# Separation of Train-Test using the second method:

The second method consists of splitting the data by participant while keeping the time column: 12 participants are reserved for testing and 13 for training.

In [None]:
# Names attached to the train / test matrices of the whole data set for method M2.
X_names_M2 = ['X_train_M2', 'X_test_M2']

# Run the project helper on the full matrix X with split method 'M2'; it returns two
# values, unpacked here as the test labels and the accuracy.
# NOTE(review): the meaning of the positional flags (True, False) is defined in
# train_test_clustering_rf, which is not visible in this notebook — confirm there.
test_labels_M2, accuracy_M2 = train_test_clustering_rf(
    X, X_names_M2, y_encoded, 'M2', True, False
)

In [None]:
# Same call as for the whole matrix, repeated on each per-camera sub-matrix (method M2).
# NOTE(review): the positional flags (True, False) are defined by
# train_test_clustering_rf, which is not visible in this notebook — confirm there.

# Frontal Top
X_names_ft_M2 = ['X_Frontal_Top_train_M2', 'X_Frontal_Top_test_M2']
test_labels_ft_M2, accuracy_ft_M2 = train_test_clustering_rf(
    X_ft, X_names_ft_M2, y_encoded_ft, 'M2', True, False
)

# Frontal Low
X_names_fl_M2 = ['X_Frontal_Low_train_M2', 'X_Frontal_Low_test_M2']
test_labels_fl_M2, accuracy_fl_M2 = train_test_clustering_rf(
    X_fl, X_names_fl_M2, y_encoded_fl, 'M2', True, False
)

# Side Top
X_names_st_M2 = ['X_Side_Top_train_M2', 'X_Side_Top_test_M2']
test_labels_st_M2, accuracy_st_M2 = train_test_clustering_rf(
    X_st, X_names_st_M2, y_encoded_st, 'M2', True, False
)

# Side Low
X_names_sl_M2 = ['X_Side_Low_train_M2', 'X_Side_Low_test_M2']
test_labels_sl_M2, accuracy_sl_M2 = train_test_clustering_rf(
    X_sl, X_names_sl_M2, y_encoded_sl, 'M2', True, False
)

In [None]:
# Unweighted mean of the four per-camera accuracies obtained with method M2.
accuracy_cameras_M2 = np.array([
    accuracy_ft_M2,
    accuracy_fl_M2,
    accuracy_st_M2,
    accuracy_sl_M2,
])
avg_accuracy_M2 = np.mean(accuracy_cameras_M2)
print(f'Average accuracy: {100*avg_accuracy_M2}%')

# Gap between the per-camera average and the accuracy on the whole matrix
# (positive when the per-camera runs do better on average).
difference_in_accuracy_M2 = avg_accuracy_M2 - accuracy_M2
print(f'The difference in accuracy is: {difference_in_accuracy_M2*100}%')

---
# Separation of Train-Test using the third method:

The third method consists of taking a certain percentage p of the rows for each unique value of the feature `Camera` as the training set, so that training takes every camera into account.

Note that for this method, we cannot compare the per-camera sub-matrices with the whole matrix.

In [None]:
# Names attached to the train / test matrices of the whole data set for method M3.
X_names_M3 = ['X_train_M3', 'X_test_M3']

# Run the project helper on the full matrix X with split method 'M3'; it returns two
# values, unpacked here as the test labels and the accuracy.
# NOTE(review): the meaning of the positional flags (True, False) is defined in
# train_test_clustering_rf, which is not visible in this notebook — confirm there.
test_labels_M3, accuracy_M3 = train_test_clustering_rf(
    X, X_names_M3, y_encoded, 'M3', True, False
)

--- 
# Separation of Train-Test using the fourth method:

The fourth method consists of taking a certain percentage p of the rows for each unique value of the feature `Set` as the training set, so that training takes every error into account.

In [None]:
# Names attached to the train / test matrices of the whole data set for method M4.
X_names_M4 = ['X_train_M4', 'X_test_M4']

# Run the project helper on the full matrix X with split method 'M4'; it returns two
# values, unpacked here as the test labels and the accuracy.
# NOTE(review): the meaning of the positional flags (True, False) is defined in
# train_test_clustering_rf, which is not visible in this notebook — confirm there.
test_labels_M4, accuracy_M4 = train_test_clustering_rf(
    X, X_names_M4, y_encoded, 'M4', True, False
)

In [None]:
# Same call as for the whole matrix, repeated on each per-camera sub-matrix (method M4).
# NOTE(review): the positional flags (True, False) are defined by
# train_test_clustering_rf, which is not visible in this notebook — confirm there.

# Frontal Top
X_names_ft_M4 = ['X_Frontal_Top_train_M4', 'X_Frontal_Top_test_M4']
test_labels_ft_M4, accuracy_ft_M4 = train_test_clustering_rf(
    X_ft, X_names_ft_M4, y_encoded_ft, 'M4', True, False
)

# Frontal Low
X_names_fl_M4 = ['X_Frontal_Low_train_M4', 'X_Frontal_Low_test_M4']
test_labels_fl_M4, accuracy_fl_M4 = train_test_clustering_rf(
    X_fl, X_names_fl_M4, y_encoded_fl, 'M4', True, False
)

# Side Top
X_names_st_M4 = ['X_Side_Top_train_M4', 'X_Side_Top_test_M4']
test_labels_st_M4, accuracy_st_M4 = train_test_clustering_rf(
    X_st, X_names_st_M4, y_encoded_st, 'M4', True, False
)

# Side Low
X_names_sl_M4 = ['X_Side_Low_train_M4', 'X_Side_Low_test_M4']
test_labels_sl_M4, accuracy_sl_M4 = train_test_clustering_rf(
    X_sl, X_names_sl_M4, y_encoded_sl, 'M4', True, False
)

In [None]:
# Unweighted mean of the four per-camera accuracies obtained with method M4.
accuracy_cameras_M4 = np.array([
    accuracy_ft_M4,
    accuracy_fl_M4,
    accuracy_st_M4,
    accuracy_sl_M4,
])
avg_accuracy_M4 = np.mean(accuracy_cameras_M4)
print(f'Average accuracy: {100*avg_accuracy_M4}%')

# Gap between the per-camera average and the accuracy on the whole matrix
# (positive when the per-camera runs do better on average).
difference_in_accuracy_M4 = avg_accuracy_M4 - accuracy_M4
print(f'The difference in accuracy is: {difference_in_accuracy_M4*100}%')