In [1]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
import numpy as np

Load the data and explore it:

In [2]:
data = pd.read_csv("data/dataset.csv")
data.head(3)

Unnamed: 0,emotion,AU01,AU02,AU04,AU05,AU06,AU07,AU09,AU10,AU11,...,AU14,AU15,AU17,AU20,AU23,AU24,AU25,AU26,AU28,AU43
0,neutral,0.450774,0.289915,0.409713,0.518726,0.086218,0.0,0.187309,0.354838,0.0,...,0.32069,0.411641,0.431646,0.0,0.277122,0.335435,0.262999,0.189863,0.051967,0.05137
1,disgust,0.50045,0.314694,0.625174,0.335747,0.262984,0.0,0.504238,0.383201,0.0,...,0.544159,0.440429,0.495913,0.0,0.514737,0.420401,0.052358,0.143576,0.500994,0.155117
2,sad,0.273191,0.191327,0.140938,0.358091,0.246593,0.0,0.312881,0.188845,1.0,...,0.284598,0.761539,0.491468,0.0,0.134049,0.670237,0.024796,0.109462,0.325429,0.191367


In [3]:
data.columns

Index(['emotion', 'AU01', 'AU02', 'AU04', 'AU05', 'AU06', 'AU07', 'AU09',
       'AU10', 'AU11', 'AU12', 'AU14', 'AU15', 'AU17', 'AU20', 'AU23', 'AU24',
       'AU25', 'AU26', 'AU28', 'AU43'],
      dtype='object')

In [4]:
data.shape

(1161, 21)

In [5]:
# see class balance
for emo in data["emotion"].unique():
    print(f"Found {(data['emotion'] == emo).value_counts().iloc[1]} samples for class {emo}")

Found 313 samples for class neutral
Found 80 samples for class disgust
Found 73 samples for class sad
Found 306 samples for class happy
Found 164 samples for class surprise
Found 160 samples for class angry
Found 65 samples for class fear


Train and evaluation model function (from Lab 2):

In [6]:
def train_and_eval(model, train_in, train_out, val_in, val_out):
    model.fit(train_in, train_out)
    predicted_val = model.predict(val_in)

    # Evaluate model
    accuracy = accuracy_score(val_out, predicted_val)
    return accuracy

Process data and split it so we can train:

In [None]:
# Divide labels from inputs/features
labels = data["emotion"]
inputs = data.drop("emotion", axis=1)

# split = Train&Val -> 90 / Test -> 10
data_in, test_in, data_out, test_out = train_test_split(inputs, labels, test_size=0.1, random_state=42, stratify=labels)
# split = Train -> 70/ Val -> 20
train_in, val_in, train_out, val_out = train_test_split(data_in, data_out, test_size=(0.2/0.9), random_state=42, stratify=data_out)
print("\nLenght of each split of the data: ", len(train_in), len(val_in), len(test_in), "\n")


Lenght of each split of the data:  811 233 117 



Selecting Random Forest  as the model for the task:

In [8]:
rf_model = RandomForestClassifier(random_state=17)
print("\nAccuracy of classifier in validation set is: ", train_and_eval(rf_model, train_in, train_out, val_in, val_out)*100)
print("Accuracy on test set is: ", accuracy_score(test_out, rf_model.predict(test_in))*100)


Accuracy of classifier in validation set is:  60.94420600858369
Accuracy on test set is:  64.95726495726495


Let's check what parameters our model is using:

In [9]:
rf_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 17,
 'verbose': 0,
 'warm_start': False}

Hypertuning parameters for the model:

In [10]:
# Defining the parameters to try
param_grid = [{"n_estimators": [91, 117, 130, 172], "criterion": ["gini", "entropy", "log_loss"], "max_depth": [3, 7, 17]}]
# Hypertuning
random_forest_search = GridSearchCV(RandomForestClassifier(random_state=17), param_grid, cv=3)
random_forest_search.fit(train_in, train_out)
print("\nGrid Search")
print("\nRandom Forest with best parameters on val set: ", accuracy_score(val_out, random_forest_search.best_estimator_.predict(val_in))*100)
print("Random Forest with best parameters on test set: ", accuracy_score(test_out, random_forest_search.predict(test_in))*100)
print("Best parameters of the model: ", random_forest_search.best_params_)


Grid Search

Random Forest with best parameters on val set:  59.65665236051502
Random Forest with best parameters on test set:  66.66666666666666
Best parameters of the model:  {'criterion': 'gini', 'max_depth': 17, 'n_estimators': 130}


Accuracy is not improving much so I'd like to try with a random search instead:

In [11]:
param_dist = [{"n_estimators": np.arange(17, 237, 20),"criterion": ["gini", "entropy", "log_loss"], "max_depth": np.arange(3, 103, 10)}]
# Hypertuning
random_forest_search_random = RandomizedSearchCV(RandomForestClassifier(random_state=17), param_dist, cv=3)
random_forest_search_random.fit(train_in, train_out)
print("\n Randomized Search")
print("\nRandom Forest with best parameters on val set: ", accuracy_score(val_out, random_forest_search_random.best_estimator_.predict(val_in))*100)
print("Random Forest with best parameters on test set: ", accuracy_score(test_out, random_forest_search_random.predict(test_in))*100)
print("Best parameters of the model: ", random_forest_search_random.best_params_)


 Randomized Search

Random Forest with best parameters on val set:  62.231759656652365
Random Forest with best parameters on test set:  69.23076923076923
Best parameters of the model:  {'n_estimators': 197, 'max_depth': 33, 'criterion': 'entropy'}


It seem like there is a slight improvement in the accuracy of the validation set, so we keep the resulting model from the Random search.

Now to predict labels, let's load the data:

In [12]:
test_to_submit = pd.read_csv("data/test_to_submit.csv")
test_to_submit.shape

(233, 20)

Predict and save the data:

In [13]:
# Predict
predicted_labels = random_forest_search_random.predict(test_to_submit)
# Save predictions to 'outputs' file
with open("outputs", "w") as f:
    for label in predicted_labels:
        f.write(f"{label}\n")