Training and testing kNN model on dataset with one-hot-encoded columns and using mutual information for feature selection

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load data with one-hot-encoded columns
data_csv = "data_w_ohe.csv"
data_df = pd.read_csv(data_csv)
data_df = data_df.rename(columns={"Unnamed: 0":"index"})
data_df.head()

Unnamed: 0,index,metascore,scenes,characters,percent dialogue,locations,Positive,Anger,Disgust,Fear,...,Columbia Pictures,New Line Cinema,Paramount Pictures,Twentieth Century Fox,Universal Pictures,Warner Bros.,Francine Maisler,Mary Vernieu,Arnold Montey,Noah Segura
0,0,12.0,256.0,24.0,0.348611,109,596.0,235.0,158.0,402.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,54.0,284.0,98.0,0.645387,65,761.0,260.0,155.0,291.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,71.0,58.0,47.0,0.543055,2,563.0,277.0,155.0,239.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,68.0,178.0,76.0,0.314321,92,911.0,461.0,248.0,585.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,66.0,48.0,15.0,0.698996,15,318.0,158.0,150.0,158.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Data preprocessing
# Drop unrelated columns
data_df = data_df.drop("index", axis=1)

# Drop rows with 0 locations
data_df = data_df[data_df["locations"] != 0]

In [4]:
data_df

Unnamed: 0,metascore,scenes,characters,percent dialogue,locations,Positive,Anger,Disgust,Fear,Negative,...,Columbia Pictures,New Line Cinema,Paramount Pictures,Twentieth Century Fox,Universal Pictures,Warner Bros.,Francine Maisler,Mary Vernieu,Arnold Montey,Noah Segura
0,12.0,256.0,24.0,0.348611,109,596.0,235.0,158.0,402.0,590.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,54.0,284.0,98.0,0.645387,65,761.0,260.0,155.0,291.0,600.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,71.0,58.0,47.0,0.543055,2,563.0,277.0,155.0,239.0,541.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,68.0,178.0,76.0,0.314321,92,911.0,461.0,248.0,585.0,927.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,66.0,48.0,15.0,0.698996,15,318.0,158.0,150.0,158.0,391.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1498,61.0,237.0,96.0,0.457791,3,567.0,152.0,115.0,194.0,407.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1499,58.0,234.0,69.0,0.509856,86,543.0,154.0,144.0,199.0,417.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1500,67.0,179.0,37.0,0.463576,90,560.0,144.0,111.0,200.0,370.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1501,73.0,109.0,37.0,0.403134,37,603.0,290.0,180.0,400.0,641.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Seeing columns
data_df.columns

Index(['metascore', 'scenes', 'characters', 'percent dialogue', 'locations',
       'Positive', 'Anger', 'Disgust', 'Fear', 'Negative', 'Sadness',
       'Anticipation', 'Joy', 'Surprise', 'Trust', 'Top 3', 'Action',
       'Adventure', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy',
       'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
       'Columbia Pictures', 'New Line Cinema', 'Paramount Pictures',
       'Twentieth Century Fox', 'Universal Pictures', 'Warner Bros.',
       'Francine Maisler', 'Mary Vernieu', 'Arnold Montey', 'Noah Segura'],
      dtype='object')

In [6]:
# Getting features and label
X_df = data_df.loc[:, data_df.columns != "metascore"]
y_df = data_df["metascore"]

In [7]:
# Finding mutual information of features
from sklearn.feature_selection import mutual_info_regression
mutual_info = mutual_info_regression(X_df, y_df, random_state=42)
print(mutual_info)

[0.         0.         0.05370163 0.         0.         0.
 0.         0.01942722 0.03485256 0.         0.03173371 0.04010766
 0.020077   0.03087146 0.02166473 0.01597074 0.         0.01189092
 0.         0.         0.06108703 0.0082204  0.01813131 0.
 0.         0.         0.         0.         0.         0.
 0.02945084 0.00055743 0.         0.         0.01196222 0.00659478
 0.02354627]


In [8]:
# Dropping columns with 0 mutual info
feature_names = X_df.columns.tolist()

# Finding indices of features with 0 mutual info
indices_to_drop = []
for i in range(len(mutual_info)):
    if (mutual_info[i] == 0):
        indices_to_drop.append(i)

features_to_drop = []
for i in indices_to_drop:
    features_to_drop.append(feature_names[i])
    
# Drop columns
X_df = X_df.drop(features_to_drop, axis=1)

In [9]:
# Getting values of features and labels
X = X_df.values
y = y_df.values
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

Shape of X:  (1496, 18)
Shape of y:  (1496,)


In [10]:
# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Data preprocessing
# Normalization: Min-max normalization
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
minmaxscaler = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = minmaxscaler.transform(X_train)

# transform test data
X_test_norm = minmaxscaler.transform(X_test)

In [12]:
# import libraries for model selection
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error

In [13]:
# hyperparameter values to try
Ks = np.arange(1,100)

In [14]:
# model selection
best_model_mae = None
best_model_r2 = None
best_neg_mae = -np.inf
best_r2 = -np.inf
maes = []
r2s = []

for K in Ks:
    model = KNeighborsRegressor(n_neighbors=K)
    validation_neg_mae = cross_val_score(estimator=model, X=X_train_norm, y=y_train,
                                   scoring="neg_mean_absolute_error", cv=10).mean()
    validation_r2 = cross_val_score(estimator=model, X=X_train_norm, y=y_train,
                                  scoring="r2", cv=10).mean()

    maes.append(abs(validation_neg_mae))
    r2s.append(validation_r2)

    if validation_neg_mae > best_neg_mae:
        best_neg_mae = validation_neg_mae
        best_model_mae = model

    if validation_r2 > best_r2:
        best_r2 = validation_r2
        best_model_r2 = model

print("Best model based on MAE: ", best_model_mae)
print("Best MAE = ", abs(best_neg_mae))
print("Best model based on R2: ", best_model_r2)
print("Best R2 = ", best_r2)

Best model based on MAE:  KNeighborsRegressor(n_neighbors=18)
Best MAE =  13.00558473389356
Best model based on R2:  KNeighborsRegressor(n_neighbors=50)
Best R2 =  0.1362641932067436


In [15]:
best_model_mae = KNeighborsRegressor(n_neighbors=18)
best_model_mae.fit(X_train_norm, y_train)
y_pred_mae = best_model_mae.predict(X_test_norm)
print("Best model based on MAE: \nMAE = ", mean_absolute_error(y_true=y_test, y_pred=y_pred_mae))
print("R2 = ", r2_score(y_true=y_test, y_pred=y_pred_mae))
print("MAPE = ", mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_mae))

best_model_r2 = KNeighborsRegressor(n_neighbors=50)
best_model_r2.fit(X_train_norm, y_train)
y_pred_r2 = best_model_r2.predict(X_test_norm)
print("\nBest model based on R2: \nMAE = ", mean_absolute_error(y_true=y_test, y_pred=y_pred_r2))
print("R2 = ", r2_score(y_true=y_test, y_pred=y_pred_r2))
print("MAPE = ", mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_r2))

Best model based on MAE: 
MAE =  13.132777777777779
R2 =  0.13394135564298248
MAPE =  0.28130702191224877

Best model based on R2: 
MAE =  13.026666666666667
R2 =  0.1492424248948624
MAPE =  0.2820852642230883


Results of kNN on data with one-hot-encoding and feature selection using mutual information:
- MAE =  13.132777777777779
- R2 =  0.13394135564298248
- MAPE =  0.28130702191224877