Training and testing kNN model on dataset without one-hot-encoded columns and using mutual information for feature selection

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Load data
data_csv = "fulldataset_updated.csv"
data_df = pd.read_csv(data_csv)
data_df = data_df.rename(columns={"Unnamed: 0":"index"})
data_df.head()

Unnamed: 0,index,imdbid,title,metascore,script department,production companies,writers,directors,casting directors,cast,...,Biography,Comedy,Crime,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,0,118661,The Avengers,12.0,"Sharon Mansfield, Anna Worley","Warner Bros., Jerry Weintraub Productions","Sydney Newman, Don MacPherson",Jeremiah S. Chechik,Susie Figgis,"Ralph Fiennes, Uma Thurman, Sean Connery, Patr...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1,215545,Bamboozled,54.0,"Shari L. Carpenter, Carolyn De Sousa","New Line Cinema, 40 Acres & A Mule Filmworks",Spike Lee,Spike Lee,Aisha Coley,"Damon Wayans, Savion Glover, Jada Pinkett Smit...",...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,118715,The Big Lebowski,71.0,T. Kukovinski,"Polygram Filmed Entertainment, Working Title F...","Ethan Coen, Joel Coen","Joel Coen, Ethan Coen",John S. Lyons,"Jeff Bridges, John Goodman, Julianne Moore, St...",...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,112573,Braveheart,68.0,"Sally Jones, Kate Pakenham, Anna Worley","Icon Entertainment International, The Ladd Com...",Randall Wallace,Mel Gibson,Patsy Pollock,"James Robinson, Sean Lawlor, Sandy Nelson, Jam...",...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,88847,The Breakfast Club,66.0,Bob Forrest,"Universal Pictures, A&M Films, Channel Product...",John Hughes,John Hughes,Jackie Burch,"Emilio Estevez, Paul Gleason, Anthony Michael ...",...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Data preprocessing
# Drop unrelated columns
data_df = data_df.drop(["index", "imdbid", "title", "script department",
                        "production companies", "writers", "directors",
                        "casting directors", "cast"], axis=1)

# Drop rows with 0 locations
data_df = data_df[data_df["locations"] != 0]

In [8]:
data_df

Unnamed: 0,metascore,scenes,characters,percent dialogue,locations,Positive,Anger,Disgust,Fear,Negative,...,Biography,Comedy,Crime,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,12.0,256.0,24.0,0.348611,109,596.0,235.0,158.0,402.0,590.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,54.0,284.0,98.0,0.645387,65,761.0,260.0,155.0,291.0,600.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,71.0,58.0,47.0,0.543055,2,563.0,277.0,155.0,239.0,541.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,68.0,178.0,76.0,0.314321,92,911.0,461.0,248.0,585.0,927.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,66.0,48.0,15.0,0.698996,15,318.0,158.0,150.0,158.0,391.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1498,61.0,237.0,96.0,0.457791,3,567.0,152.0,115.0,194.0,407.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1499,58.0,234.0,69.0,0.509856,86,543.0,154.0,144.0,199.0,417.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1500,67.0,179.0,37.0,0.463576,90,560.0,144.0,111.0,200.0,370.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1501,73.0,109.0,37.0,0.403134,37,603.0,290.0,180.0,400.0,641.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [9]:
# Seeing columns
data_df.columns

Index(['metascore', 'scenes', 'characters', 'percent dialogue', 'locations',
       'Positive', 'Anger', 'Disgust', 'Fear', 'Negative', 'Sadness',
       'Anticipation', 'Joy', 'Surprise', 'Trust', 'Top 3', 'Action',
       'Adventure', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy',
       'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller'],
      dtype='object')

In [10]:
# Getting features and label
X_df = data_df.loc[:, data_df.columns != "metascore"]
y_df = data_df["metascore"]

In [11]:
# Finding mutual information of features
from sklearn.feature_selection import mutual_info_regression
mutual_info = mutual_info_regression(X_df, y_df, random_state=42)
print(mutual_info)

[0.         0.         0.04074938 0.         0.         0.
 0.         0.01465435 0.03861587 0.         0.02377908 0.02087821
 0.02261554 0.01348406 0.01170768 0.03785802 0.01661965 0.0093204
 0.         0.         0.0368383  0.00694141 0.03414496 0.
 0.         0.         0.02912408]


In [12]:
# Dropping columns with 0 mutual info
feature_names = X_df.columns.tolist()

# Finding indices of features with 0 mutual info
indices_to_drop = []
for i in range(len(mutual_info)):
    if (mutual_info[i] == 0):
        indices_to_drop.append(i)

features_to_drop = []
for i in indices_to_drop:
    features_to_drop.append(feature_names[i])
    
# Drop columns
X_df = X_df.drop(features_to_drop, axis=1)

In [13]:
# Getting values of features and labels
X = X_df.values
y = y_df.values
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

Shape of X:  (1496, 15)
Shape of y:  (1496,)


In [14]:
# train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Data preprocessing
# Normalization: Min-max normalization
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
minmaxscaler = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = minmaxscaler.transform(X_train)

# transform test data
X_test_norm = minmaxscaler.transform(X_test)

In [16]:
# import libraries for model selection
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error

In [17]:
# hyperparameter values to try
Ks = np.arange(1,100)

In [18]:
# model selection
best_model_mae = None
best_model_r2 = None
best_neg_mae = -np.inf
best_r2 = -np.inf
maes = []
r2s = []

for K in Ks:
    model = KNeighborsRegressor(n_neighbors=K)
    validation_neg_mae = cross_val_score(estimator=model, X=X_train_norm, y=y_train,
                                   scoring="neg_mean_absolute_error", cv=10).mean()
    validation_r2 = cross_val_score(estimator=model, X=X_train_norm, y=y_train,
                                  scoring="r2", cv=10).mean()

    maes.append(abs(validation_neg_mae))
    r2s.append(validation_r2)

    if validation_neg_mae > best_neg_mae:
        best_neg_mae = validation_neg_mae
        best_model_mae = model

    if validation_r2 > best_r2:
        best_r2 = validation_r2
        best_model_r2 = model

print("Best model based on MAE: ", best_model_mae)
print("Best MAE = ", abs(best_neg_mae))
print("Best model based on R2: ", best_model_r2)
print("Best R2 = ", best_r2)

Best model based on MAE:  KNeighborsRegressor(n_neighbors=48)
Best MAE =  13.088234856442577
Best model based on R2:  KNeighborsRegressor(n_neighbors=48)
Best R2 =  0.13085465645722497


In [19]:
best_model_mae = KNeighborsRegressor(n_neighbors=48)
best_model_mae.fit(X_train_norm, y_train)
y_pred_mae = best_model_mae.predict(X_test_norm)
print("Best model based on MAE: \nMAE = ", mean_absolute_error(y_true=y_test, y_pred=y_pred_mae))
print("R2 = ", r2_score(y_true=y_test, y_pred=y_pred_mae))
print("MAPE = ", mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_mae))

best_model_r2 = KNeighborsRegressor(n_neighbors=48)
best_model_r2.fit(X_train_norm, y_train)
y_pred_r2 = best_model_r2.predict(X_test_norm)
print("\nBest model based on R2: \nMAE = ", mean_absolute_error(y_true=y_test, y_pred=y_pred_r2))
print("R2 = ", r2_score(y_true=y_test, y_pred=y_pred_r2))
print("MAPE = ", mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_r2))

Best model based on MAE: 
MAE =  13.038680555555555
R2 =  0.14558476676650045
MAPE =  0.28623948299954494

Best model based on R2: 
MAE =  13.038680555555555
R2 =  0.14558476676650045
MAPE =  0.28623948299954494


Results of kNN on data without one-hot-encoding and feature selection using mutual information:
- MAE =  13.038680555555555
- R2 =  0.14558476676650045
- MAPE =  0.28623948299954494