In [None]:
'''
handcrafted_features_{lang}_multiple_experiments: The notebook with SVR + XGBoost and Lasso experiments.
''''

In [1]:
import sys
sys.path.insert(0, "../src/")
import numpy as np
import pandas as pd

extracted_features_dir = "../data/extracted_features/"
labels_dir = "../data/labels/"
lang = "eng"

book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_df.csv")
book_and_averaged_chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_and_averaged_chunk_df.csv")
chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_df.csv")
chunk_and_copied_book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_and_copied_book_df.csv")


In [6]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from copy import deepcopy
from utils import read_labels
from sklearn.metrics import mean_squared_error, mean_absolute_error

labels = read_labels(labels_dir)

class Experiment(object):
    def __init__(self, features, drop_columns_including, dimensionality_reduction, model, verbose):
        assert features in ["book", "chunk", "book_and_averaged_chunk", "chunk_and_copied_book"]
        assert isinstance(drop_columns_including, list)
        for i in drop_columns_including:
            assert isinstance(i, str)
        assert model in ["xgboost", "svr", "lasso"]
        assert (dimensionality_reduction in ["ss_pca_0_95", "k_best_f_reg_0_10", "k_best_mutual_info_0_10"]) or (dimensionality_reduction is None)
        self.features = features
        self.labels = labels
        self.drop_columns_including = drop_columns_including
        self.dimensionality_reduction = dimensionality_reduction
        self.model = model
        self.verbose = verbose

        if features == "book":
            self.df = deepcopy(book_df)
        elif features == "chunk":
            self.df = deepcopy(chunk_df)
        elif features == "chunk_and_copied_book":
            self.df = deepcopy(chunk_and_copied_book_df)
        elif features == "book_and_averaged_chunk":
            self.df = deepcopy(book_and_averaged_chunk_df)

        columns_before_drop = set(self.df.columns)
        self.df = self.df[[column for column in self.df.columns if not self._drop_column(column)]].reset_index(drop=True)
        columns_after_drop = set(self.df.columns)
        if self.verbose:
            print(f"Dropped {len(columns_before_drop - columns_after_drop)} columns.")
        self.df.loc[:, "y"] = self.df.book_name.apply(lambda x: self.labels[x]).tolist()

    def _drop_column(self, column):
        for string in self.drop_columns_including:
            if string in column:
                return True
        return False
    
    def _custom_pca(self, train_X):
        for i in range(5, train_X.shape[1], int((train_X.shape[1] - 5) / 10)):
            pca = PCA(n_components=i)
            new_train_X = pca.fit_transform(train_X)
            if pca.explained_variance_ratio_.sum() >= 0.95:
                break
        return new_train_X, pca

    def _select_features(self, train_X, train_y, validation_X):
        if self.dimensionality_reduction == "ss_pca_0_95":
            ss = StandardScaler()
            train_X = ss.fit_transform(train_X)
            validation_X = ss.transform(validation_X)
            train_X, pca = self._custom_pca(train_X)
            validation_X = pca.transform(validation_X)
        elif self.dimensionality_reduction == "k_best_f_reg_0_10":
            k_best = SelectKBest(f_regression, k=np.minimum(int(0.10 * train_X.shape[0]), train_X.shape[1]))
            train_X = k_best.fit_transform(train_X, train_y)
            validation_X = k_best.transform(validation_X)
        elif self.dimensionality_reduction == "k_best_mutual_info_0_10":
            k_best = SelectKBest(mutual_info_regression, k=np.minimum(int(0.10 * train_X.shape[0]), train_X.shape[1]))
            train_X = k_best.fit_transform(train_X, train_y)
            validation_X = k_best.transform(validation_X)
        elif self.dimensionality_reduction is None:
            pass
        return train_X, validation_X
    
    def _impute(self, train_X, validation_X):
        imputer = KNNImputer()
        train_X = imputer.fit_transform(train_X)
        validation_X = imputer.transform(validation_X)
        return train_X, validation_X
    
    def _get_model(self):
        # if any of these performs better than others, we can try to tune the hyperparameters
        # but I think for now it's more important to see which approach performs better
        # chunk based or doc based
        # use dimensionality reduction or not...
        if self.model == "xgboost":
            return XGBRegressor(n_estimators=1000, max_depth=4, learning_rate=0.01, colsample_bytree=0.33, min_child_weight=6)
        elif self.model == "svr":
            return SVR()
        elif self.model == "lasso":
            return Lasso()
            
    def run(self):
        all_predictions = []
        all_labels = []
        train_mses = []
        train_maes = []
        validation_mses = []
        validation_maes = []

        df = self.df
        book_names = df['book_name'].unique()
        book_names_splitted = np.array_split(book_names, 10)
        for index, split in enumerate(book_names_splitted):
            train_df = df[~df["book_name"].isin(split)]
            validation_df = df[df["book_name"].isin(split)]
            train_X = train_df.drop(columns=["y", "book_name"]).values
            train_y = train_df["y"].values.ravel()
            validation_X = validation_df.drop(columns=["y", "book_name"]).values
            validation_y = validation_df["y"].values.ravel()
            train_X, validation_X = self._impute(train_X, validation_X)
            if self.verbose:
                print(f"train_X.shape before {self.dimensionality_reduction}: {train_X.shape}, validation_X.shape before {self.dimensionality_reduction}: {validation_X.shape}")
            train_X, validation_X = self._select_features(train_X, train_y, validation_X)
            if self.verbose:
                print(f"train_X.shape after {self.dimensionality_reduction}: {train_X.shape}, validation_X.shape after {self.dimensionality_reduction}: {validation_X.shape}")
            model = self._get_model()
            model.fit(train_X, train_y)
            
            train_books = deepcopy(train_df[["book_name", "y"]])
            train_books["yhat"] = model.predict(train_X)
            validation_books = deepcopy(validation_df[["book_name", "y"]])
            validation_books["yhat"] = model.predict(validation_X)
            
            train_books = train_books.groupby("book_name").mean()
            validation_books = validation_books.groupby("book_name").mean()
            
            train_y = train_books["y"].tolist()
            train_yhat = train_books["yhat"].tolist()
            validation_y = validation_books["y"].tolist()
            validation_yhat = validation_books["yhat"].tolist()
            
            all_labels.extend(validation_y)
            all_predictions.extend(validation_yhat)
            
            train_mse = mean_squared_error(train_y, train_yhat)
            train_mae = mean_absolute_error(train_y, train_yhat)
            validation_mse = mean_squared_error(validation_y, validation_yhat)
            validation_mae = mean_absolute_error(validation_y, validation_yhat)
            train_mses.append(train_mse)
            train_maes.append(train_mae)
            validation_mses.append(validation_mse)
            validation_maes.append(validation_mae)
            if self.verbose:
                print(f"Fold: {index+1}, TrainMSE: {np.round(train_mse, 3)}, TrainMAE: {np.round(train_mae, 3)}, ValMSE: {np.round(validation_mse, 3)}, ValMAE: {np.round(validation_mae, 3)}")
        all_labels = np.array(all_labels)
        all_predictions = np.array(all_predictions)

        mean_train_mse = np.mean(train_mses)
        mean_train_mae = np.mean(train_maes)
        mean_validation_mse = np.mean(validation_mses)
        mean_validation_mae = np.mean(validation_maes)
        
        if self.verbose:
            print("------")
            print(f"Mean scores, TrainMSE: {np.round(mean_train_mse, 3)}, TrainMAE: {np.round(mean_train_mae, 3)}, ValMSE: {np.round(mean_validation_mse, 3)}, ValMAE: {np.round(mean_validation_mae, 3)}")

            plt.figure(figsize=(18, 6))
            plt.scatter(all_labels, all_predictions)
            plt.xlabel("Ground Truths")
            plt.ylabel("Predictions")

            plt.show();
        return mean_train_mse, mean_train_mae, mean_validation_mse, mean_validation_mae


In [7]:
results = []

for model in ["xgboost", "lasso", "svr"]:
    for features in ["book", "chunk", "book_and_averaged_chunk", "chunk_and_copied_book"]:
        for drop_columns_including in [["doc2vec_chunk_embedding"], ["average_sentence_embedding"], ["average_sentence_embedding", "doc2vec_chunk_embedding"], []]:
            for dimensionality_reduction in ["k_best_f_reg_0_10", "ss_pca_0_95", "k_best_mutual_info_0_10", None]:
                try:
                    experiment = Experiment(
                        features=features,
                        drop_columns_including=drop_columns_including,
                        dimensionality_reduction=dimensionality_reduction,
                        model=model,
                        verbose=False
                    )
                    train_mse, train_mae, validation_mse, validation_mae = experiment.run()
                    results.append((model, features, drop_columns_including, dimensionality_reduction, train_mse, train_mae, validation_mse, validation_mae))
                    print(model, features, drop_columns_including, dimensionality_reduction, train_mse, train_mae, validation_mse, validation_mae)
                except Exception as e:
                    print(f"Error in {model}, {features}, {drop_columns_including}, {dimensionality_reduction}")
                    print(e)


xgboost book ['doc2vec_chunk_embedding'] k_best_f_reg_0_10 0.007133408285458613 0.0656092346486499 0.05977546238405636 0.20166857270415464
xgboost book ['doc2vec_chunk_embedding'] ss_pca_0_95 0.0016210183090117516 0.030405676496936866 0.0495283860605417 0.187738443176378
xgboost book ['doc2vec_chunk_embedding'] k_best_mutual_info_0_10 0.006273830491713593 0.06201821568921775 0.05083688526082479 0.18770011694352967
xgboost book ['doc2vec_chunk_embedding'] None 0.0016624482070526163 0.031067013389104326 0.04612340627268315 0.17566151219248213
xgboost book ['average_sentence_embedding'] k_best_f_reg_0_10 0.007133408285458613 0.0656092346486499 0.05977546238405636 0.20166857270415464
xgboost book ['average_sentence_embedding'] ss_pca_0_95 0.0016390389217485677 0.03069648753860415 0.0500785703997217 0.1895349807527365
xgboost book ['average_sentence_embedding'] k_best_mutual_info_0_10 0.006555128746860775 0.0632169839139057 0.05083808425992638 0.18696752956466273
xgboost book ['average_sent

xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_f_reg_0_10 0.003496337613327732 0.04137054851123774 0.046726785296199655 0.18134882433624933
xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] ss_pca_0_95 0.00667763461001052 0.06148068387284082 0.050091518884537486 0.18893879621794973
xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_mutual_info_0_10 0.003496337613327732 0.04137054851123774 0.046726785296199655 0.18134882433624933
xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] None 0.003496337613327732 0.04137054851123774 0.046726785296199655 0.18134882433624933
xgboost chunk_and_copied_book [] k_best_f_reg_0_10 0.003478012658267012 0.04115119005358562 0.04645699691887867 0.18043869785890737
xgboost chunk_and_copied_book [] ss_pca_0_95 0.02331329980551274 0.12876361439676337 0.05232317350572098 0.19652471708985264
xgboo

lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] ss_pca_0_95 0.07141060083026177 0.23526281720855602 0.07148610656755536 0.23539624272318652
lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] k_best_mutual_info_0_10 0.06431562515832719 0.21919303557227904 0.06770319930963595 0.22540528250737157
lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] None 0.06431562515832719 0.21919303557227904 0.06770319930963595 0.22540528250737157
lasso chunk_and_copied_book ['average_sentence_embedding'] k_best_f_reg_0_10 0.06431562515832719 0.21919303557227904 0.06770319930963595 0.22540528250737157
lasso chunk_and_copied_book ['average_sentence_embedding'] ss_pca_0_95 0.07141060083026177 0.23526281720855602 0.07148610656755536 0.23539624272318652
lasso chunk_and_copied_book ['average_sentence_embedding'] k_best_mutual_info_0_10 0.06431562515832719 0.21919303557227904 0.06770319930963595 0.22540528250737157
lasso chunk_and_copied_book ['average_sentence_embedding'] None 0.06431562515832

svr book_and_averaged_chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] None 0.06668238100846155 0.22380352609924364 0.06803514625867442 0.22584686374350413
svr book_and_averaged_chunk [] k_best_f_reg_0_10 0.067335024054166 0.22518204280930615 0.0686342229322725 0.2271497442813239
svr book_and_averaged_chunk [] ss_pca_0_95 0.008760562744018702 0.08775512641531266 0.04473491218005493 0.17608992087223302
svr book_and_averaged_chunk [] k_best_mutual_info_0_10 0.06712908164770026 0.22473326197611848 0.06835255473585128 0.2266082644899648
svr book_and_averaged_chunk [] None 0.06668631960964425 0.2237992864635979 0.06803006977610852 0.22582621836970523
svr chunk_and_copied_book ['doc2vec_chunk_embedding'] k_best_f_reg_0_10 0.06311238708128833 0.21945283887757583 0.0667885852993163 0.22578675467208753
svr chunk_and_copied_book ['doc2vec_chunk_embedding'] ss_pca_0_95 0.006416402616094307 0.07546138585535112 0.04535496027898896 0.17838491605340673
svr chunk_and_copied_book ['doc2v

# Validation MAE medians

In [8]:
results_df = pd.DataFrame(results, columns=["model", "features", "drop_columns_including", "dimensionality_reduction", "train_mse", "train_mae", "validation_mse", "validation_mae"])
results_df["drop_columns_including"] = results_df["drop_columns_including"].apply(str)


In [9]:
results_df.groupby("model").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
model,Unnamed: 1_level_1
lasso,0.229488
svr,0.225771
xgboost,0.188339


In [10]:
results_df.groupby("features").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
features,Unnamed: 1_level_1
book,0.210961
book_and_averaged_chunk,0.218161
chunk,0.233695
chunk_and_copied_book,0.225405


In [11]:
results_df.groupby("drop_columns_including").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
drop_columns_including,Unnamed: 1_level_1
"['average_sentence_embedding', 'doc2vec_chunk_embedding']",0.223596
['average_sentence_embedding'],0.220751
['doc2vec_chunk_embedding'],0.220822
[],0.220647


In [12]:
results_df.groupby("dimensionality_reduction").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
dimensionality_reduction,Unnamed: 1_level_1
k_best_f_reg_0_10,0.22636
k_best_mutual_info_0_10,0.223787
ss_pca_0_95,0.193704


# Best result

In [13]:
results_df[results_df.validation_mae == results_df.validation_mae.min()]

Unnamed: 0,model,features,drop_columns_including,dimensionality_reduction,train_mse,train_mae,validation_mse,validation_mae
165,svr,book_and_averaged_chunk,['average_sentence_embedding'],ss_pca_0_95,0.008561,0.086624,0.041147,0.168906


In [14]:
results_df.to_csv("../data/results/model_features_drop_columns_including_dimensionality_reduction_eng.csv", index=False)
