In [None]:
'''
handcrafted_features_{lang}_multiple_experiments: The notebook with SVR + XGBoost and Lasso experiments.
'''

In [5]:
import sys
sys.path.insert(0, "../src/")
import numpy as np
import pandas as pd

extracted_features_dir = "../data/extracted_features/"
labels_dir = "../data/labels/"
lang = "ger"

book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_df.csv")
book_and_averaged_chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_and_averaged_chunk_df.csv")
chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_df.csv")
chunk_and_copied_book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_and_copied_book_df.csv")


In [6]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from copy import deepcopy
from utils import read_labels
from sklearn.metrics import mean_squared_error, mean_absolute_error

labels = read_labels(labels_dir)

class Experiment(object):
    def __init__(self, features, drop_columns_including, dimensionality_reduction, model, verbose):
        assert features in ["book", "chunk", "book_and_averaged_chunk", "chunk_and_copied_book"]
        assert isinstance(drop_columns_including, list)
        for i in drop_columns_including:
            assert isinstance(i, str)
        assert model in ["xgboost", "svr", "lasso"]
        assert (dimensionality_reduction in ["ss_pca_0_95", "k_best_f_reg_0_10", "k_best_mutual_info_0_10"]) or (dimensionality_reduction is None)
        self.features = features
        self.labels = labels
        self.drop_columns_including = drop_columns_including
        self.dimensionality_reduction = dimensionality_reduction
        self.model = model
        self.verbose = verbose

        if features == "book":
            self.df = deepcopy(book_df)
        elif features == "chunk":
            self.df = deepcopy(chunk_df)
        elif features == "chunk_and_copied_book":
            self.df = deepcopy(chunk_and_copied_book_df)
        elif features == "book_and_averaged_chunk":
            self.df = deepcopy(book_and_averaged_chunk_df)

        columns_before_drop = set(self.df.columns)
        self.df = self.df[[column for column in self.df.columns if not self._drop_column(column)]].reset_index(drop=True)
        columns_after_drop = set(self.df.columns)
        if self.verbose:
            print(f"Dropped {len(columns_before_drop - columns_after_drop)} columns.")
        self.df.loc[:, "y"] = self.df.book_name.apply(lambda x: self.labels[x]).tolist()

    def _drop_column(self, column):
        for string in self.drop_columns_including:
            if string in column:
                return True
        return False
    
    def _custom_pca(self, train_X):
        for i in range(5, train_X.shape[1], int((train_X.shape[1] - 5) / 10)):
            pca = PCA(n_components=i)
            new_train_X = pca.fit_transform(train_X)
            if pca.explained_variance_ratio_.sum() >= 0.95:
                break
        return new_train_X, pca

    def _select_features(self, train_X, train_y, validation_X):
        if self.dimensionality_reduction == "ss_pca_0_95":
            ss = StandardScaler()
            train_X = ss.fit_transform(train_X)
            validation_X = ss.transform(validation_X)
            train_X, pca = self._custom_pca(train_X)
            validation_X = pca.transform(validation_X)
        elif self.dimensionality_reduction == "k_best_f_reg_0_10":
            k_best = SelectKBest(f_regression, k=np.minimum(int(0.10 * train_X.shape[0]), train_X.shape[1]))
            train_X = k_best.fit_transform(train_X, train_y)
            validation_X = k_best.transform(validation_X)
        elif self.dimensionality_reduction == "k_best_mutual_info_0_10":
            k_best = SelectKBest(mutual_info_regression, k=np.minimum(int(0.10 * train_X.shape[0]), train_X.shape[1]))
            train_X = k_best.fit_transform(train_X, train_y)
            validation_X = k_best.transform(validation_X)
        elif self.dimensionality_reduction is None:
            pass
        return train_X, validation_X
    
    def _impute(self, train_X, validation_X):
        imputer = KNNImputer()
        train_X = imputer.fit_transform(train_X)
        validation_X = imputer.transform(validation_X)
        return train_X, validation_X
    
    def _get_model(self):
        # if any of these performs better than others, we can try to tune the hyperparameters
        # but I think for now it's more important to see which approach performs better
        # chunk based or doc based
        # use dimensionality reduction or not...
        if self.model == "xgboost":
            return XGBRegressor(n_estimators=1000, max_depth=4, learning_rate=0.01, colsample_bytree=0.33, min_child_weight=6)
        elif self.model == "svr":
            return SVR()
        elif self.model == "lasso":
            return Lasso()
            
    def run(self):
        all_predictions = []
        all_labels = []
        train_mses = []
        train_maes = []
        validation_mses = []
        validation_maes = []

        df = self.df
        book_names = df['book_name'].unique()
        book_names_splitted = np.array_split(book_names, 10)
        for index, split in enumerate(book_names_splitted):
            train_df = df[~df["book_name"].isin(split)]
            validation_df = df[df["book_name"].isin(split)]
            train_X = train_df.drop(columns=["y", "book_name"]).values
            train_y = train_df["y"].values.ravel()
            validation_X = validation_df.drop(columns=["y", "book_name"]).values
            validation_y = validation_df["y"].values.ravel()
            train_X, validation_X = self._impute(train_X, validation_X)
            if self.verbose:
                print(f"train_X.shape before {self.dimensionality_reduction}: {train_X.shape}, validation_X.shape before {self.dimensionality_reduction}: {validation_X.shape}")
            train_X, validation_X = self._select_features(train_X, train_y, validation_X)
            if self.verbose:
                print(f"train_X.shape after {self.dimensionality_reduction}: {train_X.shape}, validation_X.shape after {self.dimensionality_reduction}: {validation_X.shape}")
            model = self._get_model()
            model.fit(train_X, train_y)
            
            train_books = deepcopy(train_df[["book_name", "y"]])
            train_books["yhat"] = model.predict(train_X)
            validation_books = deepcopy(validation_df[["book_name", "y"]])
            validation_books["yhat"] = model.predict(validation_X)
            
            train_books = train_books.groupby("book_name").mean()
            validation_books = validation_books.groupby("book_name").mean()
            
            train_y = train_books["y"].tolist()
            train_yhat = train_books["yhat"].tolist()
            validation_y = validation_books["y"].tolist()
            validation_yhat = validation_books["yhat"].tolist()
            
            all_labels.extend(validation_y)
            all_predictions.extend(validation_yhat)
            
            train_mse = mean_squared_error(train_y, train_yhat)
            train_mae = mean_absolute_error(train_y, train_yhat)
            validation_mse = mean_squared_error(validation_y, validation_yhat)
            validation_mae = mean_absolute_error(validation_y, validation_yhat)
            train_mses.append(train_mse)
            train_maes.append(train_mae)
            validation_mses.append(validation_mse)
            validation_maes.append(validation_mae)
            if self.verbose:
                print(f"Fold: {index+1}, TrainMSE: {np.round(train_mse, 3)}, TrainMAE: {np.round(train_mae, 3)}, ValMSE: {np.round(validation_mse, 3)}, ValMAE: {np.round(validation_mae, 3)}")
        all_labels = np.array(all_labels)
        all_predictions = np.array(all_predictions)

        mean_train_mse = np.mean(train_mses)
        mean_train_mae = np.mean(train_maes)
        mean_validation_mse = np.mean(validation_mses)
        mean_validation_mae = np.mean(validation_maes)
        
        if self.verbose:
            print("------")
            print(f"Mean scores, TrainMSE: {np.round(mean_train_mse, 3)}, TrainMAE: {np.round(mean_train_mae, 3)}, ValMSE: {np.round(mean_validation_mse, 3)}, ValMAE: {np.round(mean_validation_mae, 3)}")

            plt.figure(figsize=(18, 6))
            plt.scatter(all_labels, all_predictions)
            plt.xlabel("Ground Truths")
            plt.ylabel("Predictions")

            plt.show();
        return mean_train_mse, mean_train_mae, mean_validation_mse, mean_validation_mae


In [7]:
results = []

for model in ["xgboost", "lasso", "svr"]:
    for features in ["book", "chunk", "book_and_averaged_chunk", "chunk_and_copied_book"]:
        for drop_columns_including in [["doc2vec_chunk_embedding"], ["average_sentence_embedding"], ["average_sentence_embedding", "doc2vec_chunk_embedding"], []]:
            for dimensionality_reduction in ["k_best_f_reg_0_10", "ss_pca_0_95", "k_best_mutual_info_0_10", None]:
                try:
                    experiment = Experiment(
                        features=features,
                        drop_columns_including=drop_columns_including,
                        dimensionality_reduction=dimensionality_reduction,
                        model=model,
                        verbose=False
                    )
                    train_mse, train_mae, validation_mse, validation_mae = experiment.run()
                    results.append((model, features, drop_columns_including, dimensionality_reduction, train_mse, train_mae, validation_mse, validation_mae))
                    print(model, features, drop_columns_including, dimensionality_reduction, train_mse, train_mae, validation_mse, validation_mae)
                except Exception as e:
                    print(f"Error in {model}, {features}, {drop_columns_including}, {dimensionality_reduction}")
                    print(e)


xgboost book ['doc2vec_chunk_embedding'] k_best_f_reg_0_10 0.004395106511642053 0.05092692407694423 0.040185477975627305 0.16258795326899295
xgboost book ['doc2vec_chunk_embedding'] ss_pca_0_95 0.0011776912780385369 0.02621916002843098 0.040495164287250356 0.16725242747499272
xgboost book ['doc2vec_chunk_embedding'] k_best_mutual_info_0_10 0.004147081527816702 0.04983046213595589 0.04321931183155643 0.1693928574920156
xgboost book ['doc2vec_chunk_embedding'] None 0.0007983102992170177 0.021689960666900935 0.03694347215024534 0.15884452520920178
xgboost book ['average_sentence_embedding'] k_best_f_reg_0_10 0.004395106511642053 0.05092692407694423 0.040185477975627305 0.16258795326899295
xgboost book ['average_sentence_embedding'] ss_pca_0_95 0.0011776912780385369 0.02621916002843098 0.040495164287250356 0.16725242747499272
xgboost book ['average_sentence_embedding'] k_best_mutual_info_0_10 0.0040962145676739295 0.049768781270957005 0.04365493234389655 0.17098115083628876
xgboost book ['

xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_f_reg_0_10 0.0024583196165833695 0.03509264440133396 0.03897736558918146 0.16207895309677173
xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] ss_pca_0_95 0.004482793275751634 0.049384577031932414 0.04176112297048287 0.16789881430923614
xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_mutual_info_0_10 0.0024583196165833695 0.03509264440133396 0.03897736558918146 0.16207895309677173
xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] None 0.0024583196165833695 0.03509264440133396 0.03897736558918146 0.16207895309677173
xgboost chunk_and_copied_book [] k_best_f_reg_0_10 0.002649411573919099 0.03655064165795964 0.03864627318922971 0.1614513541671529
xgboost chunk_and_copied_book [] ss_pca_0_95 0.014907325633593756 0.1003449887710087 0.0404012016844716 0.16795442940136698
xgboos

lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] ss_pca_0_95 0.0528496278591122 0.1995759717632932 0.05303440043751308 0.2000230031932908
lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] k_best_mutual_info_0_10 0.04927321698542199 0.1892678622986396 0.053602162643975826 0.19881491135217183
lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] None 0.04927321698542199 0.18926786229863962 0.05360216264397581 0.19881491135217183
lasso chunk_and_copied_book ['average_sentence_embedding'] k_best_f_reg_0_10 0.04927321698542199 0.1892678622986396 0.053602162643975826 0.19881491135217183
lasso chunk_and_copied_book ['average_sentence_embedding'] ss_pca_0_95 0.0528496278591122 0.1995759717632932 0.05303440043751308 0.2000230031932908
lasso chunk_and_copied_book ['average_sentence_embedding'] k_best_mutual_info_0_10 0.04927321698542199 0.1892678622986396 0.053602162643975826 0.19881491135217183
lasso chunk_and_copied_book ['average_sentence_embedding'] None 0.04927321698542199 0.

svr book_and_averaged_chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] None 0.04749694968949098 0.18491962468971973 0.04844422031097721 0.18733980957967036
svr book_and_averaged_chunk [] k_best_f_reg_0_10 0.04663459370103118 0.1829228849472512 0.04863648258715643 0.18726967913963075
svr book_and_averaged_chunk [] ss_pca_0_95 0.007899227381140664 0.08421214399198342 0.034252870736344185 0.15462065896215896
svr book_and_averaged_chunk [] k_best_mutual_info_0_10 0.034994985965253474 0.15215586397220254 0.05610946767646352 0.1945544845983054
svr book_and_averaged_chunk [] None 0.047498501241027336 0.1849250877195176 0.048440781767798494 0.1873355455575164
svr chunk_and_copied_book ['doc2vec_chunk_embedding'] k_best_f_reg_0_10 0.04744852984935617 0.18403653150026802 0.05087337499497664 0.1916884763132547
svr chunk_and_copied_book ['doc2vec_chunk_embedding'] ss_pca_0_95 0.006373685459671268 0.07512277134534298 0.034660256006429746 0.155499582129009
svr chunk_and_copied_book ['

# Validation MAE medians

In [8]:
results_df = pd.DataFrame(results, columns=["model", "features", "drop_columns_including", "dimensionality_reduction", "train_mse", "train_mae", "validation_mse", "validation_mae"])
results_df["drop_columns_including"] = results_df["drop_columns_including"].apply(str)


In [9]:
results_df.groupby("model").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
model,Unnamed: 1_level_1
lasso,0.197601
svr,0.191566
xgboost,0.164809


In [10]:
results_df.groupby("features").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
features,Unnamed: 1_level_1
book,0.180685
book_and_averaged_chunk,0.185938
chunk,0.191699
chunk_and_copied_book,0.191567


In [11]:
results_df.groupby("drop_columns_including").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
drop_columns_including,Unnamed: 1_level_1
"['average_sentence_embedding', 'doc2vec_chunk_embedding']",0.1897
['average_sentence_embedding'],0.188855
['doc2vec_chunk_embedding'],0.189721
[],0.189202


In [12]:
results_df.groupby("dimensionality_reduction").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
dimensionality_reduction,Unnamed: 1_level_1
k_best_f_reg_0_10,0.191691
k_best_mutual_info_0_10,0.191696
ss_pca_0_95,0.167252


# Best result

In [13]:
results_df[results_df.validation_mae == results_df.validation_mae.min()]

Unnamed: 0,model,features,drop_columns_including,dimensionality_reduction,train_mse,train_mae,validation_mse,validation_mae
149,svr,chunk,['average_sentence_embedding'],ss_pca_0_95,0.005143,0.064595,0.032431,0.148355


In [14]:
results_df.to_csv("../data/results/model_features_drop_columns_including_dimensionality_reduction_ger.csv", index=False)
