In [1]:
import os
import sys
sys.path.insert(0, "../src/")
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from handcrafted_features import DocBasedFeatureExtractor, Doc2VecChunkVectorizer, CorpusBasedFeatureExtractor
from utils import get_doc_paths, read_labels

raw_docs_dir = "../data/raw_docs/"
labels_dir = "../data/labels/"
extracted_features_dir = "../data/extracted_features/"

lang = "ger"
doc_paths = get_doc_paths(raw_docs_dir, lang)

sentences_per_chunk = 200
# d2vcv = Doc2VecChunkVectorizer(lang, sentences_per_chunk)
# d2vcv.fit_transform(doc_paths)




In [2]:
all_chunk_based_features = []
all_book_based_features = []
all_average_sbert_sentence_embeddings = []
all_doc2vec_chunk_embeddings = []
for doc_path in tqdm(doc_paths):
    fe = DocBasedFeatureExtractor(lang, doc_path, sentences_per_chunk)
    chunk_based_features, book_based_features, average_sbert_sentence_embeddings, doc2vec_chunk_embeddings = fe.get_all_features()
    all_chunk_based_features.extend(chunk_based_features)
    all_book_based_features.append(book_based_features)
    all_average_sbert_sentence_embeddings.append(average_sbert_sentence_embeddings)
    all_doc2vec_chunk_embeddings.append(doc2vec_chunk_embeddings)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 547/547 [59:11<00:00,  6.49s/it]  


In [4]:
%load_ext autoreload
%autoreload 2


In [6]:
cbfe = CorpusBasedFeatureExtractor(lang, doc_paths, all_average_sbert_sentence_embeddings, all_doc2vec_chunk_embeddings)
all_corpus_based_features = cbfe.get_all_features()


100%|██████████| 547/547 [05:29<00:00,  1.66it/s]
100%|██████████| 547/547 [03:56<00:00,  2.31it/s]
100%|██████████| 547/547 [01:01<00:00,  8.87it/s]
100%|██████████| 547/547 [00:44<00:00, 12.31it/s]
INFO:gensim.models.ldamodel:using symmetric alpha at 0.1
INFO:gensim.models.ldamodel:using symmetric eta at 0.1
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamulticore:running online LDA training, 10 topics, 2 passes over the supplied corpus of 547 documents, updating every 6000 documents, evaluating every ~547 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamulticore:training LDA model using 3 processes
INFO:gensim.models.ldamulticore:PROGRESS: pass 0, dispatched chunk #0 = documents up to #547/547, outstanding queue size 1
DEBUG:gensim.models.ldamodel:updating topics
INFO:gensim.models.ldamodel:topic #5 (0.100): 0.003*"seyn" + 0.002*"bey" + 0.001*"wol" + 0.001*"anna" + 0.001*"nit" + 0.001*"vogt" + 0.001*"ka

In [7]:
import os
import pandas as pd

book_df = pd.DataFrame(all_book_based_features)
book_df = book_df.merge(all_corpus_based_features, on="book_name")
book_and_averaged_chunk_df = book_df.merge(pd.DataFrame(all_chunk_based_features).groupby("book_name").mean().reset_index(drop=False), on="book_name")

chunk_df = pd.DataFrame(all_chunk_based_features)
chunk_and_copied_book_df = chunk_df.merge(pd.DataFrame(all_book_based_features), on="book_name")
chunk_and_copied_book_df = chunk_and_copied_book_df.merge(all_corpus_based_features, on="book_name")

os.makedirs(f"{extracted_features_dir}/{lang}", exist_ok=True)
book_df.to_csv(f"{extracted_features_dir}/{lang}/book_df.csv", index=False)
book_and_averaged_chunk_df.to_csv(f"{extracted_features_dir}/{lang}/book_and_averaged_chunk_df.csv", index=False)
chunk_df.to_csv(f"{extracted_features_dir}/{lang}/chunk_df.csv", index=False)
chunk_and_copied_book_df.to_csv(f"{extracted_features_dir}/{lang}/chunk_and_copied_book_df.csv", index=False)


In [8]:
import sys
sys.path.insert(0, "../src/")
import numpy as np
import pandas as pd

extracted_features_dir = "../data/extracted_features/"
labels_dir = "../data/labels/"
lang = "ger"

book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_df.csv")
book_and_averaged_chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_and_averaged_chunk_df.csv")
chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_df.csv")
chunk_and_copied_book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_and_copied_book_df.csv")


In [9]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from copy import deepcopy
from utils import read_labels
from sklearn.metrics import mean_squared_error, mean_absolute_error

labels = read_labels(labels_dir, lang)

class Experiment(object):
    def __init__(self, features, drop_columns_including, dimensionality_reduction, model, verbose):
        assert features in ["book", "chunk", "book_and_averaged_chunk", "chunk_and_copied_book"]
        assert isinstance(drop_columns_including, list)
        for i in drop_columns_including:
            assert isinstance(i, str)
        assert model in ["xgboost", "svr", "lasso"]
        assert (dimensionality_reduction in ["ss_pca_0_95", "k_best_f_reg_0_10", "k_best_mutual_info_0_10"]) or (dimensionality_reduction is None)
        self.features = features
        self.labels = labels
        self.drop_columns_including = drop_columns_including
        self.dimensionality_reduction = dimensionality_reduction
        self.model = model
        self.verbose = verbose

        if features == "book":
            self.df = deepcopy(book_df)
        elif features == "chunk":
            self.df = deepcopy(chunk_df)
        elif features == "chunk_and_copied_book":
            self.df = deepcopy(chunk_and_copied_book_df)
        elif features == "book_and_averaged_chunk":
            self.df = deepcopy(book_and_averaged_chunk_df)

        columns_before_drop = set(self.df.columns)
        self.df = self.df[[column for column in self.df.columns if not self._drop_column(column)]].reset_index(drop=True)
        columns_after_drop = set(self.df.columns)
        if self.verbose:
            print(f"Dropped {len(columns_before_drop - columns_after_drop)} columns.")
        self.df.loc[:, "y"] = self.df.book_name.apply(lambda x: self.labels[x]).tolist()

    def _drop_column(self, column):
        for string in self.drop_columns_including:
            if string in column:
                return True
        return False
    
    def _custom_pca(self, train_X):
        for i in range(5, train_X.shape[1], int((train_X.shape[1] - 5) / 10)):
            pca = PCA(n_components=i)
            new_train_X = pca.fit_transform(train_X)
            if pca.explained_variance_ratio_.sum() >= 0.95:
                break
        return new_train_X, pca

    def _select_features(self, train_X, train_y, validation_X):
        if self.dimensionality_reduction == "ss_pca_0_95":
            ss = StandardScaler()
            train_X = ss.fit_transform(train_X)
            validation_X = ss.transform(validation_X)
            train_X, pca = self._custom_pca(train_X)
            validation_X = pca.transform(validation_X)
        elif self.dimensionality_reduction == "k_best_f_reg_0_10":
            k_best = SelectKBest(f_regression, k=np.minimum(int(0.10 * train_X.shape[0]), train_X.shape[1]))
            train_X = k_best.fit_transform(train_X, train_y)
            validation_X = k_best.transform(validation_X)
        elif self.dimensionality_reduction == "k_best_mutual_info_0_10":
            k_best = SelectKBest(mutual_info_regression, k=np.minimum(int(0.10 * train_X.shape[0]), train_X.shape[1]))
            train_X = k_best.fit_transform(train_X, train_y)
            validation_X = k_best.transform(validation_X)
        elif self.dimensionality_reduction is None:
            pass
        return train_X, validation_X
    
    def _impute(self, train_X, validation_X):
        imputer = KNNImputer()
        train_X = imputer.fit_transform(train_X)
        validation_X = imputer.transform(validation_X)
        return train_X, validation_X
    
    def _get_model(self):
        # if any of these performs better than others, we can try to tune the hyperparameters
        # but I think for now it's more important to see which approach performs better
        # chunk based or doc based
        # use dimensionality reduction or not...
        if self.model == "xgboost":
            return XGBRegressor()
        elif self.model == "svr":
            return SVR()
        elif self.model == "lasso":
            return Lasso()
            
    def run(self):
        all_predictions = []
        all_labels = []
        train_mses = []
        train_maes = []
        validation_mses = []
        validation_maes = []

        df = self.df
        book_names = df['book_name'].unique()
        book_names_splitted = np.array_split(book_names, 10)
        for index, split in enumerate(book_names_splitted):
            train_df = df[~df["book_name"].isin(split)]
            validation_df = df[df["book_name"].isin(split)]
            train_X = train_df.drop(columns=["y", "book_name"]).values
            train_y = train_df["y"].values.ravel()
            validation_X = validation_df.drop(columns=["y", "book_name"]).values
            validation_y = validation_df["y"].values.ravel()
            train_X, validation_X = self._impute(train_X, validation_X)
            if self.verbose:
                print(f"train_X.shape before {self.dimensionality_reduction}: {train_X.shape}, validation_X.shape before {self.dimensionality_reduction}: {validation_X.shape}")
            train_X, validation_X = self._select_features(train_X, train_y, validation_X)
            if self.verbose:
                print(f"train_X.shape after {self.dimensionality_reduction}: {train_X.shape}, validation_X.shape after {self.dimensionality_reduction}: {validation_X.shape}")
            model = self._get_model()
            model.fit(train_X, train_y)
            
            train_books = deepcopy(train_df[["book_name", "y"]])
            train_books["yhat"] = model.predict(train_X)
            validation_books = deepcopy(validation_df[["book_name", "y"]])
            validation_books["yhat"] = model.predict(validation_X)
            
            train_books = train_books.groupby("book_name").mean()
            validation_books = validation_books.groupby("book_name").mean()
            
            train_y = train_books["y"].tolist()
            train_yhat = train_books["yhat"].tolist()
            validation_y = validation_books["y"].tolist()
            validation_yhat = validation_books["yhat"].tolist()
            
            all_labels.extend(validation_y)
            all_predictions.extend(validation_yhat)
            
            train_mse = mean_squared_error(train_y, train_yhat)
            train_mae = mean_absolute_error(train_y, train_yhat)
            validation_mse = mean_squared_error(validation_y, validation_yhat)
            validation_mae = mean_absolute_error(validation_y, validation_yhat)
            train_mses.append(train_mse)
            train_maes.append(train_mae)
            validation_mses.append(validation_mse)
            validation_maes.append(validation_mae)
            if self.verbose:
                print(f"Fold: {index+1}, TrainMSE: {np.round(train_mse, 3)}, TrainMAE: {np.round(train_mae, 3)}, ValMSE: {np.round(validation_mse, 3)}, ValMAE: {np.round(validation_mae, 3)}")
        all_labels = np.array(all_labels)
        all_predictions = np.array(all_predictions)

        mean_train_mse = np.mean(train_mses)
        mean_train_mae = np.mean(train_maes)
        mean_validation_mse = np.mean(validation_mses)
        mean_validation_mae = np.mean(validation_maes)
        
        if self.verbose:
            print("------")
            print(f"Mean scores, TrainMSE: {np.round(mean_train_mse, 3)}, TrainMAE: {np.round(mean_train_mae, 3)}, ValMSE: {np.round(mean_validation_mse, 3)}, ValMAE: {np.round(mean_validation_mae, 3)}")

            plt.figure(figsize=(18, 6))
            plt.scatter(all_labels, all_predictions)
            plt.xlabel("Ground Truths")
            plt.ylabel("Predictions")

            plt.show();
        return mean_train_mse, mean_train_mae, mean_validation_mse, mean_validation_mae


DEBUG:matplotlib:matplotlib data path: /Users/arda/conda_root/lib/python3.8/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/Users/arda/.matplotlib
DEBUG:matplotlib:matplotlib version 3.4.2
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is darwin


DEBUG:matplotlib:CACHEDIR=/Users/arda/.matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /Users/arda/.matplotlib/fontlist-v330.json
DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.
DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [10]:
results = []

for model in ["xgboost", "lasso", "svr"]:
    for features in ["book", "chunk", "book_and_averaged_chunk", "chunk_and_copied_book"]:
        for drop_columns_including in [["doc2vec_chunk_embedding"], ["average_sentence_embedding"], ["average_sentence_embedding", "doc2vec_chunk_embedding"], []]:
            for dimensionality_reduction in ["k_best_f_reg_0_10", "ss_pca_0_95", "k_best_mutual_info_0_10", None]:
                try:
                    experiment = Experiment(
                        features=features,
                        drop_columns_including=drop_columns_including,
                        dimensionality_reduction=dimensionality_reduction,
                        model=model,
                        verbose=False
                    )
                    train_mse, train_mae, validation_mse, validation_mae = experiment.run()
                    results.append((model, features, drop_columns_including, dimensionality_reduction, train_mse, train_mae, validation_mse, validation_mae))
                    print(model, features, drop_columns_including, dimensionality_reduction, train_mse, train_mae, validation_mse, validation_mae)
                except Exception as e:
                    print(f"Error in {model}, {features}, {drop_columns_including}, {dimensionality_reduction}")
                    print(e)


xgboost book ['doc2vec_chunk_embedding'] k_best_f_reg_0_10 4.807713217890235e-05 0.004537267108704961 513.1769281216007 18.033354267502393
xgboost book ['doc2vec_chunk_embedding'] ss_pca_0_95 1.6840018916110106e-07 0.0002819766959541119 477.2452824119606 17.09163827824862
xgboost book ['doc2vec_chunk_embedding'] k_best_mutual_info_0_10 2.7690748633603928e-05 0.0035070224199384506 464.61438151991024 17.174451790482557
xgboost book ['doc2vec_chunk_embedding'] None 1.5545129259084826e-07 0.0002857964579808374 449.5632570701091 16.587801288716502
xgboost book ['average_sentence_embedding'] k_best_f_reg_0_10 4.807713217890235e-05 0.004537267108704961 513.1769281216007 18.033354267502393
xgboost book ['average_sentence_embedding'] ss_pca_0_95 1.6840018916110106e-07 0.0002819766959541119 477.2452824119606 17.09163827824862
xgboost book ['average_sentence_embedding'] k_best_mutual_info_0_10 2.4857977585706182e-05 0.0033357986357484383 479.6701222332387 17.54422914242373
xgboost book ['average_

xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] ss_pca_0_95 0.031007798899965106 0.112649696823188 472.61147102582345 17.166272903673097
xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_mutual_info_0_10 2.6176621315105276e-05 0.003037159366923239 454.49476111114836 16.9610045832626
xgboost chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] None 2.6176621315105276e-05 0.003037159366923239 454.49476111114836 16.9610045832626
xgboost chunk_and_copied_book [] k_best_f_reg_0_10 3.5768181650435146e-05 0.00368765052008622 440.05480188035847 16.67976346876468
xgboost chunk_and_copied_book [] ss_pca_0_95 0.2985183631441851 0.4024918250970685 429.57238062020525 16.29042142508256
xgboost chunk_and_copied_book [] k_best_mutual_info_0_10 3.5155311296623486e-05 0.003454961903356102 448.9212695125493 16.85710398580677
xgboost chunk_and_copied_book [] None 3.963593033215105e-05 0.00377827

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk ['doc2vec_chunk_embedding'] k_best_f_reg_0_10 467.82612423451246 17.012144697068443 473.28370467520745 17.093307770166163
lasso chunk ['doc2vec_chunk_embedding'] ss_pca_0_95 399.2319671365836 15.854523580637059 435.78429062574634 16.585871620367108


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk ['doc2vec_chunk_embedding'] k_best_mutual_info_0_10 467.82612423451246 17.012144697068443 473.28370467520745 17.093307770166163


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk ['doc2vec_chunk_embedding'] None 467.82612423451246 17.012144697068443 473.28370467520745 17.093307770166163


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk ['average_sentence_embedding'] k_best_f_reg_0_10 403.5479406945227 15.987816511092253 427.56991312098006 16.441099599766886
lasso chunk ['average_sentence_embedding'] ss_pca_0_95 412.25530790244136 16.2521871281673 425.18065038014254 16.464706537059403


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk ['average_sentence_embedding'] k_best_mutual_info_0_10 403.5479406945227 15.987816511092253 427.56991312098006 16.441099599766886


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk ['average_sentence_embedding'] None 403.5479406945227 15.987816511092253 427.56991312098006 16.441099599766886


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_f_reg_0_10 467.82612423451246 17.012144697068443 473.28370467520745 17.093307770166163
lasso chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] ss_pca_0_95 464.0811773591593 16.915851477700368 471.13307295692374 17.040575144522307


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_mutual_info_0_10 467.82612423451246 17.012144697068443 473.28370467520745 17.093307770166163


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] None 467.82612423451246 17.012144697068443 473.28370467520745 17.093307770166163


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk [] k_best_f_reg_0_10 403.5479406945227 15.987816511092253 427.56991312098006 16.441099599766886
lasso chunk [] ss_pca_0_95 381.57472045021234 15.543827915579206 413.8786328669679 16.2006158904936


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk [] k_best_mutual_info_0_10 403.5479406945227 15.987816511092253 427.56991312098006 16.441099599766886


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk [] None 403.5479406945227 15.987816511092253 427.56991312098006 16.441099599766886


  model = cd_fast.enet_coordinate_descent(


lasso book_and_averaged_chunk ['doc2vec_chunk_embedding'] k_best_f_reg_0_10 444.93774722362303 16.981282463388144 467.24670200816473 17.407150118692822
lasso book_and_averaged_chunk ['doc2vec_chunk_embedding'] ss_pca_0_95 265.19750048005636 12.864414586398036 418.8426561713289 16.081815303197565
lasso book_and_averaged_chunk ['doc2vec_chunk_embedding'] k_best_mutual_info_0_10 442.84340764058845 16.992081719658394 465.920765047596 17.380298731659767


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso book_and_averaged_chunk ['doc2vec_chunk_embedding'] None 375.45143903226204 15.359424753350718 489.2201886005167 17.574802411637176


  model = cd_fast.enet_coordinate_descent(


lasso book_and_averaged_chunk ['average_sentence_embedding'] k_best_f_reg_0_10 384.7872578206876 15.690702949345166 418.8049347982117 16.331619605158966
lasso book_and_averaged_chunk ['average_sentence_embedding'] ss_pca_0_95 306.93187544300656 13.89801101540715 411.0274257676368 16.12780216137414
lasso book_and_averaged_chunk ['average_sentence_embedding'] k_best_mutual_info_0_10 429.2164248759036 16.721685040584656 464.69980595267145 17.36683544705584


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso book_and_averaged_chunk ['average_sentence_embedding'] None 328.1447439582515 14.324337769671782 462.3190285676874 16.95586457344878


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso book_and_averaged_chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_f_reg_0_10 435.6254205288222 16.74451505164132 476.5551011875633 17.514071886749328
lasso book_and_averaged_chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] ss_pca_0_95 329.30961509311186 14.316326470016145 430.8088361996937 16.352501055072015
lasso book_and_averaged_chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_mutual_info_0_10 435.23814947042456 16.83773492794412 463.5040504095009 17.35472472804892


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso book_and_averaged_chunk ['average_sentence_embedding', 'doc2vec_chunk_embedding'] None 375.45143903226204 15.359424753350718 489.22018860051674 17.574802411637176


  model = cd_fast.enet_coordinate_descent(


lasso book_and_averaged_chunk [] k_best_f_reg_0_10 388.68948592546667 15.754521901856961 417.28834310398054 16.317199845204033
lasso book_and_averaged_chunk [] ss_pca_0_95 258.00528319957664 12.745650060644675 409.27604170283564 15.978608228034116
lasso book_and_averaged_chunk [] k_best_mutual_info_0_10 438.1200159375676 16.891957422378614 461.7051627652574 17.227324693538087


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso book_and_averaged_chunk [] None 328.1447439582515 14.324337769671782 462.3190285676874 16.95586457344878


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] k_best_f_reg_0_10 462.2276764243141 16.9184654316702 578.5786629690958 19.112443928726794
lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] ss_pca_0_95 362.4645250293892 14.877860172459318 444.80710394833915 16.789715300949435


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] k_best_mutual_info_0_10 464.6575337475425 16.945141644239925 588.2104904497085 19.335550261139847


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book ['doc2vec_chunk_embedding'] None 464.6575337475425 16.945141644239925 588.2104904497085 19.335550261139847


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book ['average_sentence_embedding'] k_best_f_reg_0_10 438.52121243708564 16.427677752485703 572.3720862712792 19.081419648302624
lasso chunk_and_copied_book ['average_sentence_embedding'] ss_pca_0_95 378.2739337512271 15.27852429574759 457.5321225010277 16.930283845630548


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book ['average_sentence_embedding'] k_best_mutual_info_0_10 438.52121243708564 16.427677752485703 572.3720862712792 19.081419648302624


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book ['average_sentence_embedding'] None 438.52121243708564 16.427677752485703 572.3720862712792 19.081419648302624


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_f_reg_0_10 464.6575337475425 16.945141644239925 588.2104904497085 19.335550261139847
lasso chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] ss_pca_0_95 397.19757369405386 15.522226112076448 485.469918712033 17.353931245848695


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] k_best_mutual_info_0_10 464.6575337475425 16.945141644239925 588.2104904497085 19.335550261139847


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] None 464.6575337475425 16.945141644239925 588.2104904497085 19.335550261139847


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book [] k_best_f_reg_0_10 431.6173215446917 16.318818173183036 548.5884196927885 18.75738538388736
lasso chunk_and_copied_book [] ss_pca_0_95 353.68607170275607 14.739770769769189 435.8996844454042 16.588369583702438


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book [] k_best_mutual_info_0_10 442.40256417014746 16.516056454280882 574.8038596312177 19.098479290163446


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


lasso chunk_and_copied_book [] None 438.52121243708564 16.427677752485703 572.3720862712792 19.081419648302624
svr book ['doc2vec_chunk_embedding'] k_best_f_reg_0_10 458.0261962335647 16.586678536342863 474.7952090534176 16.965148186968047
svr book ['doc2vec_chunk_embedding'] ss_pca_0_95 437.56740928894004 15.966021681229716 463.75438009488164 16.722472351900976
svr book ['doc2vec_chunk_embedding'] k_best_mutual_info_0_10 456.7904229853475 16.486364275844643 474.05986986285853 16.99103071709024
svr book ['doc2vec_chunk_embedding'] None 450.6031018599671 16.237316082599925 474.76414420873914 16.919500339633576
svr book ['average_sentence_embedding'] k_best_f_reg_0_10 458.0261962335647 16.586678536342863 474.7952090534176 16.965148186968047
svr book ['average_sentence_embedding'] ss_pca_0_95 437.56740928894004 15.966021681229716 463.75438009488164 16.722472351900976
svr book ['average_sentence_embedding'] k_best_mutual_info_0_10 456.42160884955535 16.48865955683719 474.0663526018111 17.0

svr chunk_and_copied_book ['average_sentence_embedding', 'doc2vec_chunk_embedding'] None 474.08510915955446 16.85868951314611 478.0153359224422 16.9425222290663
svr chunk_and_copied_book [] k_best_f_reg_0_10 456.9766420638695 16.350639241099145 471.8973284399529 16.78464457051737
svr chunk_and_copied_book [] ss_pca_0_95 305.8788422633093 12.540192322002493 418.2733320816462 16.028029234670004
svr chunk_and_copied_book [] k_best_mutual_info_0_10 474.0846449002148 16.8588019315912 478.0101840125317 16.942527818931175
svr chunk_and_copied_book [] None 474.0837170943907 16.85881902839988 478.0093921353129 16.94253075119281


# Validation MAE medians

In [16]:
results_df = pd.DataFrame(results, columns=["model", "features", "drop_columns_including", "dimensionality_reduction", "train_mse", "train_mae", "validation_mse", "validation_mae"])
results_df["drop_columns_including"] = results_df["drop_columns_including"].apply(str)


In [17]:
results_df.groupby("model").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
model,Unnamed: 1_level_1
lasso,17.201753
svr,16.942537
xgboost,16.911822


In [18]:
results_df.groupby("features").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
features,Unnamed: 1_level_1
book,17.053378
book_and_averaged_chunk,16.957275
chunk,16.599657
chunk_and_copied_book,16.942537


In [19]:
results_df.groupby("drop_columns_including").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
drop_columns_including,Unnamed: 1_level_1
"['average_sentence_embedding', 'doc2vec_chunk_embedding']",16.961005
['average_sentence_embedding'],16.941678
['doc2vec_chunk_embedding'],16.948174
[],16.892265


In [20]:
results_df.groupby("dimensionality_reduction").agg({"validation_mae": "median"})

Unnamed: 0_level_0,validation_mae
dimensionality_reduction,Unnamed: 1_level_1
k_best_f_reg_0_10,16.965148
k_best_mutual_info_0_10,16.994652
ss_pca_0_95,16.610395


# Best result

In [21]:
results_df[results_df.validation_mae == results_df.validation_mae.min()]

Unnamed: 0,model,features,drop_columns_including,dimensionality_reduction,train_mse,train_mae,validation_mse,validation_mae
20,xgboost,chunk,['average_sentence_embedding'],k_best_f_reg_0_10,14.272714,2.773162,380.170237,15.384383
22,xgboost,chunk,['average_sentence_embedding'],k_best_mutual_info_0_10,14.272714,2.773162,380.170237,15.384383
23,xgboost,chunk,['average_sentence_embedding'],,14.272714,2.773162,380.170237,15.384383


In [25]:
results_df.to_csv("../data/results/model_features_drop_columns_including_dimensionality_reduction_ger.csv", index=False)
