# Handcrafted_features_{lang}_multiple_experiments: The notebook with SVR + XGBoost and Lasso experiments.


In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, "../src/")
import numpy as np
import pandas as pd

extracted_features_dir = "../data/extracted_features"
labels_dir = "../data/labels/"
lang = "eng"

book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_df.csv")
book_and_averaged_chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/book_and_averaged_chunk_df.csv")
chunk_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_df.csv")
chunk_and_copied_book_df = pd.read_csv(f"{extracted_features_dir}/{lang}/chunk_and_copied_book_df.csv")

In [2]:
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from xgboost import XGBRegressor
from copy import deepcopy
from utils import read_labels, read_extreme_cases
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from collections import Counter
import random

labels = read_labels(labels_dir)
# canonization scores ground truths (extreme cases based on which canonoization scores were built)
extreme_cases_df = read_extreme_cases(labels_dir)

In [3]:
class Experiment(object):
    def __init__(self, features, drop_columns_including, dimensionality_reduction, include_data, model, verbose):
        assert features in ["book", "chunk", "book_and_averaged_chunk", "chunk_and_copied_book"]
        assert isinstance(drop_columns_including, list)
        for i in drop_columns_including:
            assert isinstance(i, str)
        assert model in ["xgboost", "svr", "lasso"]
        assert (dimensionality_reduction in ["ss_pca_0_95", "k_best_f_reg_0_10", "k_best_mutual_info_0_10"]) or (dimensionality_reduction is None)
        self.features = features
        self.labels = labels
        self.drop_columns_including = drop_columns_including
        self.include_data = include_data
        self.dimensionality_reduction = dimensionality_reduction
        self.model = model
        self.verbose = verbose

        if self.include_data != "train_reduced_test_reduced":
            if self.features == "book":
                self.df = deepcopy(book_df)
            elif self.features == "chunk":
                self.df = deepcopy(chunk_df)
            elif self.features == "chunk_and_copied_book":
                self.df = deepcopy(chunk_and_copied_book_df)
            elif self.features == "book_and_averaged_chunk":
                self.df = deepcopy(book_and_averaged_chunk_df)
        else:
            if self.features == "book":
                curr_book_df = deepcopy(book_df)
                self.df = curr_book_df.merge(extreme_cases_df, how='inner', left_on='book_name', right_on="file_name", validate="one_to_one").drop(columns=["file_name"])
            elif features == "chunk":
                curr_chunk_df = deepcopy(chunk_df)
                self.df = curr_chunk_df.merge(extreme_cases_df, how='inner', left_on='book_name', right_on="file_name", validate="many_to_one").drop(columns=["file_name"])
            elif features == "chunk_and_copied_book":
                curr_chunk_and_copied_book_df = deepcopy(chunk_and_copied_book_df)
                self.df = curr_chunk_and_copied_book_df.merge(extreme_cases_df, how='inner', left_on='book_name', right_on="file_name", validate="many_to_one").drop(columns=["file_name"])
            elif features == "book_and_averaged_chunk":
                curr_book_and_averaged_chunk_df = deepcopy(book_and_averaged_chunk_df)
                self.df = curr_book_and_averaged_chunk_df.merge(extreme_cases_df, how='inner', left_on='book_name', right_on="file_name", validate="one_to_one").drop(columns=["file_name"])

        columns_before_drop = set(self.df.columns)
        self.df = self.df[[column for column in self.df.columns if not self._drop_column(column)]].reset_index(drop=True)
        columns_after_drop = set(self.df.columns)
        if self.verbose:
            print(f"Dropped {len(columns_before_drop - columns_after_drop)} columns.")
        self.df.loc[:, "y"] = self.df.book_name.apply(lambda x: self.labels[x]).tolist()

    def _drop_column(self, column):
        for string in self.drop_columns_including:
            if string in column:
                return True
        return False
    
    def _custom_pca(self, train_X):
        for i in range(5, train_X.shape[1], int((train_X.shape[1] - 5) / 10)):
            pca = PCA(n_components=i)
            new_train_X = pca.fit_transform(train_X)
            if pca.explained_variance_ratio_.sum() >= 0.95:
                break
        return new_train_X, pca

    def _select_features(self, train_X, train_y, validation_X):
        if self.dimensionality_reduction == "ss_pca_0_95":
            ss = StandardScaler()
            train_X = ss.fit_transform(train_X)
            validation_X = ss.transform(validation_X)
            train_X, pca = self._custom_pca(train_X)
            validation_X = pca.transform(validation_X)
        elif self.dimensionality_reduction == "k_best_f_reg_0_10":
            k_best = SelectKBest(f_regression, k=np.minimum(int(0.10 * train_X.shape[0]), train_X.shape[1]))
            train_X = k_best.fit_transform(train_X, train_y)
            validation_X = k_best.transform(validation_X)
        elif self.dimensionality_reduction == "k_best_mutual_info_0_10":
            k_best = SelectKBest(mutual_info_regression, k=np.minimum(int(0.10 * train_X.shape[0]), train_X.shape[1]))
            train_X = k_best.fit_transform(train_X, train_y)
            validation_X = k_best.transform(validation_X)
        elif self.dimensionality_reduction is None:
            pass
        return train_X, validation_X
    
    def _impute(self, train_X, validation_X):
        imputer = KNNImputer()
        train_X = imputer.fit_transform(train_X)
        validation_X = imputer.transform(validation_X)
        return train_X, validation_X
    
    def _get_model(self):
        # if any of these performs better than others, we can try to tune the hyperparameters
        # but I think for now it's more important to see which approach performs better
        # chunk based or doc based
        # use dimensionality reduction or not...
        if self.model == "xgboost":
            return XGBRegressor(n_estimators=1000, max_depth=4, learning_rate=0.01, colsample_bytree=0.33, min_child_weight=6)
        elif self.model == "svr":
            return SVR()
        elif self.model == "lasso":
            return Lasso()
        
    def _split_booknames(self, df, nr_splits):
        '''
        Distribute book names over splits.
        All works of an author are in the same split.
        '''
        book_names = df['book_name'].unique()
        authors = []
        booknames_authors_mapping = {}

        #Get authors
        for book_name in book_names:
            author = '_'.join(book_name.split('_')[:2])
            authors.append(author)
            if author in booknames_authors_mapping:
                booknames_authors_mapping[author].append(book_name)
            else:
                booknames_authors_mapping[author] = []
                booknames_authors_mapping[author].append(book_name)
        #Distribute authors over splits so that each split has approximately the same number of books
        works_per_author = Counter(authors)
        goal_sum = round(len(book_names)/nr_splits)
        tolerance = 0.03
        lower_threshold = goal_sum - round(tolerance*goal_sum)
        upper_threshold = goal_sum + round(tolerance*goal_sum)
        author_splits = []
        for i in range (0,nr_splits-1):
            works_in_split = 0
            split = []
            while works_in_split < upper_threshold:
                curr_author = random.choice(list(works_per_author.keys()))
                curr_author_workcount = works_per_author.pop(curr_author)
                #Reinsert into dict if value is too high
                if works_in_split + curr_author_workcount > upper_threshold:
                    works_per_author[curr_author] = curr_author_workcount
                else:
                    split.append(curr_author)
                    works_in_split += curr_author_workcount
                    if works_in_split >= lower_threshold:
                        break
            author_splits.append(split)
        #Create last split directly from remaining dict
        works_in_last_split = sum(works_per_author.values())
        split = list(works_per_author.keys())
        author_splits.append(split)

        #Map author splits to book names
        book_splits = []
        for author_split in author_splits:
            book_split = []
            for author in author_split:
                book_split.extend(booknames_authors_mapping[author])
            book_splits.append(book_split)

        return book_splits

    def run(self):
        all_predictions = []
        all_labels = []
        train_mses = []
        train_maes = []
        validation_mses = []
        validation_maes = []
        validation_r2s = []

        df = self.df
        book_names_split = self._split_booknames(df=df, nr_splits=10)
        #book_names = df['book_name'].unique()
        #book_names_split = np.array_split(book_names, 10)
        for index, split in enumerate(book_names_split):
            print(index, len(split))
            train_df = df[~df["book_name"].isin(split)]
            validation_df = df[df["book_name"].isin(split)]
            #print(validation_df.head)
            if self.include_data == "train_full_test_reduced":
                #print(validation_df.shape)
                validation_df = validation_df.merge(extreme_cases_df, how='inner', left_on="book_name", right_on="file_name").drop(columns=["file_name"])
                #print(validation_df.shape)
            train_X = train_df.drop(columns=["y", "book_name"]).values
            train_y = train_df["y"].values.ravel()
            validation_X = validation_df.drop(columns=["y", "book_name"]).values
            validation_y = validation_df["y"].values.ravel()
            train_X, validation_X = self._impute(train_X, validation_X)
            #if self.verbose:
            #    print(f"train_X.shape before {self.dimensionality_reduction}: {train_X.shape}, validation_X.shape before {self.dimensionality_reduction}: {validation_X.shape}")
            train_X, validation_X = self._select_features(train_X, train_y, validation_X)
            #if self.verbose:
            #    print(f"train_X.shape after {self.dimensionality_reduction}: {train_X.shape}, validation_X.shape after {self.dimensionality_reduction}: {validation_X.shape}")
            model = self._get_model()
            model.fit(train_X, train_y)
            
            train_books = deepcopy(train_df[["book_name", "y"]])
            train_books["yhat"] = model.predict(train_X)
            validation_books = deepcopy(validation_df[["book_name", "y"]])
            validation_books["yhat"] = model.predict(validation_X)
            
            train_books = train_books.groupby("book_name").mean()
            validation_books = validation_books.groupby("book_name").mean()
            
            train_y = train_books["y"].tolist()
            train_yhat = train_books["yhat"].tolist()
            validation_y = validation_books["y"].tolist()
            validation_yhat = validation_books["yhat"].tolist()
            
            all_labels.extend(validation_y)
            all_predictions.extend(validation_yhat)
            
            train_mse = mean_squared_error(train_y, train_yhat)
            train_mae = mean_absolute_error(train_y, train_yhat)
            validation_mse = mean_squared_error(validation_y, validation_yhat)
            validation_mae = mean_absolute_error(validation_y, validation_yhat)
            validation_r2 = r2_score(validation_y, validation_yhat)
            train_mses.append(train_mse)
            train_maes.append(train_mae)
            validation_mses.append(validation_mse)
            validation_maes.append(validation_mae)
            validation_r2s.append(validation_r2)
            if self.verbose:
                print(f"Fold: {index+1}, TrainMSE: {np.round(train_mse, 3)}, TrainMAE: {np.round(train_mae, 3)}, ValMSE: {np.round(validation_mse, 3)}, ValMAE: {np.round(validation_mae, 3)}, ValR2: {np.round(validation_r2, 3)}")
        all_labels = np.array(all_labels)
        all_predictions = np.array(all_predictions)

        mean_train_mse = np.mean(train_mses)
        mean_train_mae = np.mean(train_maes)
        mean_validation_mse = np.mean(validation_mses)
        mean_validation_rmse = np.mean([sqrt(x) for x in validation_mses])
        mean_validation_mae = np.mean(validation_maes)
        mean_validation_r2 = np.mean(validation_r2s)
        
        if self.verbose:
            print("------")
            print(f"Mean scores, TrainMSE: {np.round(mean_train_mse, 3)}, TrainMAE: {np.round(mean_train_mae, 3)}, ValMSE: {np.round(mean_validation_mse, 3)}, ValRMSE: {np.round(mean_validation_rmse, 3)}, ValMAE: {np.round(mean_validation_mae, 3)}, ValR2: {np.round(mean_validation_r2, 3)}")

            plt.figure(figsize=(8, 8))
            plt.xticks(fontsize=20)
            plt.yticks(fontsize=20)
            plt.scatter(all_labels, all_predictions)
            plt.xlabel("Canonization Scores", fontsize=20)
            plt.ylabel("Predicted Scores", fontsize=20)
            plt.savefig("../data/results/figures/" + lang + '-' + self.model + '-' + self.dimensionality_reduction + '-' + self.features + '-' + self.include_data + '-' + 'author_split' + '.png', dpi=400)
            plt.show();
        return mean_train_mse, mean_train_mae, mean_validation_mse, mean_validation_rmse, mean_validation_mae, mean_validation_r2

In [4]:
results = []

for model in ["svr"]: #"xgboost", "lasso", 
    for features in ["book_and_averaged_chunk"]: #"book", "chunk","chunk_and_copied_book"
        for drop_columns_including in [["average_sentence_embedding"]]:# ["doc2vec_chunk_embedding"], ["average_sentence_embedding", "doc2vec_chunk_embedding"], []
            for dimensionality_reduction in ["ss_pca_0_95"]: #"k_best_f_reg_0_10", "k_best_mutual_info_0_10", None
                # train and test with either full dataset or only extreme cases
                for include_data in ["full"]:#, "train_full_test_reduced", "train_reduced_test_reduced"]:
                    print('Data included:', include_data)
                    try:
                        experiment = Experiment(
                            features=features,
                            drop_columns_including=drop_columns_including,
                            dimensionality_reduction=dimensionality_reduction,
                            include_data=include_data,
                            model=model,
                            verbose=False
                        )
                        train_mse, train_mae, validation_mse, validation_mae = experiment.run()
                        results.append((model, features, drop_columns_including, dimensionality_reduction, train_mse, train_mae, validation_mse, validation_mae))
                        print(model, features, drop_columns_including, dimensionality_reduction, train_mse, train_mae, validation_mse, validation_mae)
                    except Exception as e:
                        print(f"Error in {model}, {features}, {drop_columns_including}, {dimensionality_reduction}")
                        print(e)

Data included: full
Error in svr, book_and_averaged_chunk, ['average_sentence_embedding'], ss_pca_0_95
'Conrad_Joseph_The-Secret-Sharer_1895'


# Validation MAE medians

In [5]:
results_df = pd.DataFrame(results, columns=["model", "features", "drop_columns_including", "dimensionality_reduction", "train_mse", "train_mae", "validation_mse", "validation_mae"])
results_df["drop_columns_including"] = results_df["drop_columns_including"].apply(str)

In [6]:
results_df.groupby("model").agg({"validation_mae": "median"})

DataError: No numeric types to aggregate

In [None]:
results_df.groupby("features").agg({"validation_mae": "median"})

In [None]:
results_df.groupby("drop_columns_including").agg({"validation_mae": "median"})

In [None]:
results_df.groupby("dimensionality_reduction").agg({"validation_mae": "median"})

# Best result

In [None]:
results_df[results_df.validation_mae == results_df.validation_mae.min()]

In [None]:
results_df.to_csv("../data/results/model_features_drop_columns_including_dimensionality_reduction_eng.csv", index=False)
