In [1]:
%load_ext autoreload
%autoreload 2
language = "eng"

import sys
from copy import deepcopy
sys.path.insert(0, "../src/")
import os
import pandas as pd
get_ipython().run_line_magic("matplotlib", "inline")
import matplotlib.pyplot as plt
import time

from utils import get_labels
from cross_validation import Regression, TwoclassClassification, MulticlassClassification

# Input and output locations, relative to the notebook's working directory.
features_dir = f"../data/features/{language}/"
results_dir = f"../data/results_sentiment/{language}/"
sentiment_labels_dir = "../data/labels_sentiment/"
canonization_labels_dir = "../data/labels_canon/"

# exist_ok=True replaces the separate existence check: it avoids the
# check-then-create race and keeps this cell safe to re-run.
os.makedirs(results_dir, exist_ok=True)

In [2]:
# Labels statistics
# print(len(pd.unique(labels["book_name"]))) #197
# 254 labels, 197 different book_names -> 57 second/third... reviews
# 36 book_names with more than 1 label, these 36 book_names have 93 labels
# 93 = 36 first reviews + 57 second/third... reviews
# 6 texts have opposing reviews (13 reviews are opposing)
# 191 texts after aggregating (without opposing reviews)

#classification_labels["y"].plot.hist(grid=True, bins=50)

In [3]:
'''
Parameter combinations
'''

# Every label set is requested with the same language and label directories;
# only the label type passed as the last argument differs.
label_source = (language, sentiment_labels_dir, canonization_labels_dir)
textblob_labels = get_labels(*label_source, 'textblob')
sentiart_labels = get_labels(*label_source, 'sentiart')
combined_labels = get_labels(*label_source, 'combined')
twoclass_labels = get_labels(*label_source, 'twoclass')
multiclass_labels = get_labels(*label_source, 'multiclass')
library_labels = get_labels(*label_source, 'library')

# Feature tables read from features_dir, one CSV file per table.
book_df = pd.read_csv(f"{features_dir}book_df.csv")
chunk_df = pd.read_csv(f"{features_dir}chunk_df.csv")
book_and_averaged_chunk_df = pd.read_csv(f"{features_dir}book_and_averaged_chunk_df.csv")
chunk_and_copied_book_df = pd.read_csv(f"{features_dir}chunk_and_copied_book_df.csv")

In [4]:
# All parameters
languages_list = ["eng", "ger"]
models_list = ["svr", "lasso", "xgboost", "svc"]
# Values iterated over as `model_param` for each model name in the run loop.
model_params_dict = {
    "svr": [1],
    "lasso": [1, 4],
    "xgboost": [None],
    "svc": [0.1, 1, 10, 100, 1000, 10000],
}
dimensionality_reduction_list = ["ss_pca_0_95", 'k_best_f_reg_0_10', 'k_best_mutual_info_0_10', None]

# Short feature-set names paired, in order, with the frames loaded above.
features_list = ["book", "chunk", "baac", "cacb"]
features_dict = dict(zip(
    features_list,
    (book_df, chunk_df, book_and_averaged_chunk_df, chunk_and_copied_book_df)))

# Label-type names paired, in order, with the label frames loaded above.
labels_list = ['textblob', 'sentiart', 'combined', 'twoclass', 'multiclass', 'library']
labels_dict = dict(zip(
    labels_list,
    (textblob_labels, sentiart_labels, combined_labels,
     twoclass_labels, multiclass_labels, library_labels)))


# Column sets passed as `drop_columns` to the experiments. Every set starts
# with the two embedding columns; the "->" variants are added for English only.
embedding_cols = ["average_sentence_embedding", "doc2vec_chunk_embedding"]
drop_columns_list = [embedding_cols + extra for extra in ([], ["pos"])]
if language == "eng":
    drop_columns_list += [embedding_cols + extra for extra in (["->"], ["->", "pos"])]
    
# Model-specific column names for writing results to file.
# general_cols describes the parameter combination; each task appends its own
# metric columns after them.
general_cols = [
    'language',
    'task_type',
    'model',
    'model_param',
    'labels_string',
    'features_string',
    'dimensionality_reduction',
    'drop_columns',
]
regression_metric_cols = [
    "mean_train_mse",
    "mean_train_rmse",
    "mean_train_mae",
    "mean_train_r2",
    "mean_train_corr",
    "mean_validation_mse",
    "mean_validation_rmse",
    "mean_validation_mae",
    "mean_validation_r2",
    "mean_validation_corr",
    "mean_p_value",
]
regression_cols = [*general_cols, *regression_metric_cols]
twoclass_cols = [*general_cols, "mean_train_book_acc", "mean_validation_book_acc"]  # also used for library
multiclass_cols = [*general_cols, "mean_train_f1", "mean_validation_f1"]


# Link parameters to models
def _param_grid(model, features, labels, drop_columns, model_cols):
    """Bundle one task's search space into the dict layout read by the run loop.

    Every grid defined in this cell uses [None] for dimensionality reduction,
    so that entry is filled in here with a fresh list per grid.
    """
    return {
        "model": model,
        "dimensionality_reduction": [None],
        "features": features,
        "labels": labels,
        "drop_columns": drop_columns,
        "model_cols": model_cols}


# Full grids, one per task type.
regression_dict = _param_grid(
    model=["xgboost"],
    features=features_list,
    labels=['textblob', 'sentiart', 'combined'],
    drop_columns=drop_columns_list,
    model_cols=regression_cols)
twoclass_dict = _param_grid(
    model=["svc", "xgboost"],
    features=["book", "baac"],
    labels=['twoclass'],
    drop_columns=drop_columns_list,
    model_cols=twoclass_cols)
# The library task reuses the two-class grid and swaps in its own labels.
library_dict = deepcopy(twoclass_dict)
library_dict['labels'] = ['library']
multiclass_dict = _param_grid(
    model=["svc", "xgboost"],
    features=["book", "baac"],
    labels=['multiclass'],
    drop_columns=drop_columns_list,
    model_cols=multiclass_cols)

# Reduced grids for test runs: "book" features only and just the last
# drop-column set.
testing_reg_dict = _param_grid(
    model=["xgboost"],
    features=["book"],
    labels=["combined"],
    drop_columns=[drop_columns_list[-1]],
    model_cols=regression_cols)
testing_twoclass_dict = _param_grid(
    model=["xgboost", 'svc'],
    features=["book"],  # alternative: "baac"
    labels=['twoclass'],
    drop_columns=[drop_columns_list[-1]],
    model_cols=twoclass_cols)
testing_multiclass_dict = _param_grid(
    model=["xgboost", 'svc'],
    features=["book"],  # alternative: "baac"
    labels=['multiclass'],
    drop_columns=[drop_columns_list[-1]],
    model_cols=multiclass_cols)
testing_library_dict = deepcopy(testing_twoclass_dict)
testing_library_dict['labels'] = ['library']

# Single fixed regression configuration (named as the best English result).
best_results_regression_eng_dict = _param_grid(
    model=["xgboost"],
    features=["book"],  # alternative: "baac"
    labels=['sentiart'],
    drop_columns=[['average_sentence_embedding', 'doc2vec_chunk_embedding', 'pos']],
    model_cols=regression_cols)

In [5]:
# Select the task to run and the parameter grid that goes with it.
task_type = "library"

# Look the grid up in a table so that a misspelled task_type fails here with a
# clear message instead of leaving param_dict unbound (or silently reusing the
# value left over from an earlier execution of this cell).
param_dicts_by_task = {
    "regression": regression_dict,
    "library": library_dict,
    "twoclass": twoclass_dict,
    "multiclass": multiclass_dict,
}
if task_type not in param_dicts_by_task:
    raise ValueError(
        f"Unknown task_type {task_type!r}; expected one of {sorted(param_dicts_by_task)}")
param_dict = param_dicts_by_task[task_type]

# # Overwrite for testing ##########################################################
# if task_type == 'twoclass':
#     param_dict = testing_twoclass_dict
# elif task_type == "multiclass":
#     param_dict = testing_multiclass_dict
# elif task_type == 'library':
#     param_dict = testing_library_dict

# Regression runs currently always use the single fixed configuration below,
# which replaces the full regression grid selected above.
if task_type == "regression":
    param_dict = best_results_regression_eng_dict

In [6]:
# Display the selected parameter grid to confirm the configuration before running.
param_dict

{'model': ['svc', 'xgboost'],
 'dimensionality_reduction': [None],
 'features': ['book', 'baac'],
 'labels': ['library'],
 'drop_columns': [['average_sentence_embedding', 'doc2vec_chunk_embedding'],
  ['average_sentence_embedding', 'doc2vec_chunk_embedding', 'pos'],
  ['average_sentence_embedding', 'doc2vec_chunk_embedding', '->'],
  ['average_sentence_embedding', 'doc2vec_chunk_embedding', '->', 'pos']],
 'model_cols': ['language',
  'task_type',
  'model',
  'model_param',
  'labels_string',
  'features_string',
  'dimensionality_reduction',
  'drop_columns',
  'mean_train_book_acc',
  'mean_validation_book_acc']}

In [7]:
# Display the task type the grid above was selected for.
task_type

'library'

In [8]:
# Wall-clock start of the whole run; a later cell prints the time elapsed since this point.
start = time.time()

In [None]:
# Run cross-validation for every parameter combination in param_dict.
# Each finished combination is appended to a tab-separated log file straight
# away, so partial results survive an interrupted run; the full table is
# written once at the end.

# Experiment class per task type. All classes take the same keyword arguments;
# 'library' is run with the two-class experiment.
experiment_classes = {
    'regression': Regression,
    'twoclass': TwoclassClassification,
    'library': TwoclassClassification,
    'multiclass': MulticlassClassification,
}
# Fail before any work starts rather than hitting an unbound (or stale)
# `experiment` inside the loop.
if task_type not in experiment_classes:
    raise ValueError(
        f"Unknown task_type {task_type!r}; expected one of {sorted(experiment_classes)}")
experiment_class = experiment_classes[task_type]

log_path = f"{results_dir}results-{language}-{task_type}-log.csv"
results = []

# NOTE: the log is opened in append mode, so every execution of this cell adds
# another header line to an existing log file.
with open(log_path, 'a') as f:
    f.write("\t".join(param_dict['model_cols']) + '\n')

for model in param_dict['model']:
    for model_param in model_params_dict[model]:
        for labels_string in param_dict['labels']:
            labels = deepcopy(labels_dict[labels_string])
            for features_string in param_dict["features"]:
                # NOTE(review): labels and df are copied once per label/feature
                # set and then shared by all inner iterations; this presumes
                # the experiments do not modify them in place - TODO confirm.
                df = deepcopy(features_dict[features_string])
                for dimensionality_reduction in param_dict["dimensionality_reduction"]:
                    for drop_columns in param_dict["drop_columns"]:
                        # Values identifying this run; they form the leading
                        # columns of the result row.
                        run_description = [
                            language, task_type, model, model_param, labels_string,
                            features_string, dimensionality_reduction, drop_columns]
                        print(*run_description)
                        inner_start = time.time()

                        experiment = experiment_class(
                            results_dir=results_dir,
                            language=language,
                            task_type=task_type,
                            model=model,
                            model_param=model_param,
                            labels_string=labels_string,
                            labels=labels,
                            features_string=features_string,
                            df=df,
                            dimensionality_reduction=dimensionality_reduction,
                            drop_columns=drop_columns,
                            verbose=True)

                        returned_values = experiment.run()
                        all_columns = run_description + returned_values
                        print('all columns', all_columns)

                        with open(log_path, 'a') as f:
                            f.write("\t".join([str(x) for x in all_columns]) + '\n')
                        results.append(all_columns)

                        print(*run_description, returned_values)
                        print('\n-----------------------------------------------------------\n')
                        inner_end = time.time()
                        print(inner_end - inner_start)


results_df = pd.DataFrame(results, columns=param_dict['model_cols'])
results_df.to_csv(f"{results_dir}results-{language}-{task_type}-final.csv", index=False, sep='\t')

eng library svc 0.1 library book None ['average_sentence_embedding', 'doc2vec_chunk_embedding']
Dropped 0 columns.
rarest label is  1
labels per split

 1    87
0    34
Name: y, dtype: int64
labels per split

 1    94
0    36
Name: y, dtype: int64
labels per split

 1    87
0    30
Name: y, dtype: int64
labels per split

 1    95
0    22
Name: y, dtype: int64
labels per split

 1    94
0    24
Name: y, dtype: int64
--------------------------
Crosstab
 Predicted    0    1  All
True                    
0          117   29  146
1          253  204  457
All        370  233  603 
--------------------------
all columns ['eng', 'library', 'svc', 0.1, 'library', 'book', None, ['average_sentence_embedding', 'doc2vec_chunk_embedding'], 0.542, 0.532]
eng library svc 0.1 library book None ['average_sentence_embedding', 'doc2vec_chunk_embedding'] [0.542, 0.532]

-----------------------------------------------------------

2.52931547164917
eng library svc 0.1 library book None ['average_sentence_emb

labels per split

 1    87
0    34
Name: y, dtype: int64
labels per split

 1    94
0    36
Name: y, dtype: int64
labels per split

 1    87
0    30
Name: y, dtype: int64
labels per split

 1    95
0    22
Name: y, dtype: int64
labels per split

 1    94
0    24
Name: y, dtype: int64
--------------------------
Crosstab
 Predicted    0    1  All
True                    
0          117   29  146
1          258  199  457
All        375  228  603 
--------------------------
all columns ['eng', 'library', 'svc', 1, 'library', 'book', None, ['average_sentence_embedding', 'doc2vec_chunk_embedding', 'pos'], 0.541, 0.524]
eng library svc 1 library book None ['average_sentence_embedding', 'doc2vec_chunk_embedding', 'pos'] [0.541, 0.524]

-----------------------------------------------------------

2.526721954345703
eng library svc 1 library book None ['average_sentence_embedding', 'doc2vec_chunk_embedding', '->']
Dropped 0 columns.
rarest label is  1
labels per split

 1    87
0    34
Name: y, d

In [None]:
# Print the wall-clock seconds elapsed since `start` was set in the earlier timing cell.
end = time.time()
print(end-start)

In [None]:
# Analyze results
#results_df = results_df.sort_values(by=["mean_validation_corr", "mean_p_value"])

In [None]:
#results_df = results_df[results_df["mean_p_value"]<=0.1].sort_values(by=["mean_validation_corr"])