# Experiments 1 to 4: Analysis

In [1]:
import glob
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Tuple, Union
from utils import df_to_latex, summarize_tables, sum_table_to_df

In [7]:
# Relative path of the folder that holds the summary tables for experiments 1 to 4.
experiments_path = "../data/analysis/experiment_summary_1_to_4"

# Vectorization methods to summarize; the cells below build one summary per entry.
vectorization_methods = [
    "bow",
    "zscore",
    "tfidf",
    "cos",
]

# Forwarded to summarize_tables as its `drop_not_tuned` keyword.
# NOTE(review): presumably excludes classifiers that were not tuned — confirm in utils.
drop_not_tuned = True

## Prose corpus (only tuned classification methods)

In [8]:
# Folder that holds the classification tables (CSV files) of the prose corpus.
prose_path = experiments_path + "/prose/all_classification_tables/"
# glob returns matches in a file-system-dependent order; sorting makes the list
# handed to summarize_tables identical on every machine and on every re-run.
# NOTE(review): prose_path already ends in "/", so the pattern contains "//".
# It is left unchanged because summarize_tables also receives prose_path and may
# rely on the exact prefix of each file name — confirm in utils before tidying.
prose_clf_tables = sorted(glob.glob(prose_path + "/*.csv"))

In [9]:
# Build one summary per vectorization method for the prose corpus,
# keyed by the method name ("bow", "zscore", ...).
prose_dict = dict()

for vectorization_method in vectorization_methods:
    # Step 1: summarize the classification tables for this method.
    prose_summary = summarize_tables(
        prose_clf_tables,
        prose_path,
        vectorization_method,
        drop_not_tuned=drop_not_tuned,
    )
    # Step 2: convert the summary with sum_table_to_df and store it under the method name.
    prose_dict[vectorization_method] = sum_table_to_df(prose_summary)

In [10]:
# Pull the four per-method summaries out of prose_dict into individually named variables.
prose_bow_df, prose_zscore_df, prose_tfidf_df, prose_cos_df = (
    prose_dict[method_key] for method_key in ("bow", "zscore", "tfidf", "cos")
)

In [11]:
# Display the "bow" summary of the prose corpus as this cell's output.
# NOTE(review): the meaning of the column labels and of the value in parentheses
# is defined by summarize_tables / sum_table_to_df in utils — document it there.
prose_bow_df

Unnamed: 0,200,300,500,1000,2000,3000
tKNN,0.731 (0.615),0.74 (0.641),0.767 (0.674),0.809 (0.69),0.833 (0.72),0.829 (0.738)
tNSC,0.529 (0.499),0.581 (0.521),0.62 (0.56),0.672 (0.578),0.731 (0.621),0.739 (0.652)
tMNB,0.899 (0.845),0.935 (0.878),0.941 (0.896),0.949 (0.913),0.966 (0.915),0.97 (0.926)
tLSVM,0.95 (0.883),0.956 (0.888),0.963 (0.899),0.963 (0.899),0.969 (0.895),0.96 (0.902)
tLR,0.968 (0.945),0.986 (0.955),0.979 (0.953),0.96 (0.9),0.938 (0.834),0.907 (0.793)


### Prose DataFrames to LaTeX tables (uncomment the line for the desired table)

In [16]:
# Each line below prints the LaTeX source that df_to_latex produces for one
# prose summary; all four are commented out, so this cell emits no output.
#print(df_to_latex(prose_bow_df))
#print(df_to_latex(prose_zscore_df))
#print(df_to_latex(prose_tfidf_df))
#print(df_to_latex(prose_cos_df))

## Speeches corpus

In [17]:
# Folder that holds the classification tables (CSV files) of the speeches corpus.
speeches_path = experiments_path + "/speeches/all_classification_tables/"
# glob returns matches in a file-system-dependent order; sorting makes the list
# handed to summarize_tables identical on every machine and on every re-run.
# NOTE(review): speeches_path already ends in "/", so the pattern contains "//".
# It is left unchanged because summarize_tables also receives speeches_path and may
# rely on the exact prefix of each file name — confirm in utils before tidying.
speeches_clf_tables = sorted(glob.glob(speeches_path + "/*.csv"))

In [18]:
# Build one summary per vectorization method for the speeches corpus,
# keyed by the method name ("bow", "zscore", ...).
speeches_dict = dict()

for vectorization_method in vectorization_methods:
    # Step 1: summarize the classification tables for this method.
    speeches_summary = summarize_tables(
        speeches_clf_tables,
        speeches_path,
        vectorization_method,
        drop_not_tuned=drop_not_tuned,
    )
    # Step 2: convert the summary with sum_table_to_df and store it under the method name.
    speeches_dict[vectorization_method] = sum_table_to_df(speeches_summary)

In [19]:
# Pull the four per-method summaries out of speeches_dict into individually named variables.
speeches_bow_df, speeches_zscore_df, speeches_tfidf_df, speeches_cos_df = (
    speeches_dict[method_key] for method_key in ("bow", "zscore", "tfidf", "cos")
)

### Speeches DataFrames to LaTeX tables (uncomment the line for the desired table)

In [20]:
# Convert the "bow" summary of the speeches corpus with df_to_latex and print the result.
speeches_bow_latex = df_to_latex(speeches_bow_df)
print(speeches_bow_latex)
# The remaining tables are disabled; uncomment a line to print that table instead.
#print(df_to_latex(speeches_zscore_df))
#print(df_to_latex(speeches_tfidf_df))
#print(df_to_latex(speeches_cos_df))

\small
\begin{tabular}{c|cccccc}
\hline
& \textbf{200} & \textbf{300} & \textbf{500} & \textbf{1000} & \textbf{2000} & \textbf{3000}\\\hline
\textbf{tKNN} & 0.385 (0.391) & 0.412 (0.401) & 0.377 (0.418) & 0.404 (0.441) & 0.492 (0.454) & 0.388 (0.44)\\
\textbf{tNSC} & 0.265 (0.239) & 0.265 (0.268) & 0.319 (0.263) & 0.292 (0.288) & 0.281 (0.307) & 0.285 (0.303)\\
\textbf{tMNB} & 0.631 (0.576) & 0.638 (0.629) & 0.688 (0.638) & 0.65 (0.677) & 0.658 (0.69) & 0.731 (0.685)\\
\textbf{tLSVM} & 0.55 (0.5) & 0.558 (0.519) & 0.581 (0.554) & 0.6 (0.562) & 0.612 (0.572) & 0.612 (0.575)\\
\textbf{tLR} & 0.542 (0.509) & 0.527 (0.502) & 0.565 (0.542) & 0.542 (0.537) & 0.562 (0.554) & 0.535 (0.538)\\
\end{tabular}
