## Setup

### Imports

In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.dummy import DummyClassifier
from joblib import Parallel, delayed
from joblib_progress import joblib_progress
from datetime import datetime
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features
tqdm.pandas()
import warnings
warnings.filterwarnings('ignore') 


import sys
sys.path.append('../src/')
from models.classification_methods import process_classification 

### Definitions

In [2]:
# Identifier of the pretrained language model behind the embedding files
# (presumably a Hugging Face hub id — confirm). Further down, this name with
# "/" replaced by "_" is embedded in the processed parquet file names.
model_name = 'neuralmind/bert-base-portuguese-cased'

In [3]:
# Global random seed, meant to keep the stochastic steps of the experiments reproducible.
random_seed = 42

In [4]:
# Directory prefixes, relative to the notebook's working directory.
# Each one ends with "/" because file paths are built from them by plain
# string concatenation.
# Raw CSV inputs:
raw_data_path = '../data/raw/'
# Processed parquet files (embeddings):
processed_data_path = '../data/processed/'
# Report output folders.
# NOTE(review): generate_results() derives these two sub-folders from its own
# `reports_path` argument instead of reading these variables.
results_cr_path = '../reports/classification_reports/'
test_results_path = '../reports/test_results/'

In [5]:
# Target identifiers: one train file and one test file are read per entry.
# NOTE(review): the meaning of each two-letter code is not documented in this notebook.
target_list = ['ig','bo', 'cl', 'co', 'gl', 'lu']

## Classification

In [6]:
def read_pandas(
    path,
    file_type = 'csv',
    read_data_args = None
):
    """Load a tabular file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.
    file_type : str, default 'csv'
        Format of the file: 'csv' (read with ``pd.read_csv``) or
        'parquet' (read with ``pd.read_parquet``).
    read_data_args : dict, optional
        Extra keyword arguments forwarded unchanged to the pandas reader.
        ``None`` (the default) means "no extra arguments".

    Returns
    -------
    pandas.DataFrame
        The content of the file.

    Raises
    ------
    ValueError
        If ``file_type`` is not one of the supported formats.
    """
    # A None default avoids sharing one mutable dict between calls.
    reader_kwargs = {} if read_data_args is None else read_data_args

    # Dispatch table: supporting a new format is a one-line addition.
    readers = {
        'csv': pd.read_csv,
        'parquet': pd.read_parquet,
    }

    reader = readers.get(file_type)
    if reader is None:
        # Without this guard an unknown type surfaced as an UnboundLocalError.
        raise ValueError(
            f'Unsupported file_type {file_type!r}; expected one of {sorted(readers)}'
        )

    return reader(path, **reader_kwargs)

def create_train_test_tuples(
    list_train_paths,
    list_test_paths,
    target_list,
    n_jobs = -1, 
    file_type = 'csv',
    read_data_args = None
):
    """Read matching train/test files in parallel, one pair per target.

    Parameters
    ----------
    list_train_paths : sequence of str
        Paths of the training files.
    list_test_paths : sequence of str
        Paths of the test files, aligned position-by-position with
        ``list_train_paths``.
    target_list : sequence
        Target identifiers, aligned with the two path lists.
    n_jobs : int, default -1
        Passed to ``joblib.Parallel``.
    file_type : str, default 'csv'
        Forwarded to ``read_pandas`` for every file.
    read_data_args : dict, optional
        Reader keyword arguments forwarded to ``read_pandas`` for every file.
        ``None`` (the default) means "no extra arguments".

    Returns
    -------
    list of tuple
        One ``(train_data, test_data, target)`` tuple per target.

    Raises
    ------
    ValueError
        If the three input sequences do not all have the same length.
    """
    n_train, n_test, n_targets = len(list_train_paths), len(list_test_paths), len(target_list)

    if not (n_train == n_test == n_targets):
        raise ValueError(
            'The lists are not the same length '
            f'(train paths: {n_train}, test paths: {n_test}, targets: {n_targets})'
        )

    # Normalise here so read_pandas always receives a real dict.
    reader_kwargs = {} if read_data_args is None else read_data_args

    def _load_pair(train_path, test_path, target):
        # Read one train/test pair and keep its target label next to the data.
        return (
            read_pandas(train_path, file_type = file_type, read_data_args = reader_kwargs),
            read_pandas(test_path, file_type = file_type, read_data_args = reader_kwargs),
            target,
        )

    # One joblib task per target, so the progress total is the number of targets.
    with joblib_progress('Reading data ...', total = n_targets):
        parallel = Parallel(n_jobs = n_jobs)
        list_tuples = parallel(
            delayed(_load_pair)(train_path, test_path, target)
            for train_path, test_path, target in zip(list_train_paths, list_test_paths, target_list)
        )

    return list_tuples

def generate_results(
    data_tuples_list,
    corpus_name,
    X_col,
    clf,
    reports_path = '../reports/',
    estimator_name = None
):
    """Run one classification experiment and save its two result tables as CSV.

    Parameters
    ----------
    data_tuples_list : list
        Passed to ``process_classification`` as ``data_tuples``.
    corpus_name : str
        Corpus label used in the output file names.
    X_col : list of str
        Feature column names, passed to ``process_classification`` as ``X_cols``.
    clf : dict
        Keyword arguments expanded into ``process_classification``. Its
        'estimator' entry is also used to derive a default ``estimator_name``.
    reports_path : str, default '../reports/'
        Root folder of the reports, with or without a trailing separator.
        The ``classification_reports`` and ``test_results`` sub-folders are
        created on demand.
    estimator_name : str, optional
        Label used in the output file names. Defaults to the class name of
        ``clf['estimator']``.

    Returns
    -------
    tuple
        ``(df_cr, df_test_results)`` exactly as returned by
        ``process_classification``.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    if estimator_name is None:
        estimator_name = clf.get('estimator').__class__.__name__

    # os.path.join keeps the paths right even without a trailing slash.
    results_cr_dir = os.path.join(reports_path, 'classification_reports')
    test_results_dir = os.path.join(reports_path, 'test_results')

    # Create the folders before training so a missing directory cannot
    # discard the results of a long run.
    for out_dir in (results_cr_dir, test_results_dir):
        os.makedirs(out_dir, exist_ok = True)

    df_cr, df_test_results = process_classification(
        **clf,
        data_tuples = data_tuples_list,
        X_cols = X_col
    )

    # Embedding column sets have too many names to join, so they use a fixed tag.
    # NOTE(review): every embedding column set maps to 'emb', so two such sets
    # for the same corpus and estimator share a file name and the later run
    # overwrites the earlier one. Names are kept unchanged here because other
    # code may read these files by name.
    if X_col and 'emb' in X_col[0]:
        str_cols = 'emb'
    else:
        str_cols = '_'.join(X_col)

    file_prefix = f'{estimator_name}_{corpus_name}_{str_cols}'
    df_cr.to_csv(os.path.join(results_cr_dir, f'{file_prefix}_classification_report.csv'))
    df_test_results.to_csv(os.path.join(test_results_dir, f'{file_prefix}_test_results.csv'))

    return df_cr, df_test_results

In [7]:
# Raw "top mentioned timelines" CSVs: the template takes the split name first,
# then the target. One train list and one test list are built from it.
top_ment_time_path = raw_data_path + '{}_r3_{}_top_mentioned_timelines.csv'
list_train_paths_tmt, list_test_paths_tmt = (
    [top_ment_time_path.format(split, target) for target in target_list]
    for split in ("train", "test")
)

In [8]:
# Raw users CSVs: here the template takes the target first, then the split name.
users_path = raw_data_path + 'r3_{}_{}_users.csv'
list_train_paths_users, list_test_paths_users = (
    [users_path.format(target, split) for target in target_list]
    for split in ("train", "test")
)

In [9]:
# Model whose name is part of the embedding file names; "/" is replaced by "_"
# when the name is inserted into a path.
model_name = 'neuralmind/bert-base-portuguese-cased'

# Embedded "top mentioned timelines" parquet files (split, target, model).
top_ment_time_emb_path = processed_data_path + '{}_r3_{}_top_mentioned_timelines_{}.parquet'
list_train_paths_tmt_emb, list_test_paths_tmt_emb = (
    [
        top_ment_time_emb_path.format(split, target, model_name.replace("/", "_"))
        for target in target_list
    ]
    for split in ("train", "test")
)


# Embedded users parquet files (target, split, model).
users_emb_path = processed_data_path + 'r3_{}_{}_users_{}.parquet'
list_train_paths_users_emb, list_test_paths_users_emb = (
    [
        users_emb_path.format(target, split, model_name.replace("/", "_"))
        for target in target_list
    ]
    for split in ("train", "test")
)

In [10]:
# Number of embedding columns per text field: columns are named
# <field>_emb_1 ... <field>_emb_768.
embedding_dim = 768


def _new_xgb_classifier():
    """Return a fresh XGBClassifier with the settings shared by all experiments."""
    return XGBClassifier(
        # Seed comes from the config cell instead of a repeated literal.
        random_state = random_seed,
        # NOTE(review): 3 is XGBoost's most verbose level and floods the cell output.
        verbosity = 3,
        # NOTE(review): requires a CUDA-capable GPU on the machine running the notebook.
        device = 'cuda',
        tree_method = 'hist'
    )


def _emb_cols(field):
    """Return the embedding column names of one text field."""
    return [f'{field}_emb_{i + 1}' for i in range(embedding_dim)]


# Classifier configurations for the raw-text corpora.
# Each entry is expanded as keyword arguments into process_classification.
clf_to_test = {
    'dummy': {
        'estimator': DummyClassifier()
    },
    'tfidf_xgb':{
        'preprocessing': TfidfVectorizer(
                    stop_words = stopwords.words('portuguese'),
                    lowercase = True,
                    # ngram_range = (1,3),
                    # max_features=50
                    
                    ),
        'scaling': MaxAbsScaler(),
        'estimator': _new_xgb_classifier()
    }
}

# Classifier configurations for the embedding corpora (no text preprocessing step).
clf_to_test_emb = {
    'bertimbau_xgb':{
        'scaling': MaxAbsScaler(),
        'estimator': _new_xgb_classifier()
    }
}


# One entry per corpus.
# X_cols_comb: possible combinations of X_col
config_experiments_dict = {
    'top_mentioned_timelines':{
        'list_train_paths': list_train_paths_tmt,
        'list_test_paths' : list_test_paths_tmt,
        'file_type': 'csv',
        'read_data_args' : {'sep': ';', 'encoding': 'utf-8-sig'},
        'X_cols_comb': [['Texts']],
        'clf_to_test': clf_to_test
    },
    'users':{
        'list_train_paths': list_train_paths_users,
        'list_test_paths' : list_test_paths_users,
        'file_type': 'csv',
        'read_data_args' : {'sep': ';', 'encoding': 'utf-8-sig'},
        'X_cols_comb': [['Timeline'], ['Stance']],
        'clf_to_test': clf_to_test
    },
    'users_emb':{
        'list_train_paths': list_train_paths_users_emb,
        'list_test_paths' : list_test_paths_users_emb,
        'file_type': 'parquet',
        'read_data_args' : {},
        'X_cols_comb': [
            _emb_cols('Timeline'),
            _emb_cols('Stance')
            ],
        'clf_to_test': clf_to_test_emb
    },
    'top_mentioned_timelines_emb':{
        'list_train_paths': list_train_paths_tmt_emb,
        'list_test_paths' : list_test_paths_tmt_emb,
        'file_type': 'parquet',
        'read_data_args' : {},
        'X_cols_comb': [
            _emb_cols('Texts')
            ],
        'clf_to_test': clf_to_test_emb
    }
}

In [11]:
# Run every (column set, classifier) combination for every configured corpus.
for corpus, config in config_experiments_dict.items():

    print(f'##### Start of {corpus} - {datetime.today()} #####')

    # Read the corpus once; all column sets and classifiers below reuse it.
    # Indexing instead of .get() makes a missing config key fail immediately
    # with a KeyError instead of passing None further down.
    data_tuples_list = create_train_test_tuples(
        list_train_paths = config['list_train_paths'],
        list_test_paths = config['list_test_paths'],
        target_list = target_list,
        file_type = config['file_type'],
        read_data_args = config['read_data_args']
    )

    for X_col in config['X_cols_comb']:

        for clf_name, clf in config['clf_to_test'].items():

            print(f'- Running combination {X_col}')

            generate_results(
                data_tuples_list = data_tuples_list,
                corpus_name = corpus,
                X_col = X_col,
                clf = clf,
                reports_path = '../reports/',
                estimator_name = clf_name
            )

    # Printed once per corpus, after all column sets have run. It used to sit
    # inside the X_col loop and fired after every column set.
    print(f'##### End of {corpus} - {datetime.today()} #####\n\n\n\n')

Output()

##### Start of top_mentioned_timelines - 2024-04-21 19:30:06.682840 #####


- Running combination ['Texts']
Training ...
[Pipeline] ..... (step 1 of 5) Processing preprocessing, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Training ...
[Pipeline] ..... (step 1 of 5) Processing preprocessing, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Training ...
[Pipeline] ..... (step 1 of 5) Processing preprocessing, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipel

Output()

##### End of top_mentioned_timelines - 2024-04-21 19:35:39.474175 #####




##### Start of users - 2024-04-21 19:35:39.474231 #####


- Running combination ['Timeline']
Training ...
[Pipeline] ..... (step 1 of 5) Processing preprocessing, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Training ...
[Pipeline] ..... (step 1 of 5) Processing preprocessing, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Training ...
[Pipeline] ..... (step 1 of 5) Processing preprocessing, total=   0.0s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pi