## Setup

### imports

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline
from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import extract_features

In [None]:
# Fetch the NLTK 'stopwords' corpus; later cells call
# stopwords.words('portuguese') when building the TF-IDF vectorizers.
import nltk
nltk.download('stopwords')

In [None]:
# Add the sibling ../src/ directory to the import path so the project-local
# `models` package can be imported; the append must happen before the import.
import sys
sys.path.append('../src/')
from models.classification_methods import process_classification 

### definitions

In [None]:
# Data directories, relative to the notebook's working directory.
# path_raw_data is the prefix for every CSV read below; path_processed_data is
# not referenced by the cells visible in this part of the notebook.
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'

In [None]:
# Target identifiers; each one is substituted into the train/test CSV file
# names loaded below and kept as the third element of every data tuple.
# NOTE(review): the meaning of each two-letter code is not shown here - confirm.
list_target = ['ig','bo', 'cl', 'co', 'gl', 'lu']

In [None]:
# NOTE(review): neither `corpus` nor `model_name` is used by the cells visible
# in this part of the notebook - confirm they are needed.
corpus = 'ig'
# `model_name` is assigned twice in a row: the second assignment overrides the
# first, so the effective value is 'neuralmind/bert-base-portuguese-cased'.
model_name = 'facebook/fasttext-pt-vectors'
model_name = 'neuralmind/bert-base-portuguese-cased'

## Classification

### Top mentioned timelines

In [None]:
# Build one (data_train, data_test, target) tuple per entry of list_target
# from the "top mentioned timelines" CSV files found under path_raw_data.
list_tuples_top_ment = []

for target in tqdm(list_target):
    # Train and test files share the same naming scheme, differing by prefix.
    path_data_train, path_data_test = (
        path_raw_data + f'train_r3_{target}_top_mentioned_timelines.csv',
        path_raw_data + f'test_r3_{target}_top_mentioned_timelines.csv',
    )

    # Both files are ';'-separated and decoded as 'utf-8-sig'; the train file
    # is read first, then the test file.
    data_train, data_test = (
        pd.read_csv(csv_path, sep=';', encoding='utf-8-sig')
        for csv_path in (path_data_train, path_data_test)
    )

    list_tuples_top_ment.append((data_train, data_test, target))

In [None]:
# Value forwarded as `X_cols` to process_classification for the top-mentioned
# timelines run. NOTE(review): presumably the name of the text column in those
# CSV files - confirm against the data.
X_cols = 'Texts'

In [None]:
# Pipeline components for the top-mentioned-timelines experiment.

# TF-IDF over lowercased text, 1- to 3-grams, NLTK Portuguese stop words
# removed, vocabulary capped at 50,000 features.
text_vect = TfidfVectorizer(
    stop_words=stopwords.words('portuguese'),
    lowercase=True,
    ngram_range=(1, 3),
    max_features=50000,
)

# No resampling and no feature selection in this run.
# NOTE(review): `sampling` is defined but not forwarded to
# process_classification below - confirm whether that is intended.
sampling = selection = None

scaling = MaxAbsScaler()

# XGBoost with the histogram tree method on a CUDA device; verbosity=3 is the
# most detailed logging level.
estimator = XGBClassifier(
    random_state=42,
    verbosity=3,
    device='cuda',
    tree_method='hist',
)

# Run the project's classification routine over every
# (data_train, data_test, target) tuple.
df_cr, df_test_results = process_classification(
    estimator=estimator,
    vectorizer=text_vect,
    scaling=scaling,
    selection=selection,
    data_tuples=list_tuples_top_ment,
    X_cols=X_cols,
)

# Show only the 'macro avg' rows, ordered by ascending f1-score.
df_cr.loc[df_cr['class'].eq('macro avg')].sort_values(by='f1-score')

### Users

In [None]:
# Build one (data_train, data_test, target) tuple per entry of list_target
# from the per-user CSV files found under path_raw_data.
list_tuples_users = []

for target in tqdm(list_target):
    # Train and test files share the same naming scheme, differing by infix.
    path_data_train, path_data_test = (
        path_raw_data + f'r3_{target}_train_users.csv',
        path_raw_data + f'r3_{target}_test_users.csv',
    )

    # Both files are ';'-separated and decoded as 'utf-8-sig'; the train file
    # is read first, then the test file.
    data_train, data_test = (
        pd.read_csv(csv_path, sep=';', encoding='utf-8-sig')
        for csv_path in (path_data_train, path_data_test)
    )

    list_tuples_users.append((data_train, data_test, target))

#### Timelines

In [None]:
# Value forwarded as `X_cols` to process_classification for the user-timeline
# run. NOTE(review): presumably the name of the timeline text column in the
# users CSV files - confirm against the data.
X_cols = 'Timeline'

In [None]:
# Pipeline components for the user-timeline experiment.

# TF-IDF over lowercased text, 1- to 3-grams, NLTK Portuguese stop words
# removed, vocabulary capped at 50,000 features.
text_vect = TfidfVectorizer(
    stop_words=stopwords.words('portuguese'),
    lowercase=True,
    ngram_range=(1, 3),
    max_features=50000,
)

# No resampling and no feature selection in this run.
# NOTE(review): `sampling` is defined but not forwarded to
# process_classification below - confirm whether that is intended.
sampling = selection = None

scaling = MaxAbsScaler()

# XGBoost with the histogram tree method on a CUDA device; verbosity=3 is the
# most detailed logging level.
estimator = XGBClassifier(
    random_state=42,
    verbosity=3,
    device='cuda',
    tree_method='hist',
)

# Run the project's classification routine over every
# (data_train, data_test, target) tuple of the users data.
df_cr, df_test_results = process_classification(
    estimator=estimator,
    vectorizer=text_vect,
    scaling=scaling,
    selection=selection,
    data_tuples=list_tuples_users,
    X_cols=X_cols,
)

# Show only the 'macro avg' rows, ordered by ascending f1-score.
df_cr.loc[df_cr['class'].eq('macro avg')].sort_values(by='f1-score')

#### Stance

In [None]:
# Value forwarded as `X_cols` to process_classification for the stance run.
# NOTE(review): presumably the name of a text column in the users CSV files,
# since it is fed to a TF-IDF vectorizer - confirm against the data.
X_cols = 'Stance'

In [None]:
# Pipeline components for the user-stance experiment.

# TF-IDF over lowercased text, 1- to 3-grams, NLTK Portuguese stop words
# removed, vocabulary capped at 50,000 features.
text_vect = TfidfVectorizer(
    stop_words=stopwords.words('portuguese'),
    lowercase=True,
    ngram_range=(1, 3),
    max_features=50000,
)

# No resampling and no feature selection in this run.
# NOTE(review): `sampling` is defined but not forwarded to
# process_classification below - confirm whether that is intended.
sampling = selection = None

scaling = MaxAbsScaler()

# XGBoost with the histogram tree method on a CUDA device; verbosity=3 is the
# most detailed logging level.
estimator = XGBClassifier(
    random_state=42,
    verbosity=3,
    device='cuda',
    tree_method='hist',
)

# Run the project's classification routine over every
# (data_train, data_test, target) tuple of the users data.
df_cr, df_test_results = process_classification(
    estimator=estimator,
    vectorizer=text_vect,
    scaling=scaling,
    selection=selection,
    data_tuples=list_tuples_users,
    X_cols=X_cols,
)

# Show only the 'macro avg' rows, ordered by ascending f1-score.
df_cr.loc[df_cr['class'].eq('macro avg')].sort_values(by='f1-score')