In [1]:
import os
import random
from ast import literal_eval

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from belt_nlp.bert_with_pooling import BertClassifierWithPooling

tqdm.pandas()
random.seed(0)

In [2]:
# Directory layout, relative to the notebook's working directory.
raw_data_path = "../data/raw/"
processed_data_path = "../data/processed/"
reports_path = "../reports/"

# File-name templates; `{target}` and `{split}` are filled in later with str.format.
file_format_users_filtered = "".join(
    (processed_data_path, "r3_{target}_{split}_users_scored_Timeline.csv")
)
file_format_tmt_filtered = "".join(
    (processed_data_path, "{split}_r3_{target}_top_mentioned_timelines_scored_Texts.csv")
)

In [3]:
# Target identifiers; one classifier is trained and evaluated per entry.
target_list = ['ig', 'bo', 'cl', 'co', 'gl', 'lu']

In [4]:
# Experiment registry: experiment name -> settings read by the training loop.
#   text_col     suffix of the `comments_and_scores_<text_col>` column to read;
#                also the name of the joined-text column that gets created
#   n_comments   how many trailing (score, comment) pairs are kept per row
#   file_format  path template with `{target}` / `{split}` placeholders
#
# Currently disabled variants (same shape as the entries below):
#   filtered_Texts5  / filteredTimeline5     n_comments = 5
#   filtered_Texts10 / filteredTimeline10    n_comments = 10
#   filtered_Texts15 / filteredTimeline15    n_comments = 15
dict_experiments = {
    'filtered_Texts100': dict(
        text_col='Texts',
        n_comments=100,
        file_format=file_format_tmt_filtered,
    ),
    # NOTE(review): the key says 400 but n_comments is 100 -- confirm which is intended.
    'filteredTimeline400': dict(
        text_col='Timeline',
        n_comments=100,
        file_format=file_format_users_filtered,
    ),
}

# Example - Model BERT with pooling

In this notebook we show how to use the basic `fit` and `predict_classes` methods of the BERT model with pooling.

In [5]:
!export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [6]:
# Pre-trained checkpoint and training hyper-parameters.
bert_model_name = 'pablocosta/bertabaporu-base-uncased'
epochs = 3
batch_size = 3

# Keyword arguments unpacked into the BertClassifierWithPooling constructor.
MODEL_PARAMS = dict(
    batch_size=batch_size,
    learning_rate=5e-5,
    epochs=epochs,
    chunk_size=510,
    stride=510,
    minimal_chunk_length=510,
    pooling_strategy="mean",
    pretrained_model_name_or_path=bert_model_name,
)

In [7]:
# Preview the slash-free form of the model name that is embedded in result file names.
"_".join(bert_model_name.split("/"))

'pablocosta_bertabaporu-base-uncased'

In [8]:
def _load_split(file_format, target, split, text_col, n_comments):
    """Load one split of a target's dataset and prepare it for the classifier.

    Args:
        file_format: Path template with ``{target}`` and ``{split}`` placeholders.
        target: Target identifier substituted into the template.
        split: Split name substituted into the template ("train" or "test").
        text_col: Suffix of the ``comments_and_scores_<text_col>`` column; also the
            name of the column that receives the joined text.
        n_comments: Number of trailing (score, comment) pairs kept per row.

    Returns:
        The DataFrame with the scored column parsed into Python objects,
        ``text_col`` holding the kept comments joined by " # ", and ``Polarity``
        mapped to booleans ("against" -> False, "for" -> True; any other value
        becomes NaN).
    """
    scored_col = f'comments_and_scores_{text_col}'
    df = pd.read_csv(
        file_format.format(target=target, split=split),
        sep=';',
        encoding='utf-8-sig',
    )
    # The column is stored as the text repr of a list of (score, comment) pairs.
    df[scored_col] = df[scored_col].progress_apply(literal_eval)
    df[text_col] = df[scored_col].progress_apply(
        lambda pairs: " # ".join([comment for _score, comment in pairs[-n_comments:]])
    )
    df['Polarity'] = df['Polarity'].map({"against": False, "for": True})
    return df


# Computed once up front: nesting the same quote type inside an f-string
# (as in f'...{name.replace('/','_')}...') is a SyntaxError before Python 3.12.
model_tag = bert_model_name.replace('/', '_')

# Make sure the output folder exists before any (expensive) training starts.
results_dir = f'{reports_path}test_results/'
os.makedirs(results_dir, exist_ok=True)

# Results of every experiment, so the analysis cells below see all of them.
all_responses = []

for exp_name, config in dict_experiments.items():

    print(f"""####################################  
# Running {exp_name}
#####################################""")

    # get configs of experiments
    text_col = config['text_col']
    n_comments = config['n_comments']
    file_format = config['file_format']

    list_responses = []
    for target in target_list:

        print(target)

        # read data
        train = _load_split(file_format, target, "train", text_col, n_comments)
        test = _load_split(file_format, target, "test", text_col, n_comments)

        X_train = train[text_col].tolist()
        X_test = test[text_col].tolist()

        y_train = train["Polarity"].tolist()
        y_test = test["Polarity"].tolist()

        # Release cached GPU blocks left over from earlier work before allocating a new model.
        torch.cuda.empty_cache()

        model = BertClassifierWithPooling(**MODEL_PARAMS, device="cuda:0")
        try:
            model.fit(X_train, y_train, epochs=epochs)
            y_pred = model.predict_classes(X_test)
        finally:
            # Drop the model even when training/prediction fails, so a retry
            # does not start with the previous model still referenced.
            del model
            torch.cuda.empty_cache()

        df_responses = pd.DataFrame({
            "y_test": y_test,
            "y_pred": y_pred
        })

        df_responses['target'] = target
        df_responses['exp_name'] = exp_name
        df_responses['n_comments'] = n_comments
        df_responses['text_col'] = text_col

        print(classification_report(y_test, y_pred))

        # Persist each target as soon as it finishes so partial progress survives a crash.
        df_responses.to_csv(
            f'{results_dir}belt_{exp_name}_{model_tag}_test_results_part_{target}.csv'
        )

        list_responses.append(df_responses)

    # One combined file per experiment (previously only the last experiment got one,
    # because this step sat outside the experiment loop).
    df_responses_exp = pd.concat(list_responses)
    df_responses_exp.to_csv(f'{results_dir}belt_{exp_name}_{model_tag}_test_results.csv')
    all_responses.append(df_responses_exp)

# Predictions of all experiments and targets, used by the analysis cells below.
df_responses_final = pd.concat(all_responses)


####################################  
# Running filtered_Texts100
#####################################
ig


100%|██████████| 1758/1758 [00:57<00:00, 30.61it/s]
100%|██████████| 1758/1758 [00:00<00:00, 29707.65it/s]
100%|██████████| 599/599 [00:20<00:00, 29.57it/s]
100%|██████████| 599/599 [00:00<00:00, 34897.67it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 5.79 GiB of which 4.12 MiB is free. Including non-PyTorch memory, this process has 5.73 GiB memory in use. Of the allocated memory 5.58 GiB is allocated by PyTorch, and 62.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# 1 where the prediction equals the gold label, 0 otherwise.
# Vectorized comparison instead of a row-wise `apply(axis=1)`.
# NOTE(review): assumes `y_test` / `y_pred` hold plain boolean values -- confirm.
df_responses_final['hit'] = (
    df_responses_final['y_test'] == df_responses_final['y_pred']
).astype(int)

# Show a small sample rather than the whole frame.
df_responses_final.head()

In [None]:
# Display the collected per-sample predictions.
df_responses_final

In [None]:
# Accuracy (mean of `hit`) for each `n_comments` setting.
# Select the column explicitly: `.mean('hit')` would pass 'hit' as the
# `numeric_only` argument instead of choosing which column to average.
df_responses_final.groupby('n_comments')[['hit']].mean()