In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import classification_report
from transformers import set_seed
import os
import sys
from tqdm import tqdm 

tqdm.pandas()


sys.path.append('src/')
from data.lambdas import int_to_label, label_to_int
set_seed(42)

# Define paths (adaptado do seu código existente)
raw_data_path = 'data/raw/'
processed_data_path = 'data/processed/'
reports_path = 'reports/'
file_format_tmt = processed_data_path + "{split}_r3_{target}_top_mentioned_timelines_processed.csv"
file_format_users = processed_data_path + 'r3_{target}_{split}_users_processed.csv'
file_format_users_scored = processed_data_path + 'r3_{target}_{split}_users_scored_Timeline.csv'
file_format_tmt_scored = processed_data_path + '{split}_r3_{target}_top_mentioned_timelines_scored_Texts.csv'

# Target list
target_list = ['lu']

dict_exps = {
    "Timeline": {
        'path_dataset': file_format_users,
        "text_col": "Timeline",
        "batch_size": 64,
        "epochs": 1000
    },
    "Texts": {
        'path_dataset': file_format_tmt,
        "text_col": "Texts",
        "batch_size": 64,
        "epochs": 1000
    },
}

check_if_already_exists = False

for exp_name, config in dict_exps.items():
    
    print(f"""###########################################
# Running: {exp_name} 
###########################################""")
    
    text_col = config['text_col']
    path_dataset = config['path_dataset']
    
    # Processar cada target
    for target in target_list:
        
        print(f"""######## target: {target}""")
        estimator_name = 'LSTM'
        test_results_path = f"{reports_path}test_results/{estimator_name}_{target}_{exp_name}_test_results.csv"
        train_results_path = f"{reports_path}train_results/{estimator_name}_{target}_{exp_name}_train_results.csv"
        val_results_path = f"{reports_path}val_results/{estimator_name}_{target}_{exp_name}_val_results.csv"
        
        if os.path.isfile(test_results_path) and os.path.isfile(train_results_path) and check_if_already_exists:
            print('# experiment already done')
            continue
        
        # Ler e dividir os dados
        train_val = pd.read_csv(
            path_dataset.format(target=target, split="train"), 
            sep=';', 
            encoding='utf-8-sig'
        ).reset_index()[[text_col, 'Polarity']].rename(columns={text_col: 'text', 'Polarity': 'label'})
        
        train_val.label = train_val.label.apply(lambda x: label_to_int(x))
        
        # train_val.text = train_val.text.progress_apply(lambda x: x[:13000])

        # Check if label is binary
        if len(train_val.label.unique()) != 2:
            raise Exception("There is an error in train_val label transformation: expected to be binary")
        
        train, val = train_test_split(train_val, test_size=0.15, random_state=42)
        train.reset_index(drop=True, inplace=True)
        val.reset_index(drop=True, inplace=True)
        
        test = pd.read_csv(
            path_dataset.format(target=target, split="test"), 
            sep=';', 
            encoding='utf-8-sig'
        ).reset_index()[[text_col, 'Polarity']].rename(columns={text_col: 'text', 'Polarity': 'label'})
        
        test.label = test.label.apply(lambda x: label_to_int(x))
        input_tokens = 1000
        # Tokenize and pad texts
        tokenizer = Tokenizer(num_words=input_tokens)
        tokenizer.fit_on_texts(train['text'])
        
        X_train = tokenizer.texts_to_sequences(train['text'])
        X_val = tokenizer.texts_to_sequences(val['text'])
        X_test = tokenizer.texts_to_sequences(test['text'])
        
        max_length = max([len(x) for x in X_train])
        X_train = pad_sequences(X_train, maxlen=max_length)
        X_val = pad_sequences(X_val, maxlen=max_length)
        X_test = pad_sequences(X_test, maxlen=max_length)
        
        y_train = np.array(train['label'])
        y_val = np.array(val['label'])
        y_test = np.array(test['label'])
        
        # Build LSTM model
        model = Sequential([
            Embedding(input_dim=input_tokens, output_dim=128, input_length=max_length),
            LSTM(64, return_sequences=True),
            LSTM(32),
            Dense(1, activation='sigmoid')
        ])
        
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        
        # Train the model
        model.fit(X_train, y_train, epochs=config['epochs'], batch_size=config['batch_size'], validation_data=(X_val, y_val))
        
        # Evaluate the model
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)
        test_pred = model.predict(X_test)
        
        # Convert probabilities to binary predictions
        train_pred = (train_pred > 0.5).astype(int)
        val_pred = (val_pred > 0.5).astype(int)
        test_pred = (test_pred > 0.5).astype(int)

2024-08-25 14:37:44.037371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-25 14:37:44.054274: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-25 14:37:44.060502: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-25 14:37:44.075962: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


###########################################
# Running: Timeline 
###########################################
######## target: lu


I0000 00:00:1724607583.763284  887405 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724607583.767600  887405 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724607583.772592  887405 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1724607583.777042  887405 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

Epoch 1/1000


2024-08-25 14:39:45.080642: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 304798032 exceeds 10% of free system memory.
2024-08-25 14:39:58.072115: W external/local_tsl/tsl/framework/bfc_allocator.cc:482] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.68GiB (rounded to 1801519104)requested by op StatefulPartitionedCall/sequential_1/lstm_1/CudnnRNNV3
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-08-25 14:39:58.072146: I external/local_tsl/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-08-25 14:39:58.072155: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 36, Chunks in use: 36. 9.0KiB allocated for chunks. 9.0KiB in use in bin. 688B client-requested in use in bin.
2024-08-25 14:39:58.072160: I external/local_tsl/tsl/framew

ResourceExhaustedError: Graph execution error:

Detected at node sequential_1/lstm_1/CudnnRNNV3 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/asyncio/base_events.py", line 639, in run_forever

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/asyncio/base_events.py", line 1985, in _run_once

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 359, in execute_request

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 446, in do_execute

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_887405/1679446756.py", line 128, in <module>

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 108, in one_step_on_data

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 51, in train_step

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/layers/layer.py", line 901, in __call__

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/models/sequential.py", line 212, in call

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/models/functional.py", line 175, in call

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/models/functional.py", line 560, in call

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/layers/layer.py", line 901, in __call__

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/layers/rnn/lstm.py", line 570, in call

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/layers/rnn/rnn.py", line 406, in call

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/layers/rnn/lstm.py", line 537, in inner_loop

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/backend/tensorflow/rnn.py", line 841, in lstm

  File "/home/semcovici/anaconda3/envs/env-stance-pred/lib/python3.12/site-packages/keras/src/backend/tensorflow/rnn.py", line 933, in _cudnn_lstm

OOM when allocating tensor with shape[64,109956,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node sequential_1/lstm_1/CudnnRNNV3}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_one_step_on_iterator_3040]