In [None]:
# from google.colab import drive
# drive.mount('/content/drive',force_remount=True)


In [None]:
# %cd drive/My\ Drive/SIC\ -\ BigData/8.\ Project/

In [None]:
# %ls

## Import packages

In [None]:
# !pip install evaluate
# !pip install -U accelerate
!pip install datasets
# !pip install lxml
# !pip install -U pandas
# !pip install numpy
# !pip install transformers
# !pip install torch
# !pip install --upgrade pyarrow
# !pip install scikit-learn
# !pip install tensorboardX
# !pip install seaborn
# !pip install spacy
# !pip install xgboost
# !pip install beautifulsoup4

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, AdamW, AutoModel, AutoModelForSequenceClassification, get_linear_schedule_with_warmup, AutoTokenizer
from transformers import AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch import nn
import spacy
import re
import string
import string
from bs4 import BeautifulSoup
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = STOP_WORDS
punctuations = string.punctuation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## Prepare DataSet

In [None]:
# Checkpoint identifier; the tokenizer is loaded here and the same name is
# reused later when the classification model is instantiated.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Financial PhraseBank, 'sentences_50agree' configuration.
# NOTE(review): trust_remote_code=True allows the dataset's loading script to run
# locally - confirm this source is trusted.
ds = load_dataset('takala/financial_phrasebank', 'sentences_50agree', trust_remote_code=True)

# Work on the 'train' split as a DataFrame, exposing the sentence column as 'text'.
# Duplicate sentences are not removed.
df = ds['train'].to_pandas().rename(columns={'sentence': 'text'})

# 0 = negative
# 1 = neutral
# 2 = positive

In [None]:
df

Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2
...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,0
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,1
4843,Operating profit fell to EUR 35.4 mn from EUR ...,0
4844,Net sales of the Paper segment decreased to EU...,0


In [None]:
# Stratified splits with a fixed seed:
#   1) hold out 20% of all rows as the test set,
#   2) hold out 20% of the remainder as the validation set.
temp_df, test_df = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
    stratify=df['label'],
)
train_df, val_df = train_test_split(
    temp_df,
    test_size=0.20,
    random_state=42,
    stratify=temp_df['label'],
)

In [None]:
# Convert each DataFrame split back into a Dataset. The index is reset first so
# the shuffled pandas index is not carried over as an extra column.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, val_df, test_df)
)

In [None]:
# Bundle the three splits under the keys used by the rest of the notebook.
dataset = DatasetDict({
    'train': train_dataset,
    'val': val_dataset,
    'test': test_dataset,
})

## Preprocessing DataSet

#### A: Clean 'text' columns
- Prerequisite: install the spaCy English model by running `python -m spacy download en_core_web_sm`

**Creating our text-cleaning function - strips HTML markup, bracketed spans and links (lemmatization, lowercasing and removal of "-PRON-"s, stop_words and punctuations are not applied)**

In [None]:
# nlp = English()
# Load spaCy's 'en_core_web_sm' pipeline into `nlp`.
# Note: the clean_text function defined below does not call `nlp`.
nlp = spacy.load('en_core_web_sm')
def clean_text(sentence):
    """Strip markup, bracketed spans and links from a single sentence.

    Steps, in order:
      1. Parse the input with BeautifulSoup and keep only its text content.
      2. Delete every ``[...]`` span, brackets included.
      3. Delete every whitespace-delimited run starting at ``http`` or ``www``.

    No lemmatization, lowercasing, stop-word or punctuation removal is applied,
    and surrounding whitespace is left untouched.

    Args:
        sentence: Raw input string.

    Returns:
        The cleaned string.
    """
    # '<html>' is appended before parsing.
    # NOTE(review): presumably this stops BeautifulSoup from warning when the
    # input looks like a bare URL or file name - confirm.
    plain_text = BeautifulSoup(sentence+'<html>', "html.parser").get_text()

    # Drop bracketed spans such as "[1]" or "[source]".
    without_brackets = re.sub(r'\[[^]]*\]', '', plain_text)

    # Drop links.
    without_links = re.sub(r'http\S+|www\S+|https\S+', '', without_brackets, flags=re.MULTILINE)

    return without_links



In [None]:
print(clean_text('$QCOM chart: Qualcomm Short 61.8% Retracement, Trendline and Resistance. https://t.co/2WQqh35KOY'))

$QCOM chart: Qualcomm Short 61.8% Retracement, Trendline and Resistance. 


**Clean_text**

In [None]:
def preprocess_function(examples):
    """Clean the 'text' column of a batch.

    Args:
        examples: Batch mapping with a 'text' entry holding an iterable of strings.

    Returns:
        The same `examples` object, with its 'text' entry replaced by a list of
        the cleaned strings.
    """
    cleaned_texts = list(map(clean_text, examples['text']))
    examples['text'] = cleaned_texts
    return examples
cleaned_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

Map:   0%|          | 0/776 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

In [None]:
# Tokenization step applied to every split
def tokenize(batch):
    """Tokenize the 'text' entries of a batch with the notebook-level tokenizer.

    Sequences are truncated to at most 64 tokens and padding is enabled.

    Args:
        batch: Batch mapping with a 'text' entry.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=64)
    return encoded

encoded_dataset = cleaned_dataset.map(tokenize, batched=True,batch_size=None)

Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

Map:   0%|          | 0/776 [00:00<?, ? examples/s]

Map:   0%|          | 0/970 [00:00<?, ? examples/s]

In [None]:
encoded_dataset = encoded_dataset.remove_columns('text')

In [None]:
encoded_dataset['test']

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 970
})

## Model

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [None]:
# Set the label names explicitly so the model config matches the dataset's
# encoding (0 = negative, 1 = neutral, 2 = positive) instead of whatever label
# names the checkpoint ships with.
# NOTE(review): the checkpoint's pretrained classification head may use a different
# class order; fine-tuning re-learns the mapping, but confirm this is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    id2label={0: 'negative', 1: 'neutral', 2: 'positive'},
    label2id={'negative': 0, 'neutral': 1, 'positive': 2},
).to(device)

**Define Trainer parameters**

In [None]:
# Training configuration
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir='./results',

    # Optimisation settings
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # Evaluate and checkpoint once per epoch; progress bars stay enabled
    eval_strategy='epoch',
    save_strategy="epoch",
    disable_tqdm=False,

    # Best checkpoint = lowest validation loss; it is reloaded when training ends
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=True,

    # Not set here (library defaults apply): fp16, overwrite_output_dir, seed,
    # local_rank, gradient_accumulation_steps, warmup_steps
)

# Metrics reported at each evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the class is taken as the argmax over the last axis).

    Returns:
        dict with keys "accuracy" and "f1" (F1 averaged with average="weighted").
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

# Trainer
# The custom AdamW optimizer now takes its learning rate and weight decay from
# `training_args`, so the values declared there are the ones actually applied.
# Previously the optimizer hard-coded lr=5e-5, which silently overrode the
# learning_rate=2e-5 set in TrainingArguments. To reproduce a run made with the
# old setting, set learning_rate=5e-5 in TrainingArguments.
# Training stops early once validation loss fails to improve for one evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['val'],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=.0)],
    optimizers=(
        torch.optim.AdamW(
            model.parameters(),
            lr=training_args.learning_rate,
            weight_decay=training_args.weight_decay,
            eps=1e-5,
            betas=(0.9, 0.999),
        ),
        None,  # no scheduler supplied; the Trainer creates its own
    ),
)

In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.426852,0.83634,0.837665
2,0.479800,0.503605,0.864691,0.863644


TrainOutput(global_step=776, training_loss=0.4176526217116523, metrics={'train_runtime': 126.0021, 'train_samples_per_second': 246.028, 'train_steps_per_second': 30.793, 'total_flos': 203912898739200.0, 'train_loss': 0.4176526217116523, 'epoch': 2.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.426851749420166,
 'eval_accuracy': 0.836340206185567,
 'eval_f1': 0.837665063084125,
 'eval_runtime': 3.0151,
 'eval_samples_per_second': 257.374,
 'eval_steps_per_second': 32.172,
 'epoch': 2.0}

In [None]:
trainer.predict(encoded_dataset['test'])

PredictionOutput(predictions=array([[-2.0681002 ,  3.46236   , -0.42978048],
       [-1.9544924 ,  3.439231  , -1.1881362 ],
       [-3.0843172 ,  2.4484136 ,  1.4423581 ],
       ...,
       [-2.5338833 , -0.62401015,  3.5271802 ],
       [-2.8162806 , -0.13133526,  3.4011376 ],
       [-2.5982037 ,  3.6801853 , -0.4244549 ]], dtype=float32), label_ids=array([1, 1, 1, 2, 0, 1, 2, 2, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2,
       2, 2, 1, 0, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 0, 1, 1,
       1, 2, 2, 1, 1, 1, 0, 1, 2, 1, 0, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 2, 2, 1, 1, 0, 2, 0, 1, 1, 1, 2, 1, 0, 2, 1, 2, 1, 1,
       2, 1, 1, 0, 0, 2, 2, 1, 1, 1, 2, 0, 1, 2, 0, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 0, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 0, 1, 0, 1, 2, 0, 1, 2,
       0, 1, 0, 2, 2, 2, 1, 2, 1, 1, 1, 2, 0, 1, 2, 2, 0, 2, 1, 2, 1, 2,
       0, 1, 2, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 0, 2, 1, 2,