In [1]:
import transformers
from datasets import Dataset
import pandas as pd
import torch

In [2]:
tokenizer = transformers.AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True, max_length=128)

def tokenize(text):
    return tokenizer(text['text'],padding='max_length',truncation=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
tokenizer

PreTrainedTokenizer(name_or_path='vinai/bertweet-base', vocab_size=64000, model_max_len=128, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'})

In [4]:
df = pd.read_csv('tweets_remaining_09042020_16072020.csv',sep=';')
# df1 = df[df['label']=='POS'].sample(500)
# df2 = df[df['label']=='NEU'].sample(500)
# df3 = df[df['label']=='NEG']
# df = pd.concat([df1,df2,df3])

In [7]:
df.rename(columns={'full_text':'text'},inplace=True)

In [6]:
df = df.dropna()[['text','label']].reset_index(drop=True)
df.rename(columns={'label':'labels'},inplace=True)
df['labels'] = df['labels'].replace({'POS':2,'NEU':1,'NEG':0})

In [7]:
dataset  = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 2046
})

In [8]:
dataset = dataset.shuffle().train_test_split()
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1534
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 512
    })
})

In [9]:
tokenized_datasets = dataset.map(tokenize, batched=True)
tokenized_datasets.remove_columns(['text'])

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1534
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 512
    })
})

In [10]:
small_train_dataset = tokenized_datasets["train"].shuffle()
small_eval_dataset = tokenized_datasets["test"].shuffle()

In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("/fine-tuned/", num_labels=3)

In [9]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch",per_device_train_batch_size=2,
                                 learning_rate=2e-5)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics
)

In [33]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1534
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 2301


Epoch,Training Loss,Validation Loss,Accuracy
1,0.835,0.960425,0.777344
2,0.6134,1.090936,0.769531
3,0.3902,1.112448,0.783203


Saving model checkpoint to test_trainer\checkpoint-500
Configuration saved in test_trainer\checkpoint-500\config.json
Model weights saved in test_trainer\checkpoint-500\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 512
  Batch size = 8
Saving model checkpoint to test_trainer\checkpoint-1000
Configuration saved in test_trainer\checkpoint-1000\config.json
Model weights saved in test_trainer\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to test_trainer\checkpoint-1500
Configuration saved in test_trainer\checkpoint-1500\config.json
Model weights saved in test_trainer\checkpoint-1500\pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequ

TrainOutput(global_step=2301, training_loss=0.6034370697565257, metrics={'train_runtime': 184.6019, 'train_samples_per_second': 24.929, 'train_steps_per_second': 12.465, 'total_flos': 302711987096064.0, 'train_loss': 0.6034370697565257, 'epoch': 3.0})

In [None]:
# trainer.save_model('fine-tuned/')

In [17]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 512
  Batch size = 8


{'eval_loss': 0.774144172668457,
 'eval_accuracy': 0.81640625,
 'eval_runtime': 3.1385,
 'eval_samples_per_second': 163.134,
 'eval_steps_per_second': 20.392}

In [20]:
predict_df = pd.read_csv('filtered.csv')
predict_ds = Dataset.from_csv('filtered.csv')

Using custom data configuration default-7aadb6a8563d5ef1


Downloading and preparing dataset csv/default to C:\Users\alexs\.cache\huggingface\datasets\csv\default-7aadb6a8563d5ef1\0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\alexs\.cache\huggingface\datasets\csv\default-7aadb6a8563d5ef1\0.0.0. Subsequent calls will reuse this data.


In [15]:
predict_ds = Dataset.from_pandas(df)

In [16]:
tokenized_predict = predict_ds.map(tokenize,batch_size=2,batched=True)

  0%|          | 0/461837 [00:00<?, ?ba/s]

In [17]:
prediction = trainer.predict(tokenized_predict)

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: created_at, id, text. If created_at, id, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 923673
  Batch size = 8


In [18]:
predicts = np.argmax(prediction.predictions, axis=-1)

In [21]:
df['predicted_label'] = predicts

In [23]:
df.to_csv('Labelled_old_data.csv')

In [32]:
predict_df.to_csv('Sentiment_Predicted.csv')