In [None]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch


In [None]:
# Read the training CSV; the file is decoded as latin-1.
df = pd.read_csv(
    filepath_or_buffer='path/to/train.csv',
    encoding='latin-1',
)


In [None]:
# Split the frame by the value of its 'sentiment' column and report how many
# rows each class has.
positive_samples = df.loc[df['sentiment'].eq('positive')]
negative_samples = df.loc[df['sentiment'].eq('negative')]

print(f'Positive samples: {len(positive_samples)}')
print(f'Negative samples: {len(negative_samples)}')


Positive samples: 10500
Negative samples: 12498


In [None]:
# Draw the same number of rows (10000) from each class with a fixed seed,
# positives first, so the combined frame is class-balanced and reproducible.
balanced_df = pd.concat([
    class_subset.sample(n=10000, random_state=42)
    for class_subset in (positive_samples, negative_samples)
])

# Features are the raw text; targets are the string sentiment labels.
X = balanced_df['text']
y = balanced_df['sentiment']

# Hold out 20% for validation, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Load the tokenizer published with the "vinai/bertweet-base" checkpoint.
# NOTE(review): the BERTweet documentation describes a `normalization=True`
# tokenizer option intended for raw tweets — confirm whether this dataset's
# text needs it.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")


In [None]:
from datasets import Dataset

# Pair each text with its string sentiment label for the two splits.
train_data = pd.DataFrame({'text': X_train, 'label': y_train})
val_data = pd.DataFrame({'text': X_val, 'label': y_val})

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Fit the encoder ONCE, on all training labels. Fitting inside a per-example
# map re-learns the classes from a single label on every call, so every
# training label encodes to 0 and the encoder ends up knowing only the last
# label it saw (making the validation transform fail or also return 0).
le = LabelEncoder()
le.fit(y_train)


def encode_labels(examples):
    """Return an integer 'labels' column for a batch.

    `examples['label']` is the batch's list of string labels; they are mapped
    to ids with the already-fitted encoder `le` (never re-fitted here).
    """
    return {'labels': le.transform(examples['label'])}


# The same fitted encoder is applied to both splits so label ids agree.
train_dataset = train_dataset.map(encode_labels, batched=True)
val_dataset = val_dataset.map(encode_labels, batched=True)


In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' entries of `examples` with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call returns.
    """
    encoded = tokenizer(
        examples['text'],
        truncation=True,
        max_length=128,
        padding='max_length',
    )
    return encoded

# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Expose only the listed columns, formatted as torch tensors, on each split.
for dataset_split in (train_dataset, val_dataset):
    dataset_split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the "vinai/bertweet-base" checkpoint for sequence classification,
# configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2)


In [None]:
training_args = TrainingArguments(
    # Output locations.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=12,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log every 10 steps; evaluate and save once per epoch, and reload the
    # best checkpoint when training ends.
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # No external experiment-tracking integration.
    report_to='none',
)

def _accuracy(p):
    """Return {'accuracy': ...}: the fraction of argmax predictions equal to the label ids."""
    predicted_ids = p.predictions.argmax(-1)
    return {'accuracy': (predicted_ids == p.label_ids).mean()}


# Wire the model, arguments, both splits and the accuracy metric together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=_accuracy,
)


In [None]:
# Start training with the Trainer configured above in this notebook's
# `trainer` variable.
trainer.train()
