In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline
from datasets import load_dataset
import numpy as np
from transformers import TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
#from googletrans import Translator

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

In [6]:
from datasets import load_dataset
dataset = load_dataset("stanfordnlp/imdb")

In [None]:
dataset["train"][1000]

In [None]:
print(dataset)

#For preprocessin step, first I defined stop_words. Then I write code for preprocessing.

In [7]:
stop_words = set(stopwords.words('english'))

In [8]:
def preprocess_text(text):
    words = word_tokenize(text)
    words = [word for word in words if word.isalnum() and word not in stop_words]
    return ' '.join(words)

dataset = dataset.map(lambda x: {'text': preprocess_text(x['text'])})

train_data, val_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2, random_state=42)

#I used a pre-trained BERT model (bert-base-multilingual-cased) for sequence classification. The model is fine-tuned on the dataset. And I applied it below.

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

#I splitted dataset as train and test

In [10]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = dataset['train'].map(tokenize_function, batched=True)
val_dataset = dataset['test'].map(tokenize_function, batched=True)

In [None]:
training_args = TrainingArguments(
    output_dir='./results_nlp',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
trainer.train()

#After here, I loaded sentimentDataset as csv file.

In [13]:
data = pd.read_csv('sentimentDataset.csv', encoding='latin-1', header=None)
data.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

In [15]:
import torch

In [31]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.labels = dataset['target'].values
        self.texts = dataset['tokens'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.texts[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item

#Since the dataset is very large, I used 10000 samples to avoid problems in the runtime. From now on dataset is as subset_data.

In [32]:
subset_data = data.sample(n=10000, random_state=42)

In [None]:
print(subset_data.columns)
print(subset_data.head())

In [34]:
subset_data['label'] = subset_data['target'].map({0: 0, 4: 1})

In [35]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [36]:
stop_words = set(nltk.corpus.stopwords.words('english'))

In [37]:
subset_data['tokens'] = subset_data['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

In [None]:
print("Columns in Sentiment140 dataset:", subset_data.head)

In [None]:
missing_values = subset_data.isnull()
print(missing_values.all())

In [40]:
from accelerate import Accelerator
from transformers import Trainer

In [41]:
test_dataset = SentimentDataset(subset_data)

In [42]:
from transformers import DataCollatorForLanguageModeling

In [43]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    #data_collator=data_collator,
    #accelerator=accelerator
)

#I run all steps from here and, here is the prediction step.

In [44]:
predictions = trainer.predict(test_dataset)

100%|██████████| 1250/1250 [01:29<00:00, 14.03it/s]


In [45]:
preds = predictions.predictions.argmax(-1)

In [46]:
labels = predictions.label_ids

#Calculating accuracy of model

In [47]:
accuracy = accuracy_score(labels, preds)

#Calculating precision, recall and F1 score of model

In [None]:
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')

#I printed following model and found that 
Accuracy: 0.3652
Precision: 0.2459239402503028
Recall: 0.3652
F1 Score: 0.293922123039807

In [None]:
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')