In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
def load_imdb_data(data_file):
    """Split a labelled table into parallel Python lists.

    Args:
        data_file: Despite the name, an already-loaded table (the notebook
            passes a pandas DataFrame) indexable by the column names
            ``'text'`` and ``'label'``, whose columns expose ``.tolist()``.

    Returns:
        A ``(texts, labels)`` tuple of lists, both in row order.
    """
    text_column = data_file['text']
    text_list = text_column.tolist()
    label_list = data_file['label'].tolist()
    return text_list, label_list

In [None]:
# Load the labelled corpus from the Kaggle input directory. Later cells read
# its 'text' and 'label' columns, so the CSV must provide both.
df = pd.read_csv("/kaggle/input/semval/output.csv")

In [None]:
# Class balance check: number of rows per distinct value of the 'label' column.
label_counts = df["label"].value_counts()

print(label_counts)

In [None]:
# `data_file` is just an alias for the DataFrame loaded above;
# load_imdb_data returns its 'text' and 'label' columns as two parallel lists.
data_file = df
texts, labels = load_imdb_data(data_file)

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of labels, parallel to ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=...,
            padding='max_length', truncation=True)``; its result is indexed
            with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors.

        The tokenizer output is flattened so a DataLoader can stack
        samples into a batch; the label is wrapped with ``torch.tensor``.
        """
        raw_text, raw_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            raw_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(raw_label)
        return sample

In [None]:
from transformers import RobertaModel

class RoBERTaClassifier(nn.Module):
    """Pre-trained RoBERTa encoder followed by dropout and a linear head.

    Args:
        roberta_model_name: Checkpoint identifier handed to
            ``RobertaModel.from_pretrained``.
        num_classes: Output width of the linear classification layer.

    Note:
        ``forward`` returns the logits tensor itself, not an output object
        with a ``.logits`` attribute.
    """

    def __init__(self, roberta_model_name, num_classes):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.roberta.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and classify from the encoder's pooled output."""
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoder_out.pooler_output))

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It may return either an object exposing ``.logits`` (e.g. a
            Hugging Face sequence-classification output) or the logits
            tensor directly (e.g. ``RoBERTaClassifier``).
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The mean cross-entropy loss over all batches as a float, or ``None``
        when ``data_loader`` yields no batches.
    """
    model.train()
    # The loss module is stateless, so build it once instead of per batch.
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    batch_count = 0

    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Accept both output objects with `.logits` and bare logits tensors.
        logits = getattr(outputs, 'logits', outputs)

        # Compute loss
        loss = criterion(logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        scheduler.step()

        loss_sum += loss.item()
        batch_count += 1

    return loss_sum / batch_count if batch_count else None


In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It may return either an object exposing ``.logits`` or the logits
            tensor directly.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(accuracy, report)`` where ``accuracy`` comes from
        ``accuracy_score`` and ``report`` from ``classification_report``,
        both computed over all batches.
    """
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Accept both output objects with `.logits` and bare logits tensors.
            logits = getattr(outputs, 'logits', outputs)

            # Predicted class = index of the largest logit in each row.
            preds = logits.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text as ``"Machine"`` or ``"Human"``.

    Args:
        text: Raw input string.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
            It may return either an object exposing ``.logits`` or the logits
            tensor directly.
        tokenizer: Callable invoked with ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``.
        device: Device the encoded tensors are moved to.
        max_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        ``"Machine"`` when the arg-max class is 1, otherwise ``"Human"``.
    """
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Accept both output objects with `.logits` and bare logits tensors.
        logits = getattr(outputs, 'logits', outputs)
        predicted_class = logits.argmax(dim=1).item()

    return "Machine" if predicted_class == 1 else "Human"


In [None]:
# Model selection and hyperparameters consumed by the cells below.
roberta_model_name = "roberta-base"  # checkpoint name passed to from_pretrained
num_classes = 2  # predict_sentiment maps class 1 to "Machine", anything else to "Human"
max_length = 128  # every text is padded/truncated to this many tokens
batch_size = 16
learning_rate = 1e-5
num_epochs = 1

In [None]:
# 80/20 train/validation split. A fixed random_state makes the split (and so
# the reported validation metrics) reproducible across kernel restarts, and
# stratifying on the labels keeps the class ratio the same in both halves.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=labels,
)



In [None]:
# Class balance of the training split: labels equal to 0 versus everything else.
is_zero_flags = [bool(label == 0) for _, label in zip(train_texts, train_labels)]
zero = sum(is_zero_flags)
one = len(is_zero_flags) - zero

print(zero)
print(one)

In [None]:
# Tokenizer matching the chosen checkpoint; the datasets call it lazily,
# one text per __getitem__, rather than tokenizing everything up front.
tokenizer = RobertaTokenizer.from_pretrained(roberta_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
# Only the training loader shuffles; validation keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The model trained below is Hugging Face's RobertaForSequenceClassification,
# not the RoBERTaClassifier class defined earlier in this notebook.
model = RobertaForSequenceClassification.from_pretrained(roberta_model_name, num_labels=num_classes).to(device)

In [None]:
# NOTE(review): this AdamW is the one imported from `transformers`, which is
# deprecated in favour of torch.optim.AdamW — confirm it still exists in the
# installed transformers version before relying on this cell.
optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False, no_deprecation_warning=True)
# The scheduler is stepped once per batch inside train(), so the schedule
# length is batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * num_epochs
# Linear decay over total_steps with no warmup phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Fine-tune for num_epochs epochs, printing validation accuracy and the
# per-class classification report after each one.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    print("done training")
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

In [None]:
# Persist only the weights (state_dict), not the model object itself, to a
# file in the current working directory.
torch.save(model.state_dict(), "robert_classifier.pth")

In [None]:
# Defining a function to load data: The load_imdb_data function takes an already-loaded pandas DataFrame, reads its ‘text’ and ‘label’ columns, and returns them as two lists.

# Loading and processing the data: You’re loading a CSV file into a pandas DataFrame. Then, you’re creating a new DataFrame by concatenating the first 50000 and the last 50000 rows of the original DataFrame. You’re also displaying the last 50 rows of the new DataFrame. After that, you’re printing the value counts of the ‘label’ column in the DataFrame. Finally, you’re loading the texts and labels from the DataFrame using the load_imdb_data function.

# Creating a custom Dataset class: You’re defining a custom Dataset class TextClassificationDataset for your text classification task. This class takes in texts, labels, a tokenizer, and a max_length as inputs. It tokenizes the texts and returns the input_ids, attention_mask, and label for each text.

# Creating a RoBERTa classifier: You’re defining a RoBERTa classifier, RoBERTaClassifier, which is a subclass of nn.Module. This classifier uses a pre-trained RoBERTa model followed by dropout and a linear layer for classification. The forward method of this class takes in input_ids and attention_mask, and returns the logits.

# Defining training and evaluation functions: You’re defining a train function to train your model and an evaluate function to evaluate your model’s performance. The train function takes in a model, a data loader, an optimizer, a scheduler, and a device as inputs, and trains the model. The evaluate function takes in a model, a data loader, and a device as inputs, and returns the accuracy score and the classification report.

# Defining a function for sentiment prediction: You’re defining a predict_sentiment function that takes a text, a model, a tokenizer, a device, and a max_length as inputs, and returns the predicted sentiment (“Machine” or “Human”) for the text.

# Setting parameters: You’re setting some parameters for your task, including the RoBERTa model name, the number of classes, the max_length for tokenization, the batch size, the learning rate, and the number of epochs.

# Splitting the data into training and validation sets: You’re splitting your texts and labels into training and validation sets using the train_test_split function from sklearn.

In [None]:
# Counting the labels: You’re counting the number of instances for each label in your training data.

# Tokenizing the texts: You’re initializing a tokenizer from the pre-trained RoBERTa model and wrapping your training and validation texts in datasets that use it to tokenize each text when it is accessed.

# Creating DataLoaders: You’re creating PyTorch DataLoaders for your training and validation datasets. These DataLoaders will be used to feed data into your model during training and evaluation.

# Setting up the device: You’re setting up the device (GPU if available, otherwise CPU) for training your model.

# Initializing the model: You’re initializing a pre-trained RobertaForSequenceClassification model and moving it to the device.

# Setting up the optimizer and scheduler: You’re setting up the AdamW optimizer with your model’s parameters and the learning rate. You’re also setting up a learning rate scheduler.

# Training and evaluating the model: You’re training your model for a certain number of epochs. After each epoch, you’re evaluating your model on the validation set and printing the validation accuracy and the classification report.

# Saving the model: You’re saving the state dictionary of your model to a file.

# Predicting the sentiment: You’re asking the user to enter a text, predicting the sentiment of this text using your trained model, and printing the predicted sentiment.