# In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers.trainer_callback import TrainerCallback, PrinterCallback

#
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

#
import torch

def read_data(data_path):
    """Load labels and tweet texts from a sentiment CSV file.

    The file is read with ``latin1`` encoding. The first column is used as
    the label and the last column as the tweet text. Label value ``4`` is
    mapped to ``1``; every other label value is left untouched. The unique
    label values (after remapping) are printed.

    Args:
        data_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(labels, tweets)`` tuple of arrays taken from the first and the
        last column respectively, in file order.
    """
    # NOTE(review): read_csv is called with its default header handling, so
    # the first row of the file is consumed as column names — confirm the
    # sampled file actually has a header row.
    sampled_data = pd.read_csv(data_path, encoding='latin1')

    # Remap on the extracted column and keep the returned Series. Calling
    # ``.replace(..., inplace=True)`` on ``df.iloc[:, 0]`` only mutates a
    # temporary object; pandas does not guarantee that the change reaches the
    # DataFrame (and it never does under copy-on-write), which would leave
    # 4s in the labels.
    label_column = sampled_data.iloc[:, 0].replace(4, 1)
    labels = label_column.values

    # Print all unique values in the (remapped) labels
    print("Unique labels:", label_column.unique())

    # The tweet text lives in the last column
    tweets = sampled_data.iloc[:, -1].values

    return labels, tweets

def split_into_train_dev_test(labels, tweets, test_size=0.1, dev_size=0.1, random_state=42):
    """Shuffle the data and partition it into train, dev and test subsets.

    Two successive ``train_test_split`` calls are made with the same
    ``random_state``: the first carves out the test set, the second carves
    the dev set out of what is left. ``dev_size`` is therefore applied to
    the remainder after the test split, not to the full input.

    Args:
        labels: Labels aligned with ``tweets``.
        tweets: Tweet texts.
        test_size: ``test_size`` passed to the first split.
        dev_size: ``test_size`` passed to the second split.
        random_state: Seed used for both splits.

    Returns:
        ``(train_tweets, dev_tweets, test_tweets,
        train_labels, dev_labels, test_labels)``.
    """
    # Options shared by both splits
    split_options = {'shuffle': True, 'random_state': random_state}

    # First pass: hold out the test portion
    remaining_tweets, test_tweets, remaining_labels, test_labels = train_test_split(
        tweets, labels, test_size=test_size, **split_options)

    # Second pass: hold out the dev portion from the remainder
    train_tweets, dev_tweets, train_labels, dev_labels = train_test_split(
        remaining_tweets, remaining_labels, test_size=dev_size, **split_options)

    return train_tweets, dev_tweets, test_tweets, train_labels, dev_labels, test_labels


def encode_using_bert_tokenizer(train_tweets, test_tweets, dev_tweets):
    """Tokenize the three tweet splits with the module-level ``tokenizer``.

    Every tweet is first converted with ``str()``. Each split is then
    tokenized in a single call with ``truncation=True`` and ``padding=True``,
    and the tokenizer output is converted to a plain ``dict``.

    Args:
        train_tweets: Iterable of training tweets.
        test_tweets: Iterable of test tweets.
        dev_tweets: Iterable of development tweets.

    Returns:
        ``(train_encodings, test_encodings, dev_encodings)`` — note the
        order: test comes before dev.
    """
    # Stringify up front, in the order train, test, dev
    text_splits = [
        [str(tweet) for tweet in split]
        for split in (train_tweets, test_tweets, dev_tweets)
    ]

    print("Attempting to tokenize...")
    raw_encodings = [
        tokenizer(texts, truncation=True, padding=True)
        for texts in text_splits
    ]

    # Convert the tokenizer outputs into ordinary dictionaries
    train_encodings, test_encodings, dev_encodings = [
        dict(encoding) for encoding in raw_encodings
    ]
    print("Successfully tokenized.")

    return train_encodings, test_encodings, dev_encodings





class PolarityDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` must be indexable by ``'input_ids'`` and
    ``'attention_mask'``, each of which is in turn indexable by sample
    position. ``labels`` is indexable by the same positions and defines the
    dataset length.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict has the keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``, in that order.
        """
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx]),
        }




from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report

class CustomPrinterCallback(TrainerCallback):
    """Trainer callback that prints a short progress report to stdout.

    It announces the start of training and, whenever the trainer emits a
    log record containing a training ``'loss'``, prints the current step and
    that loss. A training accuracy line is printed only when the record
    actually contains an ``'accuracy'`` entry.
    """

    def on_train_begin(self, args, state, control, **kwargs):
        """Announce that training has started."""
        print("Training begins!")

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the training metrics of the log record just emitted.

        ``on_train_batch_end`` is not an event dispatched through
        ``TrainerCallback``, so the report has to be produced from a real
        hook; ``on_log`` receives the freshly logged values via ``logs``.
        """
        self._print_train_metrics(state.global_step, logs)

    def on_train_batch_end(self, args, state, control, **kwargs):
        """Print the latest logged metrics on every ``logging_steps`` step.

        Kept so existing direct callers keep working. Unlike before, it no
        longer fails when the log history is empty or lacks a metric.
        """
        if state.global_step % args.logging_steps == 0 and state.log_history:
            self._print_train_metrics(state.global_step, state.log_history[-1])

    @staticmethod
    def _print_train_metrics(step, logs):
        """Print step, loss and (if present) accuracy from one log record."""
        # Records without a training loss (e.g. evaluation or end-of-training
        # summaries) are skipped instead of raising KeyError.
        if not logs or 'loss' not in logs:
            return
        print(f"Step {step}:")
        print(f"  Train loss: {logs['loss']}")
        if 'accuracy' in logs:
            print(f"  Train accuracy: {logs['accuracy'] * 100:.2f}%")

def train_and_evaluate(train_dataset, val_dataset, test_dataset):
    """Fine-tune DistilBERT on ``train_dataset`` and score it on the test set.

    A ``distilbert-base-uncased`` sequence-classification model is trained
    with a ``Trainer`` for three epochs, then used to predict
    ``test_dataset``; the arg-max class per sample is compared with the true
    labels.

    Args:
        train_dataset: Dataset used for training.
        val_dataset: Dataset handed to the ``Trainer`` as ``eval_dataset``.
            This function never calls ``trainer.evaluate`` itself.
        test_dataset: Labelled dataset scored after training.

    Returns:
        The dictionary produced by
        ``classification_report(..., output_dict=True)`` for the test set.

    Raises:
        ValueError: If the prediction output carries no labels for the
            test set.
    """
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=6,   # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
    )

    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=val_dataset,            # evaluation dataset
        callbacks=[CustomPrinterCallback()]  # custom printer callback
    )

    # Train the model
    trainer.train()

    # Evaluate on the test set
    predictions = trainer.predict(test_dataset)
    predicted_labels = predictions.predictions.argmax(axis=1)

    # Take the true labels from the prediction output. Indexing the dataset
    # with the string 'labels' does not work for a position-indexed
    # torch Dataset: its __getitem__ receives the string as a sample index.
    true_labels = predictions.label_ids
    if true_labels is None:
        raise ValueError("test_dataset must provide labels to compute the report")

    # Calculate metrics
    report = classification_report(true_labels, predicted_labels, output_dict=True)

    return report


# Driver for the DistilBERT fine-tuning run: load the CSV, split it, tokenize
# the splits, wrap them in datasets, then train and print the test report.
#
# `path` is a placeholder and must be replaced with the real CSV location.
# Per the original note, the file is a ~10K-row sample drawn from a larger
# dataset; point `path` at a different sample to run the other trainings.
path = 'path to data'
labels, tweets = read_data(path)
# Split into train, dev and test sets (default sizes and seed).
train_tweets, dev_tweets, test_tweets, train_labels, dev_labels, test_labels = split_into_train_dev_test(labels, tweets)

# Encode with the BERT tokenizer. Note the argument and return order of this
# helper is (train, test, dev), unlike the split above.
train_encodings, test_encodings, dev_encodings = encode_using_bert_tokenizer(train_tweets, test_tweets, dev_tweets)

# Create one dataset per split, pairing each encoding with its labels.
train_dataset = PolarityDataset(train_encodings, train_labels)
dev_dataset = PolarityDataset(dev_encodings, dev_labels)
test_dataset = PolarityDataset(test_encodings, test_labels)


# Train, evaluate on the test set, and print the resulting report dictionary.
report = train_and_evaluate(train_dataset, dev_dataset, test_dataset)
print(report)



# In [None]:
from transformers import pipeline

# Standalone demo: this cell uses none of the names defined by the
# fine-tuning code above.
#
# Build a "sentiment-analysis" pipeline backed by the
# `cardiffnlp/twitter-roberta-base-sentiment` checkpoint (described by the
# original author as fine-tuned for sentiment analysis on Twitter data).
sentiment_analysis = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

# Example tweet
tweet = "susan boyle come second place! she deserve to win!<3 😉"

# Run the pipeline on the single example tweet
result = sentiment_analysis(tweet)

# Print the raw pipeline output
print(result)