In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CosineSimilarityLoss

from datasets import Dataset, load_dataset
from setfit import SetFitModel, SetFitTrainer

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import os

In [2]:
pwd

'/Users/siddharthmandgi/Desktop/Data-Science-Universe/Projects/Twitter-Sentiment-Analysis'

# Data

### 1. Training Data

In [3]:
train = pd.read_csv("./twitter_training.csv", header=None)
train.rename(columns={0:'tweet_id', 1:'entity', 2:'sentiment', 3:'tweet'},inplace=True)
train = train.dropna().reset_index(drop=True)
train_id = train['tweet_id']
train['sentiment_label'] = train['sentiment'].astype('category').cat.codes
train = train[['tweet', 'sentiment']]
train

Unnamed: 0,tweet,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive
...,...,...
73991,Just realized that the Windows partition of my...,Positive
73992,Just realized that my Mac window partition is ...,Positive
73993,Just realized the windows partition of my Mac ...,Positive
73994,Just realized between the windows partition of...,Positive


### 2. Validation Data

In [4]:
val = pd.read_csv("./twitter_validation.csv", header=None)
val.rename(columns={0:'tweet_id', 1:'entity', 2:'sentiment', 3:'tweet'},inplace=True)
val_id = val['tweet_id']
val = val[['tweet', 'sentiment']]
val['sentiment_label'] = val['sentiment'].astype('category').cat.codes
val

Unnamed: 0,tweet,sentiment,sentiment_label
0,I mentioned on Facebook that I was struggling ...,Irrelevant,0
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral,2
2,@Microsoft Why do I pay for WORD when it funct...,Negative,1
3,"CSGO matchmaking is so full of closet hacking,...",Negative,1
4,Now the President is slapping Americans in the...,Neutral,2
...,...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant,0
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant,0
997,Today sucked so it’s time to drink wine n play...,Positive,3
998,Bought a fraction of Microsoft today. Small wins.,Positive,3


# Sentence Transformer Model

In [5]:
model = SentenceTransformer('paraphrase-mpnet-base-v2')

### 1. Train embeddings

In [6]:
train_embeddings = model.encode(train['tweet'])
train_embeddings


KeyboardInterrupt



In [None]:
train_embeddings_dataframe = pd.DataFrame(train_embeddings)
train_embeddings_dataframe['tweet_id'] = train_id
train_embeddings_dataframe

In [None]:
train_embeddings_dataframe.to_csv("train_embeddings.csv", index=False)

### 2. Validation embeddings

In [None]:
val_embeddings = model.encode(val['tweet'])
val_embeddings

In [None]:
val_embeddings_dataframe = pd.DataFrame(val_embeddings)
val_embeddings_dataframe['tweet_id'] = val_id
val_embeddings_dataframe

In [None]:
val_embeddings_dataframe.to_csv("val_embeddings.csv", index=False)

### 3. Classification Head 
- Run from Here

In [None]:
train_embeddings_dataframe = pd.read_csv("train_embeddings.csv")
val_embeddings_dataframe = pd.read_csv("val_embeddings.csv")

In [None]:
X_train = train_embeddings_dataframe.drop(['tweet_id'],axis=1)
y_train = train['sentiment_label']

X_val = val_embeddings_dataframe.drop(['tweet_id'],axis=1)
y_val = val['sentiment_label']

In [None]:
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

In [None]:
print(classification_report(y_val, classifier.predict(X_val)))

# Finetuning with SetFit on unique tweet ids (few samples)
- 100 Samples of each sentiment of unique tweet ids

In [None]:
np.random.seed(42)

train_unique = train
train_unique['tweet_id'] = train_id



train_unique = train_unique.sample(frac=1.0)

# get first of every tweet in tweet_id
train_unique = train_unique.groupby("tweet_id").head(1000)

# Take only 100 samples of each class (sentiment)
train_unique = train_unique.groupby("sentiment").head(1000)

train_unique

### 1. Preparing Torch Dataset

In [None]:
my_dict = {'sentence':train_unique['tweet'], 'label': train_unique['sentiment_label']}

train_dataset = Dataset.from_dict(my_dict)
train_dataset

In [None]:
my_dict = {'sentence':val['tweet'], 'label': val['sentiment_label']}

val_dataset = Dataset.from_dict(my_dict)
val_dataset

### 2. Load Finetune Trainer

In [None]:
# Load a SetFit model from Hub
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    num_iterations=20, # The number of text pairs to generate for contrastive learning
    num_epochs=1, # The number of epochs to use for contrastive learning
    column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
)

# Train and evaluate
trainer.train()

### 3. Save Model

In [None]:
path = "./saved_models"
isExist = os.path.exists(path)
if not isExist:

   # Create a new directory because it does not exist
   os.makedirs(path)
    
    
trainer.model._save_pretrained("/saved_models/finetuned_paraphrase-mpnet-base-v2")

### 4. Metrics

In [None]:
saved_model = SetFitModel._from_pretrained("finetuned_paraphrase-mpnet-base-v2")

In [None]:
train_embeddings = saved_model.model_body.encode(train['tweet'])
train_embeddings_dataframe = pd.DataFrame(train_embeddings)
train_embeddings_dataframe['tweet_id'] = train_id

val_embeddings = saved_model.model_body.encode(val['tweet'])
val_embeddings_dataframe = pd.DataFrame(val_embeddings)
val_embeddings_dataframe['tweet_id'] = val_id

In [None]:
train_embeddings_dataframe.to_csv("finetuned_train_embeddings.csv", index=False)
val_embeddings_dataframe.to_csv("finetuned_val_embeddings.csv", index=False)

In [None]:
X_train = train_embeddings_dataframe.drop(['tweet_id'],axis=1)
y_train = train['sentiment_label']
X_val = val_embeddings_dataframe.drop(['tweet_id'],axis=1)
y_val = val['sentiment_label']
            
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
            
print(classification_report(y_val, classifier.predict(X_val)))