In [None]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os

In [None]:
import pandas as pd
import numpy as np

In [None]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED``, and configures
    cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; harmless without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick algorithms per run, which can
    # change results between runs, so it has to be off for reproducibility.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

In [None]:
# Base location that later cells combine with the spreadsheet name and the
# model output directory name.
PATH = '.'

# Checkpoint identifier handed to CrossEncoder when the model is created.
model_name = "SALT-NLP/FLANG-Roberta"

In [None]:
def _prepare_pairs(frame):
    """Return `frame` reduced to the sentence1 / sentence2 / label / idx columns.

    Each cleaned tweet text is extended with a single space followed by the
    matching ``claude_*_cleaned`` column, the datetime columns are dropped and
    the remaining columns are renamed to the names the later cells expect.
    """
    pairs = frame.reset_index(drop=True).drop(['Datetime_x', 'Datetime_y'], axis=1)
    for side in ('x', 'y'):
        text_col = 'cleaned_tweet_text_' + side
        pairs[text_col] = pairs[text_col] + ' ' + pairs['claude_' + side + '_cleaned']
    return pairs[['cleaned_tweet_text_x', 'cleaned_tweet_text_y', 'retweet_count_x_more_y', 'idx']].rename(
        columns={
            'cleaned_tweet_text_x': 'sentence1',
            'cleaned_tweet_text_y': 'sentence2',
            'retweet_count_x_more_y': 'label',
        }
    ).copy()


# Join with os.path.join: plain `PATH + name` with PATH == "." would produce
# ".CRED_with_tweets.xlsx" (a hidden file name) instead of "./CRED_with_tweets.xlsx".
data = pd.read_excel(os.path.join(PATH, "CRED_with_tweets.xlsx"), dtype={'retweet_count_x_more_y': np.int32})

# Keep the original row position so rows can be traced back after the split.
data['idx'] = data.index

# Temporal split: a pair goes to train when both tweets are dated on or before
# dt1 and to test when both are after it; pairs straddling dt1 are in neither.
dt1 = "2022-04-30"
df_train = data[(data['Datetime_x'] <= dt1) & (data['Datetime_y'] <= dt1)]
df_test = data[(data['Datetime_x'] > dt1) & (data['Datetime_y'] > dt1)]

# Directory the fitted cross-encoder is written to.
model_save_path = os.path.join(PATH, 'crossencoder_whitehouse_flangroberta_snli_sbert_with_claude')

df_train = _prepare_pairs(df_train)
df_test = _prepare_pairs(df_test)

In [None]:
def _to_examples(frame):
    """Build one InputExample per row from the sentence1, sentence2 and label columns."""
    return [
        InputExample(texts=[first, second], label=target)
        for first, second, target in zip(frame['sentence1'], frame['sentence2'], frame['label'])
    ]


# Count of distinct label values in the training split; passed to CrossEncoder
# as `num_labels` when the model is created.
num_labels = df_train['label'].nunique()

# Training examples come from df_train; the dev examples used for evaluation
# during training come from df_test.
train_samples = _to_examples(df_train)
dev_samples = _to_examples(df_test)

In [None]:
# Training hyper-parameters.
train_batch_size = 16
num_epochs = 20

# Cross-encoder initialised from `model_name`, with `num_labels` outputs
# (the number of distinct labels found in the training split).
model = CrossEncoder(model_name, num_labels=num_labels)

# Wrap train_samples, a list of InputExample, in a shuffling PyTorch DataLoader.
train_dataloader = DataLoader(train_samples, batch_size=train_batch_size, shuffle=True)

# CESoftmaxAccuracyEvaluator measures accuracy on dev_samples during training.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples, name='cross_encoder_flangroberta')

# Warm-up length: 10% of the total number of training batches
# (batches per epoch * number of epochs), rounded up.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)

# Train the model; the output is written to model_save_path.
fit_options = dict(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=10000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)
model.fit(**fit_options)


In [None]:
# Pair the two sentence columns row by row and score every pair with the model.
test_pairs = list(zip(df_test['sentence1'].tolist(), df_test['sentence2'].tolist()))
scores = model.predict(test_pairs)

# Convert scores to labels: the column index of the largest score in each row.
predicted_labels = list(scores.argmax(axis=1))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Reference labels, extracted once and shared by both metrics.
true_labels = df_test['label'].tolist()

# Print accuracy first, then F1 (f1_score with its default settings).
for metric in (accuracy_score, f1_score):
    print(metric(true_labels, predicted_labels))

0.7194719471947195
0.7310126582278481
