# Import And Install Modules:

In [None]:
# this notebook is run in google colab

In [None]:
'''
from google.colab import files
files.upload()
'''

In [None]:
# connect to google-drive
def conn_to_drive():
  """Mount Google Drive into the Colab runtime at ``/content/drive``."""
  # The import lives inside the function, so google.colab is only needed when this helper runs.
  from google.colab import drive
  drive.mount('/content/drive')
# Mount right away: load_weights/save_weights read and write below /content/drive.
conn_to_drive()
'''
'''

In [None]:
%%capture
!pip3 install transformers==4.26.1
# NOTE(review): pytorch-lightning is the only unpinned package in this cell; pin it to the version the notebook was developed with so reruns stay reproducible.
!pip3 install pytorch-lightning
!pip3 install sqlalchemy==2.0.21
!pip3 install mysqlclient==2.2.0
!pip3 install pandas==2.1.0

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from sqlalchemy import create_engine
from urllib.parse import quote_plus

from transformers import AutoTokenizer

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sns

# Load And Prepare Training Data:

In [None]:
def conn_to_db(db):
  """Create a SQLAlchemy engine for the project database.

  NOTE(review): in the visible (redacted) code neither ``db`` nor ``encoded``
  is used -- presumably the original URL interpolated both; confirm against
  the unredacted notebook. Credentials should be read from the environment
  rather than written as literals in the notebook.

  Returns:
    The engine produced by ``create_engine(db_url)``.
  """
  password = "redacted"
  # quote_plus URL-escapes the password so it can be embedded in a connection URL.
  encoded = quote_plus(password)
  db_url = "redacted"
  engine = create_engine(db_url)
  return engine

def download_df(db, table):
  """Read the complete SQL table ``table`` from database ``db`` into a DataFrame."""
  engine = conn_to_db(db)
  frame = pd.read_sql_table(table_name=table, con=engine)
  return frame

def upload_to_db(db, table, df):
    """Write ``df`` to ``table`` in database ``db``, replacing the table if it exists."""
    engine = conn_to_db(db)
    df.to_sql(name=table, con=engine, if_exists='replace')

def append_to_db(db, table, df, delete):
    """Append the rows of ``df`` to ``table`` in database ``db``.

    NOTE(review): ``delete`` is accepted but never read by this function.
    """
    engine = conn_to_db(db)
    df.to_sql(name=table, con=engine, if_exists='append')

In [None]:
# One entry per corpus; both frames start out empty and are filled by a later cell.
data_sources = {
    source: {"data_pos": None, "data_neg": None}
    for source in ("longbets", "horizons", "chatgpt", "news")
}

In [None]:
# fill data sources
for data_source_name, data_source in data_sources.items():
  # Positive examples: drop the stored 'index' column and tag every row with label 1.
  pos_df = (
      download_df("predictions", f"{data_source_name}_pos")
      .drop(columns=['index'])
      .assign(label=1)
  )
  data_source["data_pos"] = pos_df

  # Negative examples: same treatment, tagged with label 0.
  neg_df = (
      download_df("predictions", f"{data_source_name}_neg")
      .drop(columns=['index'])
      .assign(label=0)
  )
  data_source["data_neg"] = neg_df

In [None]:
# equalize data sources
for data_source in data_sources.values():
    # Cut both classes down to the size of the smaller one (keeping the first rows).
    min_len = min(len(data_source["data_pos"]), len(data_source["data_neg"]))
    for key in ("data_pos", "data_neg"):
        data_source[key] = data_source[key].head(min_len)

In [None]:
# visualize data sources
# Per-source row counts for both classes, in the order of data_sources.
x = list(data_sources)
y1 = [len(data_source["data_pos"]) for data_source in data_sources.values()]
y2 = [len(data_source["data_neg"]) for data_source in data_sources.values()]

fig, ax = plt.subplots()

width = 0.4
# Explicit numeric positions: avoids mixing categorical and numeric x values
# on the same axis and lets each tick label sit centred under its bar pair.
positions = np.arange(len(x))

ax.bar(positions - width / 2, y1, width, label='POSITIVES', color='g')
ax.bar(positions + width / 2, y2, width, label='NEGATIVES', color='r')

ax.set_xticks(positions)
ax.set_xticklabels(x)
ax.set_xlabel('Dataset')
ax.set_ylabel('Rows')
ax.set_title('Rows per data source and class')
ax.legend()

plt.show()

In [None]:
# concat positives and negatives
# For every source take its positive frame, then its negative frame, and stack them all.
frames = [
    data_source[key]
    for data_source in data_sources.values()
    for key in ("data_pos", "data_neg")
]
train_data = pd.concat(frames, ignore_index=True)

In [None]:
# export whole dataset
'''
output_data = train_data.loc[:, ['sentence', 'label']]
output_data.to_csv('dataset.csv', index=False)
'''

# Define Dataset

In [None]:
import torch
from torch.utils.data import Dataset

class Pred_Dataset(Dataset):
  """Map-style dataset that tokenizes one sentence per row of a DataFrame.

  Args:
    data: frame accessed through ``.iloc``; rows expose ``sentence`` and ``label``.
    tokenizer: object providing ``encode_plus``.
    max_token_len: value passed as ``max_length`` (with ``padding='max_length'``
      and ``truncation=True``) to the tokenizer.
  """

  def __init__(self, data, tokenizer, max_token_len: int = 64):
    self.data, self.tokenizer, self.max_token_len = data, tokenizer, max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    row = self.data.iloc[index]
    encoding = self.tokenizer.encode_plus(
        row.sentence,
        add_special_tokens=True,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=self.max_token_len,
        return_attention_mask=True,
    )
    # flatten() collapses the returned tensors to 1-D before they are batched.
    sample = {
        'input_ids': encoding.input_ids.flatten(),
        'attention_mask': encoding.attention_mask.flatten(),
        'label': row.label,
    }
    return sample


# Define Data Module

In [None]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class Predict_Data_Module(pl.LightningDataModule):
  """LightningDataModule wrapping the train / validation / prediction frames.

  Args:
    train_ds, valid_ds, test_ds: DataFrames handed to ``Pred_Dataset``.
    batch_size: batch size of the train and validation loaders.
    max_token_length: token length forwarded to every ``Pred_Dataset``.
      The default is 64 because that is the length that was effectively in
      use before: the value used to be stored but never forwarded, so
      ``Pred_Dataset`` always fell back to its own default of 64.
    model_name: checkpoint name used to load the tokenizer.
  """

  def __init__(self, train_ds, valid_ds, test_ds, batch_size: int = 16, max_token_length: int = 64, model_name='roberta-base'):
    super().__init__()
    self.train_ds = train_ds
    self.valid_ds = valid_ds
    self.test_ds = test_ds
    self.batch_size = batch_size
    self.max_token_length = max_token_length
    self.model_name = model_name
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)

  def _make_dataset(self, frame):
    # Single place where tokenizer and token length are wired into a dataset.
    return Pred_Dataset(frame, tokenizer=self.tokenizer, max_token_len=self.max_token_length)

  def setup(self, stage = None):
    """Create the datasets needed for ``stage`` (``None``/"fit" or "predict")."""
    if stage in (None, "fit"):
      self.train_dataset = self._make_dataset(self.train_ds)
      self.valid_dataset = self._make_dataset(self.valid_ds)
    if stage == 'predict':
      self.test_dataset = self._make_dataset(self.test_ds)

  def train_dataloader(self):
    # drop_last=True: an incomplete final batch is skipped during training.
    return DataLoader(self.train_dataset, batch_size = self.batch_size, num_workers=4, shuffle=True, drop_last=True)

  def val_dataloader(self):
    return DataLoader(self.valid_dataset, batch_size = self.batch_size, num_workers=4, shuffle=False)

  def predict_dataloader(self):
    # Predictions are produced one sentence per batch, in input order.
    return DataLoader(self.test_dataset, batch_size = 1, num_workers=1, shuffle=False)

In [None]:
def prepare_lightning_module(train_ds, valid_ds, test_ds, batch_size):
  """Build a ``Predict_Data_Module`` for the three frames and run its ``setup()``."""
  module = Predict_Data_Module(train_ds, valid_ds, test_ds, batch_size=batch_size)
  module.setup()
  return module

# Define Model

In [None]:
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn
import math
from pytorch_lightning.callbacks import EarlyStopping
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

class Pred_Sentence_Classifier(pl.LightningModule):
  """Pretrained transformer encoder followed by a small MLP classification head.

  ``config`` keys read by this class: ``model_name``, ``n_labels``, ``lr``,
  ``weight_decay``, ``train_size``, ``batch_size`` and ``warmup``.
  """

  def __init__(self, config: dict):
    super().__init__()
    self.config = config
    self.pretrained_model = AutoModel.from_pretrained(config['model_name'], return_dict = True)
    self.hidden = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
    self.classifier = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.config['n_labels'])
    torch.nn.init.xavier_uniform_(self.classifier.weight)
    self.loss_func = nn.BCEWithLogitsLoss()
    self.dropout = nn.Dropout()

  def forward(self, input_ids, attention_mask, label=None):
    """Return ``(loss, logits)``; ``loss`` is 0 when no ``label`` is given."""
    # roberta layer
    output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
    # Mean over dim 1 of last_hidden_state; attention_mask is not applied to this average.
    pooled_output = torch.mean(output.last_hidden_state, 1)
    # final logits
    pooled_output = self.dropout(pooled_output)
    pooled_output = self.hidden(pooled_output)
    pooled_output = F.relu(pooled_output)
    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    # calculate loss
    loss = 0
    if label is not None:
        label = label.unsqueeze(dim=0)
        # Both operands are reshaped to (-1, n_labels) so logits and targets line up.
        loss = self.loss_func(logits.view(-1, self.config['n_labels']), label.float().view(-1, self.config['n_labels']))
    return loss, logits

  def training_step(self, batch, batch_index):
    loss, outputs = self(**batch)
    self.log("train loss ", loss, prog_bar = True, logger=True)
    return {"loss":loss, "predictions":outputs, "label": batch["label"]}

  def validation_step(self, batch, batch_index):
    loss, outputs = self(**batch)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return {"val_loss": loss, "predictions":outputs, "label": batch["label"]}

  def predict_step(self, batch, batch_index):
    loss, outputs = self(**batch)
    return outputs

  # The two epoch hooks below used to be nested inside predict_step after its
  # return statement, so they were never defined as methods. They now live at
  # class level and use the train-epoch hook names, because the generic
  # on_epoch_start / on_epoch_end hooks were removed from Lightning in 1.8.
  def on_train_epoch_start(self):
    print(f"Starting epoch {self.current_epoch + 1}")

  def on_train_epoch_end(self):
    print(f"Finished epoch {self.current_epoch + 1}")

  def configure_optimizers(self):
    """Return AdamW plus a cosine schedule with linear warm-up."""
    optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay'])
    # NOTE(review): prepare_config sets 'train_size' to len(train_dataloader()),
    # which is already a batch count; dividing it by 'batch_size' again (and not
    # accounting for the number of epochs) looks unintended. Left unchanged here
    # because altering it would change training behaviour -- please confirm.
    total_steps = self.config['train_size']/self.config['batch_size']
    warmup_steps = math.floor(total_steps * self.config['warmup'])
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    return [optimizer],[scheduler]

In [None]:
def prepare_config(data_module):
  """Assemble the hyper-parameter dict used by the classifier and the trainer.

  ``train_size`` is ``len(data_module.train_dataloader())``; every other value
  is a fixed constant.
  """
  n_train_batches = len(data_module.train_dataloader())
  return dict(
      model_name='distilroberta-base',
      n_labels=1,
      batch_size=64,
      lr=1.5e-6,
      warmup=0.2,
      train_size=n_train_batches,
      weight_decay=0.001,
      n_epochs=2,  # 15
  )

In [None]:
# 4 normal
# 5 without will, with ref
# 6 without will, with ref with a lot of after in negatives
# 10 with new news data (latest)

def load_weights(classifier, version):
  """Load the fine-tuned weights stored under ``version`` into ``classifier``.

  Args:
    classifier: module whose ``load_state_dict`` receives the checkpoint.
    version: identifier that prefixes the weight file name.

  Returns:
    The same ``classifier`` object, now holding the loaded weights.
  """
  weights_path = f"/content/drive/My Drive/Colab Notebooks/chronicle2050/weights/{version}_finetuned_roberta_weights.pt"
  # map_location="cpu" lets a checkpoint written on a GPU runtime be read on a
  # CPU-only runtime; load_state_dict then copies the values into the
  # classifier's own parameters, wherever those live.
  state_dict = torch.load(weights_path, map_location="cpu")
  classifier.load_state_dict(state_dict)
  return classifier

In [None]:
def save_weights(classifier, version):
  """Save ``classifier``'s state dict to the Drive weights folder under ``version``."""
  state = classifier.state_dict()
  target = f"/content/drive/My Drive/Colab Notebooks/chronicle2050/weights/{version}_finetuned_roberta_weights.pt"
  torch.save(state, target)

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin

def reset_early_stopping(early_stopping_callback):
    """Reset the callback's wait/stop counters and best score, and set patience to 3."""
    reset_values = {
        'wait_count': 0,
        'stopped_epoch': 0,
        'best_score': None,
        'patience': 3,
    }
    for attribute, value in reset_values.items():
        setattr(early_stopping_callback, attribute, value)

class NNClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around ``Pred_Sentence_Classifier``.

    Args:
        test_size: share of the data that ``fit`` holds out for validation.
        seed: random state of that train/validation split.
        store_version: if truthy, ``fit`` saves the trained weights under this version.
        load_version: if truthy, ``predict`` loads the weights of this version first.
    """

    def __init__(self, test_size, seed, store_version, load_version):
        self.test_size = test_size
        self.seed = seed
        self.store_version = store_version
        self.load_version = load_version
        self.nn_class_ = None
        # Created here but not handed to any Trainer below.
        self.early_stop_callback = EarlyStopping(
            monitor='val_loss',
            min_delta=0.001,
            patience=3,
            verbose=True,
            mode='min',
        )

    def fit(self, X:np.ndarray, y:np.array):
        """Split ``X``/``y`` into train and validation parts, train a new model, return ``self``."""
        sentences_train, sentences_valid, labels_train, labels_valid = train_test_split(
            X, y, test_size=self.test_size, random_state=self.seed)
        self.train_data = pd.DataFrame({'sentence': sentences_train, 'label': labels_train})
        self.valid_data = pd.DataFrame({'sentence': sentences_valid, 'label': labels_valid})
        # Single placeholder row: the data module expects a prediction frame as well.
        self.test_data = pd.DataFrame({'sentence': [''], 'label': [0]})

        self.data_module = prepare_lightning_module(self.train_data, self.valid_data, self.test_data, 8)
        self.config = prepare_config(self.data_module)
        model = Pred_Sentence_Classifier(self.config)

        #self.trainer = pl.Trainer(max_epochs=config['n_epochs'], min_epochs=5, num_sanity_val_steps=50, callbacks=[self.early_stop_callback])
        self.trainer = pl.Trainer(
            max_epochs=self.config['n_epochs'],
            num_sanity_val_steps=50,
            val_check_interval=0.05,
        )
        self.trainer.fit(model, self.data_module)
        if self.store_version:
            save_weights(model, self.store_version)

        self.nn_class_ = model
        return self

    def predict(self, X):
        """Return the sigmoid of the model output for every sentence in ``X``.

        Raises:
            ValueError: if no model has been fitted and ``load_version`` is None.
        """
        untrained = self.nn_class_ is None
        if untrained and self.load_version is None:
            raise ValueError("The model has not been trained yet and there are no weights to be fetched!")

        #reset_early_stopping(self.early_stop_callback)
        # Train/validation frames are placeholders; only the prediction frame carries data.
        placeholder = {'sentence': [''], 'label': [0]}
        self.train_data = pd.DataFrame(placeholder)
        self.valid_data = pd.DataFrame(placeholder)
        self.test_data = pd.DataFrame({'sentence': X, 'label': np.zeros(len(X))})
        self.data_module = prepare_lightning_module(self.train_data, self.valid_data, self.test_data, 8)
        if untrained:
            self.config = prepare_config(self.data_module)
            self.nn_class_ = Pred_Sentence_Classifier(self.config)
        if self.load_version:
            self.nn_class_ = load_weights(self.nn_class_, self.load_version)

        self.trainer = pl.Trainer(
            max_epochs=self.config['n_epochs'],
            num_sanity_val_steps=50,
            val_check_interval=0.05,
        )
        predictions = self.trainer.predict(self.nn_class_, datamodule=self.data_module)
        # Un-batch the outputs and squash each one through a sigmoid.
        probabilities = []
        for batch in predictions:
            for p in batch:
                probabilities.append(torch.sigmoid(torch.Tensor(p)))
        flattened_predictions = np.stack(probabilities)
        #flattened_predictions = np.round(flattened_predictions)
        return flattened_predictions

    def score(self, X, y):
        """Placeholder: no score is computed, ``None`` is always returned."""
        return None


# Train/Test Model

In [None]:
'''
!rm -r ./lightning_logs/
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs/
'''

In [None]:
# TRAIN-TEST-SPLIT

TEST_SIZE = 0.2
SEED = 442
DEBUG_ROWS = 30

# Small subset for debugging purposes. train_data is a concatenation of the
# per-source positive and negative frames, so its first rows all stem from the
# first frame(s) and are likely to share one label; a seeded random sample
# keeps the subset reproducible while allowing both classes to appear.
train_data_min = train_data.sample(n=min(DEBUG_ROWS, len(train_data)), random_state=SEED)
X_train, X_test, Y_train, Y_test = train_test_split(train_data_min['sentence'].values,
                                                    train_data_min['label'].values,
                                                    test_size=TEST_SIZE,
                                                    random_state=SEED)

In [None]:
# TRAIN Classifier (normal)

# Features and labels of the debug subset as plain numpy arrays.
X = train_data_min['sentence'].values
y = train_data_min['label'].values

# Nothing is stored or loaded here; fit() performs its own train/validation split.
clf = NNClassifier(test_size=TEST_SIZE, seed=SEED, store_version=None, load_version=None)
clf.fit(X, y)

In [None]:
# TRAIN Classifier (CROSS-VALIDATION)

'''
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

folds = 10
scores = cross_validate(clf, X, y, cv=folds, scoring=scoring_metrics)
print(scores)

accuracy_scores = scores['test_accuracy']
precision_scores = scores['test_precision']
recall_scores = scores['test_recall']
f1_scores = scores['test_f1']
roc_auc_scores = scores['test_roc_auc']
print(accuracy_scores, precision_scores, recall_scores, f1_scores, roc_auc_scores)
#print(f"Average Accuracy: {scores.mean():.4f}")

accuracy_scores.mean()
'''

In [None]:
def test_sents() -> list[str]:
  """Return a fixed list of hand-written English sentences for manually probing the classifier."""
  return [
    "After being posponed for 30 years, the event might happen in December now",
    "The rockets are going to be destroyed on Monday, following two decades of discussions",
    "After 3 years of fighting, the war will come to an end",
    "He should train more, it is good for him",
    "You could be a professional, if you wanted to",
    "He always thought that he was the best",
    "Elon Musk went to his space shuttle and took off",
    "He predicted in 2002 that two years later it would happen",
    "Trump said he has some new golf clubs coming",
    "Trump said he had some new golf clubs coming yesterday",
    "Trump went into his basement to do some writing",
    "Trump went into his basement to do some writing for his speech in a week",
    "Trump went into his basement to do some writing for his speech in one week",
    "Trump went into his basement to do some writing for his speech in two weeks",
    "Trump went into his basement to do some writing for his speech which he will present in two weeks",
    "Trump has a speech that he presents in a week, but first he needs something to drink",
    "Trump had a speech that he presented in a week, but first he needs something to drink",
    "If he would have been fit he would have won the semifinals",
    "If he is fit he could win the semifinals",
    "Trup went to drink a cofee before going into his office to prepare the speech for the coming rally",
    "He is going into his office to prepare the speech for the coming rally",
    "He went into his office to prepare the speech for the coming rally",
    "He went into his office to prepare the speech for the upcoming rally",
    "He went into his office to prepare the speech for the upcoming rally on Thursday",
    "He prepared the speech for the upcoming rally",
    "He went into his office to prepare the speech for the coming rally next February",
    "He went into his office to prepare the speech for the coming rally in two time units",
    "He went into his office to prepare the speech for the coming rally in Michigan",
    "Trump went into his basement to do some writing for his speech next week",
    "He goes into his basement often to look for what to do for the day",
    "He never really gets up early",
    "He never really gets up early, even if he knows that the weather is good outside",
    "He never really gets up early, even if he knows that the weather might be good outside",
    "Before waking up I always wash my face to prevent injuries",
    "Before waking up I always wash my face to prevent injuries that could come later in life",
    "Before waking up I always wash my face to prevent injuries that will come later in life",
    "Before waking up I always wash my face to prevent injuries that will come when I am older",
    "Before waking up I always wash my face to prevent injuries that might be fatal",
    "In Syria rebels have been fighting for very long, which was always dangerous for civilians, but this might come to an end",
    "In Syria rebels have been fighting for very long, which was always dangerous for civilians, but this might change",
    "Rebels fought relentlessly, but now the Russians have arrived",
    "Rebels fought relentlessly, but now the Russians have arrived and things are different now",
    "Rebels fought relentlessly, but now the Americans have arrived and things could get better",
    "Rebels fought relentlessly, but now the Americans are here and things could get better",
    "Rebels fought relentlessly, and therefore Trump is writing a speech to present in a week at his rally",
    "Rebels fought relentlessly, and therefore Trump writes a speech to present in a week",
    "Trump writes a speech right now",
    "Trump writes a speech right to present at his next rally",
    "Trump writes a speech right to surprise his followers at his upcoming rally",
    "Trump said after his meeting, that new rules are being discussed",
    "Trump said after his meeting, that new rules were discussed",
    "Trump said after his meeting, that new rules were discussed, but not yet implemented",
    "Trump said after his meeting, that things were discussed, but they are not yet implemented",
    "Trump said after his meeting, that things were discussed, but they are not implemented",
    "Trump said after his meeting, that new rules were discussed, which take effect tomorrow",
    "Trump said after his meeting, that new rules have been discussed",
    "Trump said after his meeting, that stock prices shot up recently",
  ]

In [None]:
# TEST Classifier

# Fresh classifier; predict() loads the stored weights of version 10.
clf = NNClassifier(TEST_SIZE, SEED, store_version=None, load_version=10)

# Sentences to classify. The earlier `test_data = X_test` assignment was a dead
# store (overwritten before being read) and has been dropped.
#test_data = test_sents()
test_data = download_df("backend", "elon_musk_sentences")
predicted_labels = clf.predict(test_data["sentence"])

'''
combined_label_sent = zip(test_data["sentence"], predicted_labels)
for sent,label in combined_label_sent:
    print(label, sent)
'''

# Predict With Model

In [None]:
# show density and bar plots of confidence-distribution

fig, axs = plt.subplots(ncols=2, figsize=(10, 3))

# Flatten so both plots receive one 1-D vector of confidences.
confidences = np.ravel(predicted_labels)

# `fill` replaces the deprecated `shade` argument of kdeplot.
sns.kdeplot(confidences, fill=True, ax=axs[0])
axs[0].set_title(f'Density plot of {len(predicted_labels)} news sentences')
axs[0].set_xlabel('Confidence')

# Draw on the second axes explicitly instead of relying on pyplot's current axes.
axs[1].hist(confidences, bins=10)
axs[1].set_title(f'Histogram of {len(predicted_labels)} news sentences')
axs[1].set_xlabel('Confidence')

plt.show()

In [None]:
def output_predictions_in_interval(pred_confs, test_data, start, end):
  """Return the rows of ``test_data`` whose confidence lies in ``[start, end]``.

  Args:
    pred_confs: one confidence per row of ``test_data``, in row order; any
      array-like that flattens to ``len(test_data)`` values (e.g. shape
      ``(n,)`` or ``(n, 1)``).
    test_data: DataFrame holding the classified rows.
    start, end: inclusive bounds of the confidence interval.

  Returns:
    A new DataFrame with the selected rows plus a ``confidence`` column;
    ``test_data`` itself is not modified.
  """
  confidences = np.asarray(pred_confs).reshape(-1)
  idx = np.flatnonzero((confidences >= start) & (confidences <= end))
  # idx holds array positions, not index labels, so select with iloc; this also
  # works when test_data does not carry a default 0..n-1 index. copy() detaches
  # the result from test_data before the new column is added.
  predictions = test_data.iloc[idx].copy()
  predictions['confidence'] = confidences[idx]

  print("Lenght:", len(predictions))
  return predictions

# Determine and Upload Positives/Negatives

In [None]:
# Confidence buckets; both bounds of every interval are inclusive.
# NOTE(review): confidences strictly between 0.1 and 0.11, or strictly between
# 0.89 and 0.9, fall into none of the three buckets.
pos = output_predictions_in_interval(pred_confs=predicted_labels, test_data=test_data, start=0.9, end=1.0)
neg = output_predictions_in_interval(pred_confs=predicted_labels, test_data=test_data, start=0.0, end=0.1)
undet = output_predictions_in_interval(pred_confs=predicted_labels, test_data=test_data, start=0.11, end=0.89)

In [None]:
# Display the rows bucketed as positives (confidence between 0.9 and 1.0).
pos

In [None]:
# Display the rows bucketed as negatives (confidence between 0.0 and 0.1).
neg

In [None]:
# Display the undetermined rows (confidence between 0.11 and 0.89).
undet

In [None]:
# Persist the positives; upload_to_db replaces the target table if it already exists.
upload_to_db("backend", "elon_musk_positives", pos)