In [None]:
import pandas as pd
import os
import openai
from sklearn.metrics import accuracy_score

from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
)

In [None]:
os.getcwd()

In [None]:
# Move one level up so the relative 'data/...' paths used below resolve
# (presumably the project root - confirm for your checkout).
# Guarded so that re-running this cell does not climb one directory higher
# on every execution.
if not os.path.isdir('data/sentiment'):
    os.chdir('..')

In [None]:
data = pd.read_csv('data/sentiment/annotated/gpt35/gpt35_annotated_400.csv', index_col = 0 )

In [None]:
data

In [None]:
unannotated = pd.read_csv('data/sentiment/unannotated/unannotated_sentiment_dataset.csv', encoding= 'unicode_escape', index_col=[0])
original_dataset = pd.read_csv('data/sentiment/original/train.csv', encoding= 'unicode_escape')

In [None]:
# Read the OpenAI organization id and API key from local text files and
# attach them to the openai module (surrounding whitespace is stripped).
for attribute, path in (
    ('organization', 'openai/organization.txt'),
    ('api_key', 'openai/key.txt'),
):
    with open(path, 'r') as file:
        setattr(openai, attribute, file.read().strip())

In [None]:
# Running usage totals for the Davinci annotator (reset every time this cell runs).
accumulated_tokens_method1 = 0
accumulated_cost_method1 = 0
cost_per_token = 0.0035 / 1000  # The total cost per token, input and output
# Count of completed API calls. NOTE: `index` and `cost_per_token` are also
# assigned by the GPT-3.5 cell, so both annotators share them.
index = 0

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def analyze_davinci(text):
    """Label the sentiment of `text` with the `text-davinci-003` completion model.

    The prompt asks for a single number: 1 = positive, 0 = neutral,
    2 = negative. A failing call is retried by tenacity with randomised
    exponential back-off (1-60 s) for up to 6 attempts. Token usage and the
    estimated cost are added to the module-level counters and printed after
    every successful call.

    Args:
        text: The text to classify.

    Returns:
        The model's reply, stripped, lower-cased and without trailing periods,
        so that a reply such as "1." comes back as "1" and can be cast to int
        by the cells that score the annotations.
    """
    global index
    global accumulated_tokens_method1
    global accumulated_cost_method1

    prompt = f"Sentiment analysis for the following text in a single number: 1 for positive, 0 for neutral, 2 for negative: \"{text}\""

    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=10,
        temperature=0
    )

    # Accounting happens only after a successful call, so retried failures
    # are not counted twice.
    total_tokens_used = response['usage']['total_tokens']
    print(f"Total tokens used for this call: {total_tokens_used}")

    call_cost = total_tokens_used * cost_per_token
    accumulated_cost_method1 += call_cost
    accumulated_tokens_method1 += total_tokens_used
    index += 1
    print('\nIndex: ', index)
    print(f"Cost for this call: {call_cost}")
    print(f"Accumulated tokens so far: {accumulated_tokens_method1}")
    print(f"Accumulated cost so far: {accumulated_cost_method1}")

    # Normalise the reply: completions frequently end with a period ("1."),
    # which would otherwise break the integer cast used when scoring.
    response_text = response.choices[0].text.strip().lower().rstrip('.').strip()

    return response_text


In [None]:
# Running usage totals for the GPT-3.5 annotator (reset every time this cell runs).
accumulated_tokens = 0
accumulated_cost = 0
cost_per_token = 0.0035 / 1000  # The total cost per token, input and output
# Count of completed API calls. NOTE: `index` and `cost_per_token` are also
# assigned by the Davinci cell, so both annotators share them.
index = 0

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def analyze_gpt35(text):
    """Label the sentiment of `text` with the `gpt-3.5-turbo` chat model.

    The system message provides three few-shot examples and the user message
    asks for a one-word answer: 'positive', 'negative' or 'neutral'. A failing
    call is retried by tenacity with randomised exponential back-off (1-60 s)
    for up to 6 attempts. Token usage and the estimated cost are added to the
    module-level counters and printed after every successful call.

    Args:
        text: The text to classify.

    Returns:
        The model's reply, stripped, lower-cased and without trailing periods,
        so that a reply such as "Positive." comes back as "positive".
    """
    global index
    global accumulated_cost
    global accumulated_tokens
    messages = [
        {"role": "system", "content": """You are trained to analyze and detect the sentiment of given text. Here are some examples:
                                            User: 'I love this!'; You: 'positive'
                                            User: 'I hate this!'; You: 'negative'
                                            User: 'I don't feel strongly about this.'; You: 'neutral' """},
        {"role": "user", "content": f"""Analyze the following texts and determine if the sentiment is: positive, negative, or neutral.
                                        Return answer in a single word as either 'positive', 'negative', or 'neutral': '{text}'"""}
        ]

    response = openai.ChatCompletion.create(
                      model="gpt-3.5-turbo",
                      messages=messages,
                      max_tokens=10,
                      n=1,
                      stop=None,
                      temperature=0)

    # Accounting happens only after a successful call, so retried failures
    # are not counted twice.
    total_tokens_used = response['usage']['total_tokens']
    print(f"Total tokens used for this call: {total_tokens_used}")

    call_cost = total_tokens_used * cost_per_token
    accumulated_cost += call_cost
    accumulated_tokens += total_tokens_used
    index+=1
    print('Index: ', index)
    print(f"Cost for this call: {call_cost}")
    print(f"Accumulated tokens so far: {accumulated_tokens}")
    print(f"Accumulated cost so far: {accumulated_cost}\n")

    # Normalise the reply so "Positive." and "positive" yield the same label.
    response_text = response.choices[0].message.content.strip().lower().rstrip('.').strip()

    return response_text


In [None]:
llm_annotated_data = unannotated.copy()

In [None]:
llm_annotated_data

In [None]:
num_rows = 1000

In [None]:
#llm_annotated_data['predicted_gpt35'] = llm_annotated_data['text'].iloc[0:num_rows].apply(analyze_gpt35)
llm_annotated_data['predicted_davinci'] = llm_annotated_data['text'].iloc[0:num_rows].apply(analyze_davinci)

In [None]:
sentiments = {'positive': 1, 'neutral': 0, 'negative': 2}

In [None]:
# Encode the gold labels of the first `num_rows` rows with the `sentiments`
# mapping. Values that are not keys of the mapping (e.g. labels already
# encoded by a previous run of this cell) are left untouched, so re-running
# the cell no longer turns every label into NaN.
original_dataset.loc[0:num_rows - 1, 'sentiment'] = (
    original_dataset['sentiment']
    .iloc[0:num_rows]
    .map(lambda label: sentiments.get(label, label))
)


In [None]:
original_dataset

In [None]:
num_rows

In [None]:
llm_annotated_data

In [None]:
llm_annotated_data.iloc[0:num_rows]

In [None]:
#llm_annotated_data['annotation_correct'] = (llm_annotated_data['predicted_gpt35'].iloc[0:num_rows] == original_dataset['sentiment'].iloc[0:num_rows]).astype(int)

# Flag (1/0) whether the Davinci label equals the gold label for the first
# `num_rows` rows. Both sides are parsed with errors='coerce': a reply that is
# not a number becomes NaN and is scored as incorrect instead of aborting the
# whole cell with a ValueError from astype(int).
predicted_labels = pd.to_numeric(llm_annotated_data['predicted_davinci'].iloc[0:num_rows], errors='coerce')
gold_labels = pd.to_numeric(original_dataset['sentiment'].iloc[0:num_rows], errors='coerce')
llm_annotated_data['annotation_correct'] = (predicted_labels == gold_labels).astype(int)

In [None]:
llm_annotated_data

In [None]:
# Persist the first `num_rows` annotated rows as CSV.
# NOTE(review): the annotation column filled in above is 'predicted_davinci',
# yet the file is named 'gpt35_annotated.csv' - confirm the intended output
# path before running, since to_csv overwrites an existing file of that name.
llm_annotated_data.iloc[0:num_rows].to_csv('data/sentiment/gpt35_annotated.csv')

In [None]:
#llm_annotated_data['predicted_gpt35'] = llm_annotated_data['text'].iloc[0:num_rows].apply(analyze_gpt35)

In [None]:
#print(f"Accuracy of GPT3.5's annotations: {accuracy_score(original_dataset['sentiment'].iloc[0:num_rows].astype('str').values, llm_annotated_data['predicted_gpt35'].iloc[0:num_rows].astype('str').values)}")

print(f"Accuracy of Davinci 003's annotations: {accuracy_score(original_dataset['sentiment'].iloc[0:num_rows].astype('str').values, llm_annotated_data['predicted_davinci'].iloc[0:num_rows].astype('str').values)}")

In [None]:
#llm_annotated_data.iloc[0:num_rows].to_csv('data/sentiment/gpt35_annotated.csv')

In [None]:
dataset = pd.read_csv('data/sentiment/davinci003_annotated_300.csv', index_col=[0])

In [None]:
# Remove periods from the Davinci labels (e.g. a trailing '.'). The vectorised
# string accessor passes missing values through, whereas calling
# str.replace inside a lambda raises AttributeError on NaN.
# regex=False keeps '.' a literal character.
dataset['predicted_davinci'] = dataset['predicted_davinci'].str.replace('.', '', regex=False)

In [None]:
dataset['predicted_davinci'].value_counts()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [None]:
MODEL_NAME = 'bert-base-uncased'
BATCH_SIZE = 16
EPOCHS = 3
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
data = dataset.copy()
# Class ids used for BERT fine-tuning. NOTE: this rebinds `sentiments`, which
# an earlier cell defines with a different encoding (positive=1, neutral=0).
sentiments = {'positive': 0, 'neutral': 1, 'negative': 2}
data['predicted_davinci'] = data['predicted_davinci'].map(sentiments)

# Labels outside the mapping come back as NaN and cannot be turned into integer
# class targets, so report and drop them before training.
unmapped = data['predicted_davinci'].isna()
if unmapped.any():
    print(f"Dropping {int(unmapped.sum())} rows whose label is not one of {list(sentiments)}")
    data = data.loc[~unmapped].copy()
data['predicted_davinci'] = data['predicted_davinci'].astype(int)

In [None]:
data['predicted_davinci']

In [None]:
class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one text per item for sequence classification.

    `texts` and `targets` are indexed with integers 0..len(texts)-1, so a
    pandas Series passed in must have a reset (0-based) index.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        """Store the raw inputs; tokenization happens lazily in __getitem__.

        Args:
            texts: Indexable collection of input texts.
            targets: Indexable collection of integer class ids, parallel to `texts`.
            tokenizer: Object exposing `encode_plus` (e.g. a Hugging Face tokenizer).
            max_len: Fixed token length every item is padded/truncated to.
        """
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its encoded tensors and its target for item `idx`."""
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            # Without truncation, a text longer than max_len produces a longer
            # tensor than its neighbours and the DataLoader cannot stack the batch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # encode_plus returns a leading batch axis with return_tensors='pt';
            # flatten it away so the DataLoader adds the batch axis itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
# Hold out 10% of the annotated rows for validation. A fixed random_state makes
# the split (and therefore the reported metrics) reproducible across
# Restart & Run All.
train_texts, val_texts, train_targets, val_targets = train_test_split(
    data['text'], data['predicted_davinci'], test_size=0.1, random_state=42
)

# SentimentDataset looks items up by 0..len-1, so discard the shuffled
# original index labels.
train_texts = train_texts.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
train_targets = train_targets.reset_index(drop=True)
val_targets = val_targets.reset_index(drop=True)

In [None]:
# Tokenizer for the pretrained checkpoint named by MODEL_NAME.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# Training split: wrapped as a dataset and served in shuffled batches.
train_data = SentimentDataset(texts=train_texts, targets=train_targets, tokenizer=tokenizer, max_len=128)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Validation split: same preprocessing, original order kept.
val_data = SentimentDataset(texts=val_texts, targets=val_targets, tokenizer=tokenizer, max_len=128)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Pretrained BERT with a fresh classification head, one output per sentiment class.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(sentiments)).to(DEVICE)

# transformers' own AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the values the transformers class
# used by default, so the optimisation setup stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    """Run one optimisation pass over `data_loader`.

    Every batch is a dict holding "input_ids", "attention_mask" and "targets"
    tensors. For each batch the model is called with the targets as labels,
    the loss is back-propagated, gradients are clipped to a norm of 1.0 and
    the optimizer (plus the optional scheduler) is stepped.

    Args:
        model: Model returning an object with `.loss` and `.logits`.
        data_loader: Iterable of batches; must expose `.dataset` for its size.
        optimizer: Optimizer updating the model's parameters.
        device: Device the batch tensors are moved to.
        scheduler: Optional learning-rate scheduler stepped once per batch.

    Returns:
        Tuple `(accuracy, mean_loss)`: correct predictions divided by the
        dataset size (a double tensor) and the mean of the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0

    for batch in tqdm(data_loader):
        input_ids, attention_mask, targets = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "targets")
        )

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)

        # Predicted class = position of the largest logit in each row.
        predicted = torch.max(outputs.logits, dim=1).indices
        n_correct += (predicted == targets).sum()
        batch_losses.append(outputs.loss.item())

        outputs.loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler:
            scheduler.step()
        # Clear gradients so the next batch starts from zero.
        optimizer.zero_grad()

    accuracy = n_correct.double() / len(data_loader.dataset)
    return accuracy, np.mean(batch_losses)

def eval_model(model, data_loader, device):
    """Evaluate `model` on `data_loader` without updating its weights.

    Args:
        model: Model returning an object with `.logits`.
        data_loader: Iterable of dict batches with "input_ids",
            "attention_mask" and "targets"; must expose `.dataset`.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple `(accuracy, report)`: correct predictions divided by the dataset
        size (a double tensor) and the text produced by sklearn's
        `classification_report`, with class names taken from the module-level
        `sentiments` mapping.
    """
    model = model.eval()

    correct_predictions = 0
    predictions = []
    real_values = []

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            _, preds = torch.max(outputs.logits, dim=1)

            # Collect whole batches on the CPU instead of one 0-d tensor per sample.
            predictions.append(preds.cpu())
            real_values.append(targets.cpu())
            correct_predictions += torch.sum(preds == targets)

    predictions = torch.cat(predictions).numpy()
    real_values = torch.cat(real_values).numpy()

    # Pair every class id with its name explicitly. Without `labels`, the report
    # raises when a class is absent from this split, and the names would depend
    # on the dict's insertion order matching the ids.
    ordered_classes = sorted(sentiments.items(), key=lambda item: item[1])
    report = classification_report(
        real_values,
        predictions,
        labels=[class_id for _, class_id in ordered_classes],
        target_names=[name for name, _ in ordered_classes],
        zero_division=0,
    )
    return correct_predictions.double() / len(data_loader.dataset), report

In [None]:
# Fine-tune for EPOCHS passes over the training loader; after every pass,
# print the training loss/accuracy and the validation accuracy and report.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # No scheduler is passed, so the learning rate stays at the optimizer's value.
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, DEVICE)
    print(f'Train loss: {train_loss}, accuracy: {train_acc}')

    val_acc, val_report = eval_model(model, val_loader, DEVICE)
    print(f'Val accuracy: {val_acc}\n')
    print(val_report)

In [None]:
# torch.save does not create missing folders, so make sure the target
# directory exists before writing the fine-tuned weights.
model_path = 'models/davinci/bert_sentiment_davinci003_300_model.pt'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

In [None]:
test = pd.read_csv('data/sentiment/test.csv', encoding= 'unicode_escape')

In [None]:
test.drop(columns=[x for x in test.columns if x != 'text' and x != 'sentiment'], inplace=True)

In [None]:
test.dropna(subset=['sentiment'], inplace=True)

In [None]:
test.sentiment.value_counts()

In [None]:
for i in test.columns:
    test[i] = test[i].astype('str')

In [None]:
test_texts = test['text'].reset_index(drop=True)
test_targets = test['sentiment'].reset_index(drop=True)

In [None]:
if isinstance(test_targets[0], str):
    # Encode the gold test labels with the SAME class ids the model was trained
    # on. LabelEncoder assigns ids alphabetically (negative=0, neutral=1,
    # positive=2), which disagrees with `sentiments` (positive=0, neutral=1,
    # negative=2) and would swap the positive/negative classes in the test
    # accuracy and report.
    encoded_targets = test_targets.str.strip().str.lower().map(sentiments)
    if encoded_targets.isna().any():
        unknown_labels = sorted(test_targets[encoded_targets.isna()].unique())
        raise ValueError(f"Unexpected sentiment labels in test set: {unknown_labels}")
    # NumPy array of ints, the same container type LabelEncoder used to return.
    test_targets = encoded_targets.astype(int).to_numpy()

In [None]:
test_data = SentimentDataset(test_texts, test_targets, tokenizer, max_len=128)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
test_acc, test_report = eval_model(model, test_loader, DEVICE)
print(f'Test accuracy: {test_acc}')
print(test_report)