In [None]:
import pandas as pd
import os
import openai
from sklearn.metrics import accuracy_score

from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
)

In [None]:
# Display the current working directory before it is changed.
os.getcwd()

In [None]:
# Move one directory up; presumably so the relative 'data/', 'openai/' and
# 'models/' paths used below resolve -- confirm against the repository layout.
# NOTE: not idempotent -- every re-run of this cell moves up one more level.
os.chdir('..')

In [None]:
# Texts to be annotated by the LLM; the first CSV column is used as the index.
unannotated = pd.read_csv(
    'data/unannotated/unannotated_sentiment_dataset.csv',
    index_col=[0],
    encoding='unicode_escape',
)

# Original training set; its 'sentiment' column is compared against the LLM labels below.
original_dataset = pd.read_csv(
    'data/original/train.csv',
    encoding='unicode_escape',
)

In [None]:
# Read the OpenAI organization id and API key from local text files
# (surrounding whitespace stripped) and set them on the `openai` module.
for _attr, _path in (
    ('organization', 'openai/organization.txt'),
    ('api_key', 'openai/key.txt'),
):
    with open(_path, 'r') as file:
        setattr(openai, _attr, file.read().strip())

In [None]:
# Running totals for the text-ada-001 run; re-executing this cell resets them.
accumulated_tokens_method1 = 0
accumulated_cost_method1 = 0
cost_per_token = 0.0035 / 1000  # The total cost per token, input and output
index = 0  # incremented once per completed API call


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def analyze_ada(text):
    """Ask text-ada-001 for a numeric sentiment code for ``text``.

    The prompt requests 1 for positive, 0 for neutral and 2 for negative.
    Token usage and cost are added to the module-level running totals and
    printed after every call. Any exception triggers a retry with random
    exponential backoff (1-60 s), for at most 6 attempts.

    Args:
        text: The text to classify; it is embedded in the prompt verbatim.

    Returns:
        The completion text, stripped and lower-cased.
    """
    global index, accumulated_tokens_method1, accumulated_cost_method1

    completion = openai.Completion.create(
        engine="text-ada-001",
        prompt=f"Sentiment analysis for the following text in a single number: 1 for positive, 0 for neutral, 2 for negative: \"{text}\"",
        max_tokens=3,
        temperature=0.2,
    )

    tokens_this_call = completion['usage']['total_tokens']
    print(f"Total tokens used for this call: {tokens_this_call}")

    # Update the running totals before reporting them.
    cost_this_call = tokens_this_call * cost_per_token
    index = index + 1
    accumulated_tokens_method1 = accumulated_tokens_method1 + tokens_this_call
    accumulated_cost_method1 = accumulated_cost_method1 + cost_this_call

    print('\nIndex: ', index)
    print(f"Cost for this call: {cost_this_call}")
    print(f"Accumulated tokens so far: {accumulated_tokens_method1}")
    print(f"Accumulated cost so far: {accumulated_cost_method1}")

    answer = completion.choices[0].text.strip().lower()
    print('response: ', answer)
    return answer



In [None]:
# Running totals for the gpt-3.5-turbo run. analyze_gpt35 declares these as
# globals and updates them with `+=`, so they have to exist at module level
# before the first call; otherwise every attempt raises NameError *after* the
# API call has been made, and @retry repeats that billed call six times.
accumulated_tokens = 0
accumulated_cost = 0


@retry(wait=wait_random_exponential(min=1, max=120), stop=stop_after_attempt(6))
def analyze_gpt35(text):
    """Classify the sentiment of ``text`` with gpt-3.5-turbo.

    Several completions are sampled in one request. The first one is taken
    as the label and the share of samples that agree with it is returned as
    a confidence score. Token usage and cost are added to the module-level
    running totals (``index`` and ``cost_per_token`` come from the cell that
    defines analyze_ada). Any exception triggers a retry with random
    exponential backoff (1-120 s), for at most 6 attempts.

    Args:
        text: The text to classify; it is embedded in the user message verbatim.

    Returns:
        Tuple ``(label, confidence)``: the stripped, lower-cased first
        completion and the fraction of sampled completions equal to it.
    """
    global index
    global accumulated_cost
    global accumulated_tokens

    # Number of completions sampled per request; also the confidence denominator.
    num_samples = 3

    messages = [
        {"role": "system", "content": "Your task is to analyze text and classify its sentiment as either 'positive', 'negative', or 'neutral' in a single word."},
        {"role": "user", "content": f"Classify the sentiment of: '{text}'."}
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=3,
        n=num_samples,
        temperature=0.5
    )

    total_tokens_used = response['usage']['total_tokens']
    print(f"Total tokens used for this call: {total_tokens_used}")

    call_cost = total_tokens_used * cost_per_token
    accumulated_cost += call_cost
    accumulated_tokens += total_tokens_used
    index += 1
    print('Index: ', index)
    print(f"Cost for this call: {call_cost}")
    print(f"Accumulated tokens so far: {accumulated_tokens}")
    print(f"Accumulated cost so far: {accumulated_cost}\n")

    response_texts = [choice.message.content.strip().lower() for choice in response.choices]
    primary_response = response_texts[0]
    # Divide by the number of completions actually returned rather than a
    # literal 3, so the score stays a true fraction if `num_samples` changes.
    confidence_score = response_texts.count(primary_response) / len(response_texts)

    return primary_response, confidence_score

In [None]:
# Work on a copy so the loaded `unannotated` frame itself is not modified.
llm_annotated_data = unannotated.copy()

In [None]:
# Number of leading rows that are sent to the LLM for annotation below.
num_rows = 50

In [None]:
#llm_annotated_data['predicted_gpt35'] = llm_annotated_data['text'].iloc[0:num_rows].apply(analyze_gpt35)
#llm_annotated_data['predicted_label'] = llm_annotated_data['text'].iloc[0:num_rows].apply(analyze_ada)

In [None]:
llm_annotated_data

In [None]:
# Annotate the first `num_rows` texts (one API request per row); each element
# of the result is the (label, confidence) tuple returned by analyze_gpt35.
sentiments_and_scores = llm_annotated_data['text'].iloc[0:num_rows].apply(analyze_gpt35)

In [None]:
# Split the tuples into two columns, written only to the annotated rows.
llm_annotated_data.loc[llm_annotated_data.index[0:num_rows], 'predicted_labels'] = [x[0] for x in sentiments_and_scores]
llm_annotated_data.loc[llm_annotated_data.index[0:num_rows], 'confidence_score'] = [x[1] for x in sentiments_and_scores]
# Element-wise comparison with the original labels, stored as the strings
# 'True' / 'False'. Rows that were not annotated hold no predicted label and
# therefore compare as 'False'.
# NOTE(review): `==` between two Series needs identically-labelled indexes --
# confirm that `llm_annotated_data` and `original_dataset` share the same index.
llm_annotated_data['annotation_correct'] = (llm_annotated_data['predicted_labels'] == original_dataset['sentiment']).astype(str)

In [None]:
llm_annotated_data

In [None]:
#llm_annotated_data.to_csv('data/sentiment/annotated/ada/ada_500.csv')

In [None]:
#original_dataset.loc[0:num_rows - 1, 'sentiment'] = original_dataset['sentiment'].iloc[0:num_rows].map(sentiments)

In [None]:
#original_dataset

In [None]:
llm_annotated_data.iloc[0:num_rows]

In [None]:
#llm_annotated_data['annotation_correct'] = (llm_annotated_data['predicted_gpt35'].iloc[0:num_rows] == original_dataset['sentiment'].iloc[0:num_rows]).astype(int)

# Flag, for the annotated rows only, whether the LLM label equals the original
# label (1) or not (0). analyze_gpt35 returns word labels such as 'positive',
# so the values are compared as strings; casting them to int, as this cell did
# before, raises ValueError.
# NOTE(review): assumes original_dataset['sentiment'] holds the same lower-case
# words -- confirm against the CSV.
_predicted_slice = llm_annotated_data['predicted_labels'].iloc[0:num_rows]
_expected_values = original_dataset['sentiment'].iloc[0:num_rows].astype(str).to_numpy()
# Rows outside the annotated slice become NaN, exactly as with the previous
# index-aligned assignment of a shorter Series.
llm_annotated_data['annotation_correct'] = pd.Series(
    (_predicted_slice.astype(str).to_numpy() == _expected_values).astype(int),
    index=_predicted_slice.index,
)

In [None]:
llm_annotated_data

In [None]:
llm_annotated_data.iloc[0:num_rows].to_csv('data/sentiment/gpt35_annotated.csv')

In [None]:
#llm_annotated_data['predicted_gpt35'] = llm_annotated_data['text'].iloc[0:num_rows].apply(analyze_gpt35)

In [None]:
#print(f"Accuracy of GPT3.5's annotations: {accuracy_score(original_dataset['sentiment'].iloc[0:num_rows].astype('str').values, llm_annotated_data['predicted_gpt35'].iloc[0:num_rows].astype('str').values)}")

# Accuracy of the LLM labels against the original labels on the annotated rows.
# The predictions are stored in 'predicted_labels'; no cell in this notebook
# adds a 'predicted_davinci' column to `llm_annotated_data`, so the previous
# lookup raised KeyError (and the message named the wrong model).
print(f"Accuracy of GPT3.5's annotations: {accuracy_score(original_dataset['sentiment'].iloc[0:num_rows].astype('str').values, llm_annotated_data['predicted_labels'].iloc[0:num_rows].astype('str').values)}")

In [None]:
#llm_annotated_data.iloc[0:num_rows].to_csv('data/sentiment/gpt35_annotated.csv')

In [None]:
dataset = pd.read_csv('data/sentiment/davinci003_annotated_300.csv', index_col=[0])

In [None]:
# Remove full stops from the model's answers (e.g. "positive." -> "positive").
# The vectorised `.str` accessor leaves missing values as NaN, whereas calling
# `x.replace` on each element raises AttributeError on a NaN float.
# `regex=False` makes '.' a literal dot rather than the regex wildcard.
dataset['predicted_davinci'] = dataset['predicted_davinci'].str.replace('.', '', regex=False)

In [None]:
dataset['predicted_davinci'].value_counts()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [None]:
# Notebook-level configuration, used by the test-set evaluation cells at the
# end of the notebook.
MODEL_NAME = 'bert-base-uncased'
BATCH_SIZE = 16
EPOCHS = 3
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Encode the Davinci labels as integer class ids on a copy of `dataset`.
# Series.map yields NaN for any label that is not a key of `sentiments`.
data = dataset.copy()
sentiments = {'positive': 0, 'neutral': 1, 'negative': 2}
data['predicted_davinci'] = data['predicted_davinci'].map(sentiments)

In [None]:
data['predicted_davinci']

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import numpy as np
import mlflow
import mlflow.pytorch
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

from sklearn.preprocessing import LabelEncoder



class SentimentDataset(Dataset):
    """Torch dataset that tokenizes one text per item and pairs it with its class id.

    Args:
        texts: Collection of texts indexable by ``0..len-1``; each item is
            passed through ``str()`` before tokenization.
        targets: Collection of integer class ids, indexable the same way as ``texts``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its encoded tensors and its target for item ``idx``."""
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            # Padding alone only lengthens short inputs. Without truncation a
            # text longer than max_len produces a longer tensor, and the
            # DataLoader then cannot stack the batch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # encode_plus returns (1, max_len) tensors; flatten to (max_len,).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

def train_epoch(model, data_loader, optimizer, device, scheduler=None):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask`` and ``targets``; must expose ``.dataset``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional scheduler, stepped after each optimizer step.

    Returns:
        Tuple ``(accuracy, mean_loss)``: correct predictions divided by the
        dataset size (a double tensor) and the mean of the per-batch losses.
    """
    model = model.train()
    batch_losses = []
    num_correct = 0

    for batch in tqdm(data_loader):
        targets = batch["targets"].to(device)
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=targets,
        )

        # Predicted class = position of the largest logit in each row.
        preds = torch.max(outputs.logits, dim=1).indices
        num_correct = num_correct + (preds == targets).sum()
        batch_losses.append(outputs.loss.item())

        outputs.loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler:
            scheduler.step()
        optimizer.zero_grad()

    accuracy = num_correct.double() / len(data_loader.dataset)
    return accuracy, np.mean(batch_losses)

def eval_model(model, data_loader, device, sentiments=None):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.logits``.
        data_loader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask`` and ``targets``; must expose ``.dataset`` and
            yield at least one batch.
        device: Device the batch tensors are moved to.
        sentiments: Optional mapping of class name -> class id. When given,
            the report has one row per entry, labelled with its name. When
            omitted, the report uses the class ids found in the data.

    Returns:
        Tuple ``(accuracy, report)``: correct predictions divided by the
        dataset size (a double tensor) and the text produced by
        ``classification_report``.
    """
    model = model.eval()

    correct_predictions = 0
    predictions = []
    real_values = []

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            _, preds = torch.max(outputs.logits, dim=1)

            # Keep one CPU tensor per batch instead of one 0-dim tensor per sample.
            predictions.append(preds.cpu())
            real_values.append(targets.cpu())
            correct_predictions += torch.sum(preds == targets)

    predictions = torch.cat(predictions).numpy()
    real_values = torch.cat(real_values).numpy()

    if sentiments is None:
        report = classification_report(real_values, predictions)
    else:
        # Passing `labels` together with `target_names` keeps names and ids
        # paired, and avoids the ValueError raised when the evaluated split
        # happens not to contain every class.
        report = classification_report(
            real_values,
            predictions,
            labels=list(sentiments.values()),
            target_names=list(sentiments.keys()),
        )

    return correct_predictions.double() / len(data_loader.dataset), report

In [None]:
def train_bert(model_path, data_path):
    """Fine-tune BERT on an LLM-annotated CSV, track the run in MLflow and register the model.

    Args:
        model_path: Destination passed to ``torch.save``. The registered-model
            name is derived from its file name (see ``model_name`` below).
        data_path: CSV whose first column is the index, with a ``text`` column
            and a ``predicted_labels`` column holding 'positive', 'neutral'
            or 'negative'.

    Returns:
        None. The function saves the model, logs parameters and metrics,
        registers the model and prints a short summary.
    """
    experiment_name = "llm_seminar_data_annotation"
    client = MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name)
    else:
        experiment_id = experiment.experiment_id

    # Registered-model name: the file name with its last two '_'-separated
    # parts removed, e.g. 'models/bert_sentiment_gpt35_1000_model2.pt'
    # -> 'bert_sentiment_gpt35'.
    model_name = "_".join(model_path.split("/")[-1].split("_")[:-2])

    # The `with` block ends the run on exit, including on the early returns
    # below, so no separate mlflow.end_run() call is needed.
    with mlflow.start_run(experiment_id=experiment_id) as run:
        DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        MODEL_NAME = 'bert-base-uncased'
        BATCH_SIZE = 16
        EPOCHS = 1

        mlflow.log_param("batch_size", BATCH_SIZE)
        mlflow.log_param("epochs", EPOCHS)
        mlflow.log_param("model_name", MODEL_NAME)

        sentiments = {'positive': 0, 'neutral': 1, 'negative': 2}

        data = pd.read_csv(data_path, index_col=[0])  # LLM annotated dataset
        labels = data['predicted_labels'].map(sentiments)

        # Series.map gives NaN for anything that is not a key of `sentiments`
        # (e.g. 'positive.' or a truncated answer). NaN cannot be turned into
        # a valid class id, so those rows are left out of training.
        has_label = labels.notna()
        if not has_label.all():
            print(f"Dropping {int((~has_label).sum())} rows with labels outside {list(sentiments)}")
        texts = data.loc[has_label, 'text']
        targets = labels[has_label].astype(int)

        train_texts, val_texts, train_targets, val_targets = train_test_split(texts, targets, test_size=0.2)

        # SentimentDataset looks items up by position 0..n-1.
        train_texts = train_texts.reset_index(drop=True)
        val_texts = val_texts.reset_index(drop=True)
        train_targets = train_targets.reset_index(drop=True)
        val_targets = val_targets.reset_index(drop=True)

        tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

        train_data = SentimentDataset(train_texts, train_targets, tokenizer, max_len=128)
        val_data = SentimentDataset(val_texts, val_targets, tokenizer, max_len=128)
        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)

        model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(sentiments)).to(DEVICE)

        # Train
        optimizer = AdamW(model.parameters(), lr=2e-5)
        for epoch in range(EPOCHS):
            print(f'Epoch {epoch + 1}/{EPOCHS}')
            print('-' * 10)

            train_acc, train_loss = train_epoch(model, train_loader, optimizer, DEVICE)
            # Convert to plain floats for MLflow and log one point per epoch.
            train_acc = float(train_acc)
            train_loss = float(train_loss)
            mlflow.log_metric("train_acc", train_acc, step=epoch)
            mlflow.log_metric("train_loss", train_loss, step=epoch)

            print(f'Train loss: {train_loss}, accuracy: {train_acc}')

            val_acc, val_report = eval_model(model, val_loader, DEVICE, sentiments)
            val_acc = float(val_acc)
            mlflow.log_metric("val_acc", val_acc, step=epoch)

            print(f'Val accuracy: {val_acc}\n')

        # torch.save does not create missing parent directories.
        model_dir = os.path.dirname(model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        torch.save(model, model_path)

        # Log the model artifact once, then register exactly that artifact.
        mlflow.pytorch.log_model(model, "model")
        registered = mlflow.register_model(
            model_uri=f"runs:/{run.info.run_id}/model",
            name=model_name
        )

        # Compare against the other runs of the experiment. Runs that never
        # logged "val_acc" (e.g. ones that failed early) are skipped instead
        # of raising KeyError.
        scored_runs = [
            (candidate.data.metrics["val_acc"], candidate.info.run_id)
            for candidate in client.search_runs(experiment_ids=[experiment_id])
            if "val_acc" in candidate.data.metrics
        ]
        if not scored_runs:
            print("No runs found.")
            return

        best_val_acc, best_run_id = max(scored_runs)

        # Load the version registered above back from the registry as a check.
        # Note this is the model trained in this run, which is not necessarily
        # the best run of the experiment.
        try:
            model_version = registered.version
            model_uri = f"models:/{model_name}/{model_version}"
            model = mlflow.pytorch.load_model(model_uri)
        except Exception as e:
            print(f"Error fetching model: {e}")
            return

        # Print the model details
        print(f"Model Name: {model_name}")
        print(f"Model Version: {model_version}")
        print(f"Best Run ID: {best_run_id}")
        # val_acc is a fraction in [0, 1]; scale it before printing as a percentage.
        print(f"Best Validation Accuracy: {best_val_acc * 100:.2f}%")


In [None]:
train_bert('models/bert_sentiment_gpt35_1000_model2.pt', 'data/annotated/gpt35_conf_scores_1000_preproc.csv')

In [None]:
# Load the held-out test set.
test = pd.read_csv('data/sentiment/test.csv', encoding= 'unicode_escape')

In [None]:
# Keep only the 'text' and 'sentiment' columns (modifies `test` in place).
test.drop(columns=[x for x in test.columns if x != 'text' and x != 'sentiment'], inplace=True)

In [None]:
# Drop rows that have no sentiment label (modifies `test` in place).
test.dropna(subset=['sentiment'], inplace=True)

In [None]:
# Class distribution of the remaining test rows.
test.sentiment.value_counts()

In [None]:
# Cast every column to str; note that a missing text becomes the string 'nan'.
for i in test.columns:
    test[i] = test[i].astype('str')

In [None]:
# Reset the index to 0..n-1 because SentimentDataset looks items up with [idx].
test_texts = test['text'].reset_index(drop=True)
test_targets = test['sentiment'].reset_index(drop=True)

In [None]:
# Encode the test labels with the same name -> id mapping the classifier was
# trained on. LabelEncoder numbers classes in sorted order (negative=0,
# neutral=1, positive=2), which disagrees with `sentiments` (positive=0,
# neutral=1, negative=2) and swaps positive and negative at evaluation time.
if isinstance(test_targets[0], str):
    _encoded_targets = test_targets.str.strip().str.lower().map(sentiments)
    # Rows whose label is not a key of `sentiments` cannot be scored; drop
    # them from both the texts and the targets so the two stay aligned.
    _known = _encoded_targets.notna()
    test_texts = test_texts[_known].reset_index(drop=True)
    test_targets = _encoded_targets[_known].astype(int).reset_index(drop=True)

In [None]:
# `tokenizer` only exists as a local variable inside train_bert, so it is
# undefined here on a fresh kernel. Create the same tokenizer explicitly
# (MODEL_NAME is 'bert-base-uncased', the checkpoint train_bert uses).
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
test_data = SentimentDataset(test_texts, test_targets, tokenizer, max_len=128)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# `model` only exists as a local variable inside train_bert, so reload the
# classifier that train_bert wrote with torch.save (path from the call above).
# NOTE: torch.load unpickles the file -- only load checkpoints you created.
# NOTE(review): PyTorch releases that default to weights_only=True need
# weights_only=False here to load a fully pickled model -- confirm the version.
model = torch.load('models/bert_sentiment_gpt35_1000_model2.pt', map_location=DEVICE)
# eval_model takes the class-name mapping as its fourth argument; it labels
# the rows of the classification report.
test_acc, test_report = eval_model(model, test_loader, DEVICE, sentiments)
print(f'Test accuracy: {test_acc}')
print(test_report)