# **Installation and Imports**

In [None]:
# Install the Hugging Face libraries this notebook imports further down.
!pip install transformers --quiet
!pip install huggingface_hub --quiet
# accelerate is not imported directly in this notebook — presumably a Trainer requirement; TODO confirm.
!pip install -U accelerate --quiet

# NOTE(review): this looks like the same package as `huggingface_hub` above (pip treats
# `-` and `_` alike), so this call only upgrades it — confirm one of the two can go.
!pip install -U huggingface-hub --quiet
# datasets is the only dependency pinned to a specific version in this cell.
!pip install datasets==2.13 --quiet
# !pip install nlpaug

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from IPython.display import FileLink, FileLinks
from datasets import Dataset, load_dataset, concatenate_datasets
import sklearn
from sklearn.metrics import accuracy_score
import os
import numpy as np
import torch
import math
# from torch.utils.data import DataLoader
# import nlpaug.augmenter.word as naw

## **Notebook Login**

In [None]:
# Authenticate against the Hugging Face Hub without writing a token into the notebook.
# If the HF_TOKEN environment variable is set (e.g. through the platform's secret
# store) it is used for a non-interactive login; otherwise the interactive widget
# is shown and the token is typed in at run time.
import os

from huggingface_hub import login, notebook_login

hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

# **Dataset Loading**

In [None]:
# Fetch the train and test splits of the source corpus from the Hub (train first, then test).
train_dataset, test_dataset = (
    load_dataset("Yunij/tokenized_datasets", split=split_name)
    for split_name in ("train", "test")
)

# pandas copies of both splits, used for the column clean-up and text cleaning below.
df_train = train_dataset.to_pandas()
df_test = test_dataset.to_pandas()

In [None]:
# Rename 'label' to 'human/ai' and discard the tokenizer outputs that shipped with the
# dataset ('input_ids', 'attention_mask'); the text is re-tokenized later in the notebook.
df_train = (
    df_train
    .rename(columns={'label': 'human/ai'})
    .drop(columns=['input_ids', 'attention_mask'])
)
df_test = (
    df_test
    .rename(columns={'label': 'human/ai'})
    .drop(columns=['input_ids', 'attention_mask'])
)

# Class ids follow the order in which each source first appears in the training frame.
all_sources = df_train['source'].unique().tolist()
num_labels = len(all_sources)
id2label = dict(enumerate(all_sources))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Replace each source name with its integer class id.
# Series.map yields NaN for any value missing from label2id. That happens for a source
# that only occurs in the test split, and for every row if this cell is run a second
# time (the column then already holds ids, not names). Validate before assigning so
# the frames are never left half-encoded.
train_source_ids = df_train['source'].map(label2id)
test_source_ids = df_test['source'].map(label2id)

for split_name, source_ids in (("train", train_source_ids), ("test", test_source_ids)):
    n_unmapped = int(source_ids.isna().sum())
    if n_unmapped:
        raise ValueError(
            f"{n_unmapped} {split_name} rows have a 'source' value that is not in label2id "
            "(unseen source, or this cell was already executed)"
        )

df_train['source'] = train_source_ids
df_test['source'] = test_source_ids

# **Text Cleaning**

In [None]:
import re
import nltk
import string
import subprocess

nltk.download('stopwords')
# nltk.download('wordnet')
nltk.download('omw-1.4')

# The regular nltk.download('wordnet') route does not work on Kaggle, so the archive is
# downloaded into /kaggle/working/, unzipped by hand, and that directory is registered
# as an extra NLTK data path.
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # Only the "resource not found" signal is handled here; anything else
    # (KeyboardInterrupt, real I/O failures, ...) is allowed to propagate.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    subprocess.run(["unzip", "/kaggle/working/corpora/wordnet.zip", "-d", "/kaggle/working/corpora"])
    # Guarded so that re-running the cell does not append the same path again.
    if '/kaggle/working/' not in nltk.data.path:
        nltk.data.path.append('/kaggle/working/')

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
punctuations = string.punctuation
lemmatizer = WordNetLemmatizer()

# English stop words plus their letters-only spellings (doesn't --> doesnt, can't --> cant),
# because text_cleaning strips apostrophes before the stop-word filter runs.
# Stored as a set: the collection is probed once per token of every document, and a set
# gives constant-time membership tests with exactly the same members as the old
# de-duplicated list.
base_stopwords = stopwords.words('english')
stopword = set(base_stopwords) | {re.sub('[^a-z]', '', word) for word in base_stopwords}

In [None]:
def remove_urls(text):
    """Return `text` with every http(s):// link and every www.-prefixed address removed.

    A match runs up to the next whitespace character; the surrounding whitespace is kept.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    """Return `text` with every `<...>` span removed.

    The match is non-greedy, so each tag is removed on its own and the text between
    tags is preserved. A lone '<' with no closing '>' is left untouched.
    """
    return re.sub('<.*?>', '', text)

def text_cleaning(text):
    """Normalize one raw document into a lower-case, space-separated string of lemmas.

    Steps, in order: newlines become spaces; URLs and HTML tags are removed; hyphens
    become spaces (nearest-neighbor --> nearest neighbor); `$...$` formula spans are
    dropped; the text is lower-cased and everything except a-z and spaces is removed;
    single-letter words and stop words are discarded; the remaining words are lemmatized.

    Args:
        text: the raw document. Anything that is not a `str` (e.g. None or NaN coming
            from a missing cell) is treated as an empty document.

    Returns:
        The cleaned text, with words joined by single spaces and no leading or
        trailing whitespace. May be the empty string.
    """
    if not isinstance(text, str):
        return ""

    text = text.replace("\n", ' ')
    text = remove_urls(text)
    text = remove_html(text)
    text = text.replace("-", " ")
    text = re.sub(r"\$[^$]*\$", "", text)  # formulas written as $F = ma$
    text = re.sub('[^a-zA-Z ]', '', text.lower())

    # Single-letter words are filtered per token. The previous ' . ' regex consumed the
    # spaces around each match, so it skipped every second word in a run such as "a b c"
    # and never matched a single letter at the very start or end of the text.
    kept_words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if len(word) > 1 and word not in stopword
    ]
    return " ".join(kept_words)

In [None]:
# Add a 'cleaned_text' column to both frames by running text_cleaning over every raw text.
for frame in (df_train, df_test):
    frame['cleaned_text'] = frame['text'].apply(text_cleaning)

# Convert the frames back into datasets so they can be tokenized with Dataset.map.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# **Fine Tuning RoBERTa**

In [None]:
# roberta-base tokenizer and sequence classifier; the classifier is configured with the
# number of source classes and the id <-> name maps built earlier.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
def tokenize_text(examples):
    """Tokenize the 'cleaned_text' field of a batch, truncating each entry to 512 tokens.

    Returns whatever the module-level `tokenizer` produces for that batch.
    """
    cleaned_batch = examples["cleaned_text"]
    return tokenizer(cleaned_batch, truncation=True, max_length=512)

In [None]:
# Tokenize both splits in batched mode (train first, then test).
train_dataset, test_dataset = (
    split_data.map(tokenize_text, batched=True)
    for split_data in (train_dataset, test_dataset)
)

In [None]:
# Upload the cleaned, tokenized splits to the Hub so training can resume from them later.
for split_name, split_data in (("train", train_dataset), ("test", test_dataset)):
    split_data.push_to_hub(
        "hc3-wiki-cleaned-text-for-domain-classification-roberta-tokenized-max-len-512",
        split=split_name,
    )

# **Checkpoint for training** 

In [None]:
# Entry point for training: reload the cleaned, tokenized splits from the Hub.
train_dataset = load_dataset("rajendrabaskota/hc3-wiki-cleaned-text-for-domain-classification-roberta-tokenized-max-len-512", split="train")
test_dataset = load_dataset("rajendrabaskota/hc3-wiki-cleaned-text-for-domain-classification-roberta-tokenized-max-len-512", split="test")

df_train = Dataset.to_pandas(train_dataset)
df_test = Dataset.to_pandas(test_dataset)

# 'source' is used directly as the training label below, so class index i has to be
# paired with source id i. Sorting guarantees that; the previous first-appearance
# order only did so when the ids happened to show up in ascending order.
# NOTE(review): presumably 'source' is already integer-encoded in this Hub copy (the
# preparation cells map names to ids before pushing), so the original source names are
# not recoverable here and id2label maps each id to itself — confirm.
all_sources = sorted(df_train['source'].unique().tolist())
num_labels = len(all_sources)
id2label = {key: value for key, value in enumerate(all_sources)}
label2id = {value: key for key, value in id2label.items()}

# The training target is read from a column named 'label'.
train_dataset = train_dataset.rename_column("source", "label")
test_dataset = test_dataset.rename_column("source", "label")

# Expose only the model inputs and the target, as torch tensors.
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Tokenizer and classifier for the training run, both loaded from roberta-base; the
# classifier is configured with the label maps derived from the reloaded training split.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Per-class loss weights: 1 - (training rows of the class / total rows), ordered by class id
# so that position i in the vector belongs to class i.
# NOTE(review): the counts come from the training split only while the denominator is
# train + test rows — confirm that this is intended.
class_weights = (1 - (df_train["source"].value_counts().sort_index() / (len(df_train)+len(df_test)))).values
class_weights = torch.from_numpy(class_weights).float()
if torch.cuda.is_available():
    # Same placement as before when a GPU exists; on a CPU-only machine the tensor
    # simply stays on the CPU instead of raising.
    class_weights = class_weights.to("cuda")

class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch mapping; its "labels" entry is removed before the forward pass.
            return_outputs: when True, return `(loss, outputs)` instead of only `loss`.
            num_items_in_batch: accepted so that transformers releases which pass this
                keyword can call the override; it is not used.

        Returns:
            The loss tensor, or `(loss, outputs)` if `return_outputs` is True.
        """
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Keep the weight vector on the same device as the logits.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Training run configuration.
batch_size = 16
epochs = 1
output_dir = "hc3-wiki-domain-classification-roberta"
# Number of full batches in the training set. Not passed to TrainingArguments, which
# uses a fixed 400-step cadence instead.
logging_steps = len(train_dataset) // batch_size

step_interval = 400  # shared cadence for evaluation, logging and checkpointing
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="steps",
    eval_steps=step_interval,
    logging_strategy="steps",
    logging_steps=step_interval,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_steps=step_interval,
    save_total_limit=2,  # at most two checkpoints are kept in output_dir
)

In [None]:
# Metric history; compute_metrics appends to the test_* lists on every evaluation.
test_accuracy_scores = []
test_f1_scores = []
train_accuracy_scores = []
train_f1_scores = []

def compute_metrics(pred):
    """Score one evaluation pass and record the result in the module-level history lists.

    Args:
        pred: evaluation output exposing `label_ids` and `predictions`; the predicted
            class is the argmax over the last axis of `predictions`.

    Returns:
        dict with 'accuracy' and 'f1_score' (micro-averaged F1).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    # NOTE(review): for single-label multiclass data micro-averaged F1 coincides with
    # accuracy; consider average='macro' if a per-class view is wanted.
    f1 = sklearn.metrics.f1_score(labels, preds, average='micro')
    test_accuracy_scores.append(accuracy)
    test_f1_scores.append(f1)

    return {'accuracy': accuracy, 'f1_score': f1}

In [None]:
# Wire the model, data, tokenizer and metric callback into the weighted-loss trainer.
# The test split doubles as the evaluation set during training.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Launch fine-tuning with the trainer and arguments configured in the cells above.
trainer.train()

In [None]:
# Render download links for the files of one saved checkpoint.
# The step number is fixed by hand and must match a checkpoint that exists in the output directory.
FileLinks("hc3-wiki-domain-classification-roberta/checkpoint-10200")

In [None]:
# Upload the trained model. The string is the commit message of the push
# (Trainer.push_to_hub takes the commit message as its first argument), not a repository name.
trainer.push_to_hub(commit_message="hc3-wiki-domain-classification-roberta-1-epoch")

# **Result and Inference**

In [None]:
# Inference setup: load the fine-tuned classifier and its tokenizer from the Hub.
model_ckpt = "rajendrabaskota/hc3-wiki-domain-classification-roberta"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

# A plain Trainer with default arguments; further down it is only used for predict().
trainer = Trainer(tokenizer=tokenizer, model=model)

In [None]:
# Reload the tokenized splits for evaluation (train first, then test).
train_dataset, test_dataset = (
    load_dataset(
        "rajendrabaskota/hc3-wiki-cleaned-text-for-domain-classification-roberta-tokenized-max-len-512",
        split=split_name,
    )
    for split_name in ("train", "test")
)

# pandas copies that will receive the prediction columns.
df_train = train_dataset.to_pandas()
df_test = test_dataset.to_pandas()

In [None]:
# Mean perplexity of the training rows per source, as a {source: mean} dict.
mean_perplexities = df_train.groupby('source')['perplexity'].mean().to_dict()

In [None]:
# Display the {source: mean perplexity} mapping computed from the training split.
mean_perplexities

In [None]:
# Unweighted average of the per-source mean perplexities.
# The builtin sum() is used rather than an accumulator named `sum`, which would shadow
# the builtin for every later cell, and the divisor follows the number of sources
# instead of a hard-coded 6.
per_source_means = list(mean_perplexities.values())
print(sum(per_source_means) / len(per_source_means) if per_source_means else 0.0)

In [None]:
# Re-key the per-source means by the label names stored in the model config.
id2label = model.config.id2label
mean = {id2label[source_id]: avg for source_id, avg in mean_perplexities.items()}

print(mean)

In [None]:
def calculate_metrics(df, dataset):
    """Predict a source per row, threshold perplexity on it, and score against 'human/ai'.

    A row is labelled 1 when its perplexity is at or below the mean training perplexity
    of its *predicted* source (looked up in the module-level `mean_perplexities`),
    otherwise 0.

    Args:
        df: frame with 'perplexity' and 'human/ai' columns, row-aligned with `dataset`.
            It is modified in place: 'predicted_source' and 'predicted_label' columns
            are added, and later cells read them.
        dataset: the dataset handed to the module-level `trainer` for prediction.

    Returns:
        (accuracy, f1) of 'predicted_label' against 'human/ai'; F1 is binary-averaged.
    """
    predictions, _, _ = trainer.predict(dataset)
    df['predicted_source'] = np.argmax(predictions, axis=-1)

    # Vectorized lookup and comparison instead of a Python-level apply over every row.
    thresholds = df['predicted_source'].map(mean_perplexities)
    df['predicted_label'] = (df['perplexity'] <= thresholds).astype(int)

    accuracy = accuracy_score(df['human/ai'], df['predicted_label'])
    f1_score = sklearn.metrics.f1_score(df['human/ai'], df['predicted_label'], average='binary')

    return accuracy, f1_score

In [None]:
# Score both splits. Each call also adds the prediction columns to the frame it is given.
train_accuracy, train_f1 = calculate_metrics(df=df_train, dataset=train_dataset)
print(f"Train Accuracy: {train_accuracy}, Train F1: {train_f1}")

test_accuracy, test_f1 = calculate_metrics(df=df_test, dataset=test_dataset)
print(f"Test Accuracy: {test_accuracy}, Test F1: {test_f1}")

In [None]:
def compute_metrics_source_wise(df, num_sources=6):
    """Score the human/AI predictions separately for each true source.

    Args:
        df: frame with 'source', 'human/ai' and 'predicted_label' columns.
        num_sources: number of source ids to report; ids 0 .. num_sources - 1 are
            evaluated. Defaults to 6, the value that used to be hard-coded.

    Returns:
        (accuracies, f1_scores): two lists in which position i holds the metric for the
        rows whose 'source' equals i. F1 is binary-averaged.
    """
    accuracies = []
    f1_scores = []
    for source_id in range(num_sources):
        rows = df[df['source'] == source_id]
        accuracies.append(accuracy_score(rows['human/ai'], rows['predicted_label']))
        f1_scores.append(
            sklearn.metrics.f1_score(rows['human/ai'], rows['predicted_label'], average='binary')
        )

    return accuracies, f1_scores

In [None]:
# Per-source accuracy and F1 for each split; the frames must already carry 'predicted_label'.
train_accuracies, train_f1_scores = compute_metrics_source_wise(df=df_train)
test_accuracies, test_f1_scores = compute_metrics_source_wise(df=df_test)

In [None]:
# Display the per-source training metrics as (accuracies, f1_scores), each indexed by source id.
train_accuracies, train_f1_scores

In [None]:
# Display the per-source test metrics as (accuracies, f1_scores), each indexed by source id.
test_accuracies, test_f1_scores

In [None]:
# Key the per-source metric lists by label name.
# The names come from the loaded model's config, ordered by class id, so this cell works
# when the inference section is run on its own. It no longer depends on `all_sources`,
# which is only defined by the data-preparation and training cells.
source_names = [name for _, name in sorted(model.config.id2label.items())]

final_train_acc = dict(zip(source_names, train_accuracies))
final_train_f1 = dict(zip(source_names, train_f1_scores))
final_test_acc = dict(zip(source_names, test_accuracies))
final_test_f1 = dict(zip(source_names, test_f1_scores))

In [None]:
# Print the four per-source summaries, one line each.
for title, scores in (
    ("Train Accuracies", final_train_acc),
    ("Train F1", final_train_f1),
    ("Test Accuracies", final_test_acc),
    ("Test F1", final_test_f1),
):
    print(f"{title}: {scores}")