# Translator Classification - multilingual BERT + Sentences (instead of paragraphs)

In [None]:
import os
# Expose only GPUs 2 and 3 to this process. This is set before torch is
# imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

In [None]:
import torch

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

print(device)

In [None]:
# Sanity check: echo whether torch reports a usable CUDA device.
torch.cuda.is_available()

In [None]:
import glob

# Corpus root and its sub-folders.
data_path = 'russian_lit_data'
copy_path = 'copyrighted'
noncopy_path = 'uncopyrighted'
rus_path = 'russian'  # defined here but not used in this cell

# Gather every .txt translation, copyrighted ones first.
copy_files = glob.glob('/'.join((data_path, copy_path, '*.txt')))
noncopy_files = glob.glob('/'.join((data_path, noncopy_path, '*.txt')))
files = [*copy_files, *noncopy_files]

In [None]:
import nltk
# Download the Punkt data needed by nltk.tokenize.sent_tokenize, which is
# called when the books are split into sentences below.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead - confirm against the installed NLTK version.
nltk.download('punkt')

In [None]:
# Build two maps from translator name to that translator's sentences:
#   translator_to_sents          - sentences from all non-holdout books
#   translator_to_sents_holdout  - sentences from the holdout books
# File names are parsed as <BookName>_<Translator>.<ext>.
#
# Holdout books and the translators they exist for:
#   NotesFromUnderground - Katz, PV, Garnett
#   PoorFolk - McDuff, Hogarth, Garnett

# Paragraphs and sentences must be longer than this many characters to be kept.
MIN_CHARS = 40

translator_to_sents = {}
translator_to_sents_holdout = {}
holdout_books = ['PoorFolk', 'NotesFromUnderground']
for f in files:
    # Keep the text before the first '.', as before, but strip the directory
    # with os.path so the parsing does not rely on '/' being the separator.
    book_title = os.path.basename(f).split('.')[0]
    title_parts = book_title.split('_')
    book_name = title_parts[0]
    translator = title_parts[1]

    # Every translator gets an entry in both maps, even if one stays empty.
    translator_to_sents.setdefault(translator, [])
    translator_to_sents_holdout.setdefault(translator, [])

    if book_name in holdout_books:
        target = translator_to_sents_holdout
    else:
        target = translator_to_sents

    # Decode explicitly as UTF-8 rather than with the platform default, and
    # close the file before the (slower) tokenization starts.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(f, "r", encoding="utf-8") as fp:
        book_text = fp.read()

    for par in book_text.split('\n'):
        if len(par) <= MIN_CHARS:  # and len(par) < int(490/2):
            continue
        par_sents = nltk.tokenize.sent_tokenize(par)
        target[translator].extend(s for s in par_sents if len(s) > MIN_CHARS)

In [None]:
# Print the number of kept sentences per translator, plus a total, for the
# training map and then for the holdout map.
for heading, sents_by_translator in (
    ('Train', translator_to_sents),
    ('\nHoldout', translator_to_sents_holdout),
):
    print(heading)
    for name, sent_list in sents_by_translator.items():
        print(name, len(sent_list))
    total = sum(len(sent_list) for sent_list in sents_by_translator.values())
    print('Total', total)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the translator-name -> integer-id encoder on every translator seen in
# the corpus, then print the ids of five translators as a spot check.
le = LabelEncoder().fit(list(translator_to_sents))
print(le.transform(["Garnett", "McDuff", "PV", "Katz", "Hogarth"]))

In [None]:
# Flatten each translator -> sentences map into a list with one record per
# sentence, shaped like:
#   {'idx': 0, 'labels': 1, 'sentence': "Our friends won't buy this analysis, let alone the next one we propose."}
# 'idx' counts from 0 within each list; 'labels' is the encoder id of the
# sentence's translator.
def _to_records(sents_by_translator):
    """Return one {'idx', 'labels', 'sentence'} dict per sentence in the map."""
    records = []
    for name, sent_list in sents_by_translator.items():
        class_id = le.transform([name])[0]
        for sentence in sent_list:
            records.append(
                {'idx': len(records), 'labels': class_id, 'sentence': sentence}
            )
    return records


data_list = _to_records(translator_to_sents)
data_list_holdout = _to_records(translator_to_sents_holdout)

print(data_list[:10])

In [None]:
import pandas as pd

# One row per sentence record; the columns come from the record keys
# ('idx', 'labels', 'sentence').
df = pd.DataFrame(data_list)
df_holdout = pd.DataFrame(data_list_holdout)
# Preview the first rows of the non-holdout frame.
df.head()

In [None]:
# from sklearn.utils import shuffle
# df = shuffle(df)
# df = df.head(100000)

# Print both shapes explicitly: a notebook cell only echoes its last bare
# expression, so a bare `df.shape` followed by `df_holdout.shape` never
# shows the first value.
print('df shape:', df.shape)
print('df_holdout shape:', df_holdout.shape)

In [None]:
from sklearn.model_selection import train_test_split

# train_texts = df['sentence'].values.tolist()
# train_labels = df['labels'].values.tolist()
# The holdout books form the test set; every holdout sentence is used.
test_texts = df_holdout['sentence'].values.tolist()
test_labels = df_holdout['labels'].values.tolist()

# Keep a stratified 55% sample of the remaining sentences for training; the
# other part returned by train_test_split is discarded. The split is seeded
# so that re-running the notebook selects the same training sentences.
SPLIT_SEED = 42
train_texts, _, train_labels, _ = train_test_split(
    df['sentence'].values.tolist(), df['labels'].values.tolist(),
    stratify=df['labels'], train_size=0.55, shuffle=True,
    random_state=SPLIT_SEED,
)

print('train size: ', len(train_labels))
print('test size: ', len(test_labels))

# Re-pack both splits as {'label', 'text'} records.
sentences = {
    'train': [
        {'label': label, 'text': text}
        for text, label in zip(train_texts, train_labels)
    ],
    'test': [
        {'label': label, 'text': text}
        for text, label in zip(test_texts, test_labels)
    ],
}

print(sentences['train'][4])

In [None]:
# Name of the pretrained checkpoint. The tokenizer is loaded through this
# constant instead of repeating the string literal.
BERT_MODEL = "bert-base-multilingual-cased"

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

In [None]:
import datasets

# Turn each split's {'label', 'text'} records into a DataFrame and wrap it
# as a datasets.Dataset.
train_frame = pd.DataFrame(data=sentences['train'])
test_frame = pd.DataFrame(data=sentences['test'])
train_dataset = datasets.Dataset.from_pandas(train_frame)
test_dataset = datasets.Dataset.from_pandas(test_frame)

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch with truncation enabled.

    No padding argument is passed here; a DataCollatorWithPadding is
    created separately further down.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Tokenize both splits; batched=True is passed so the function receives
# batches of rows.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

# Padding collator built from the tokenizer; passed to the Trainer as
# data_collator further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate
import numpy as np

# Accuracy metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy metric for a (predictions, labels) pair.

    The index of the largest value along axis 1 of the predictions is
    taken as the predicted class id and compared with the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Two-way lookup between translator names and their encoder ids.
label_list = list(translator_to_sents)
id_list = le.transform(label_list)

# .tolist() converts the NumPy scalars returned by the encoder to plain
# Python ints, so the dicts print cleanly and can be JSON-serialized. Plain
# ints hash and compare equal to the NumPy ids, so lookups with either work.
id2label = {}
label2id = {}
for name, class_id in zip(label_list, id_list.tolist()):
    id2label[class_id] = name
    label2id[name] = class_id

print(id2label)
print(label2id)

In [None]:
# Switch for the fine-tuning path: the cells guarded by `if fine_tune:`
# create the model, build the Trainer, train and evaluate.
fine_tune = False

In [None]:
from numpy import mean
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# device (turn on GPU acceleration for faster execution)
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

if fine_tune:
    # Load the pretrained checkpoint named by BERT_MODEL (the same constant
    # the tokenizer cell defines) with one output label per translator, and
    # move it to the selected device.
    model = AutoModelForSequenceClassification.from_pretrained(
        BERT_MODEL, num_labels=len(translator_to_sents)
    )
    model.to(device)

In [None]:
# Training hyperparameters read by the TrainingArguments cells below
# (lr and epochs are also logged in the wandb config).
lr = 2e-5
epochs = 10
batch_size = 16

In [None]:
import wandb

# Export the W&B settings before wandb.init(): variables set only after the
# run has started cannot influence that run.
os.environ["WANDB_PROJECT"] = "translator-classification"
os.environ["WANDB_NOTEBOOK_NAME"] = "translator_classification"

# NOTE(review): the run is started unconditionally, but wandb.finish() is
# only called in the `if fine_tune:` evaluation cell - confirm whether a run
# is wanted when fine_tune is False.
run = wandb.init(
    # Set the project where this run will be logged
    project="translator-classification",
    # Track hyperparameters and run metadata
    config={
        "learning_rate": lr,
        "epochs": epochs,
        "batch_size": batch_size,
    },
)

In [None]:
if fine_tune:
    # Evaluate and checkpoint once per epoch, reload the best checkpoint when
    # training ends, and report to wandb.
    # NOTE(review): eval_dataset is the holdout test set, so the checkpoint
    # picked by load_best_model_at_end is selected on test data - confirm
    # this is intended.
    training_kwargs = dict(
        output_dir="/trunk/kkatsy/classification_10epochs_holdout",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="wandb",
    )
    training_args = TrainingArguments(**training_kwargs)

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer = Trainer(**trainer_kwargs)

In [None]:
# Run training only on the fine-tuning path; `trainer` is the one built in
# the `if fine_tune:` cell above.
if fine_tune:
    trainer.train()

In [None]:
if fine_tune:
    # Keep and print the evaluation result: an expression inside an `if`
    # block is not echoed by the notebook, so the value returned by
    # trainer.evaluate() would otherwise be discarded.
    eval_metrics = trainer.evaluate()
    print(eval_metrics)
    # Close the wandb run once evaluation is done.
    wandb.finish()

In [None]:
# Switch for the inference path: the cells guarded by `if load_tuned:` load
# a saved checkpoint and rebuild `trainer` around it.
# NOTE(review): `trainer` is only created under fine_tune or load_tuned, so
# the predict cell below needs at least one of the two flags to be True.
load_tuned = True

In [None]:
if load_tuned:
    # Load the model from the saved checkpoint directory and move it to the
    # selected device.
    checkpoint_dir = "/trunk/kkatsy/classification_10epochs_holdout/checkpoint-10512"
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
    model.to(device)

In [None]:
if load_tuned:
    # Wrap the loaded model in a Trainer configured with the same arguments
    # as the fine-tuning cell; the cells below only call predict() on it.
    training_kwargs = dict(
        output_dir="/trunk/kkatsy/classification_10epochs_holdout",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="wandb",
    )
    training_args = TrainingArguments(**training_kwargs)

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer = Trainer(**trainer_kwargs)

In [None]:
# Run the trainer's model over the tokenized holdout set and unpack the
# result into predictions, labels and metrics.
# NOTE: the name `metrics` is rebound to the sklearn.metrics module by the
# confusion-matrix cell's import further down.
predictions, labels, metrics = trainer.predict(tokenized_test, metric_key_prefix="predict")

In [None]:
# Predicted class id per example: index of the largest value along the last
# axis of the predictions.
preds = np.argmax(predictions, axis=-1)

In [None]:
# Map predicted class ids to translator names, then count how many times
# each known translator was predicted (translators never predicted get 0).
translators = [id2label[class_id] for class_id in preds]
pred_count = {name: translators.count(name) for name in label2id}

pred_count

In [None]:
import matplotlib.pyplot as plt


# Bar chart of how many holdout sentences were assigned to each translator.
fig = plt.figure(figsize=(10, 5))
plt.bar(list(pred_count.keys()), list(pred_count.values()), width=0.4)
plt.title("Predicted Distribution")
plt.show()

In [None]:
# Count the true holdout labels per translator, keyed by translator name
# (translators absent from the holdout set get 0).
true_count = {
    id2label[class_id]: test_labels.count(class_id) for class_id in id2label
}

true_count

In [None]:
# Bar chart of the true number of holdout sentences per translator.
fig2 = plt.figure(figsize=(10, 5))
plt.bar(list(true_count.keys()), list(true_count.values()), width=0.4)
plt.title("Actual Distribution")
plt.show()

In [None]:
# confusion matrix
import matplotlib.pyplot as plt
import numpy
from sklearn import metrics

# Pin the class order explicitly so matrix rows/columns and tick labels line
# up. The classes are the sorted ids that occur in the true or predicted
# labels, and the tick labels are looked up from those same ids. Passing the
# label2id dict as display_labels would iterate its keys in insertion order,
# which need not match the sorted id order of the matrix.
class_ids = np.union1d(labels, preds)
confusion_matrix = metrics.confusion_matrix(
    labels, preds, labels=class_ids, normalize='true'
)

cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=[id2label[class_id] for class_id in class_ids],
)
cm_display.plot()
plt.show()