In [1]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import pipeline
import pandas as pd
import numpy as np
import torch

classifier = pipeline('text-classification', model='nickmuchi/distilbert-base-movie-genre-prediction')

In [3]:

df = pd.read_csv('/Users/sandundesilva/Documents/4th year/Research Project/UI/findMyFilm/flask-server/Models/final/GenreDataset/bert_train.csv')


le = LabelEncoder()
df['genre'] = le.fit_transform(df['genre'])


train, validate, test = np.split(df.sample(frac=1, random_state=42), [int(.6*len(df)), int(.8*len(df))])

train = train.reset_index()[['synopsis','genre']]
test = test.reset_index()[['synopsis','genre']]
validate = validate.reset_index()[['synopsis','genre']]


train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)
validation_dataset = Dataset.from_pandas(validate)

dataset = DatasetDict()
dataset['train'] = train_dataset
dataset['test'] = test_dataset
dataset['validation'] = validation_dataset


In [4]:
from imblearn.over_sampling import RandomOverSampler


from sklearn.utils import shuffle


train = shuffle(train)


X_train = train['synopsis']
y_train = train['genre']


ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train.values.reshape(-1, 1), y_train)


train_resampled = pd.DataFrame(X_train_resampled, columns=['synopsis'])
train_resampled['genre'] = y_train_resampled

train_dataset = Dataset.from_pandas(train_resampled)


In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['synopsis', 'genre'],
        num_rows: 32400
    })
    test: Dataset({
        features: ['synopsis', 'genre'],
        num_rows: 10800
    })
    validation: Dataset({
        features: ['synopsis', 'genre'],
        num_rows: 10800
    })
})

In [6]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [7]:
def tokenize(batch):
    return tokenizer(batch["synopsis"], padding=True, truncation=True)

In [8]:
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/32400 [00:00<?, ? examples/s]

Map:   0%|          | 0/10800 [00:00<?, ? examples/s]

Map:   0%|          | 0/10800 [00:00<?, ? examples/s]

In [9]:
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [10]:
def extract_hidden_states(batch):

    inputs = {k:v.to(device) for k,v in batch.items() 
              if k in tokenizer.model_input_names}

    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state

    return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

In [11]:
dataset_encoded.set_format("torch", 
                            columns=["input_ids", "attention_mask", "genre"])

In [12]:
dataset_hidden = dataset_encoded.map(extract_hidden_states, batched=True)

Map:   0%|          | 0/32400 [00:00<?, ? examples/s]

Map:   0%|          | 0/10800 [00:00<?, ? examples/s]

Map:   0%|          | 0/10800 [00:00<?, ? examples/s]

In [13]:
dataset_hidden["train"].column_names

['synopsis', 'genre', 'input_ids', 'attention_mask', 'hidden_state']

In [60]:
num_labels = 2
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

In [14]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [15]:
batch_size = 64
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False, 
                                  log_level="error")

In [16]:

trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["validation"],
                  tokenizer=tokenizer)

In [64]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6752,0.655287,0.599138,0.44895
2,0.6376,0.602318,0.689655,0.634082
3,0.573,0.558319,0.75,0.736929
4,0.4964,0.528356,0.767241,0.766808
5,0.4315,0.553471,0.74569,0.733015
6,0.4092,0.540427,0.75431,0.752524
7,0.3446,0.555217,0.767241,0.760164
8,0.3501,0.554095,0.771552,0.76738
9,0.3047,0.562684,0.75431,0.748193
10,0.28,0.563908,0.758621,0.753029


TrainOutput(global_step=110, training_loss=0.43623759746551516, metrics={'train_runtime': 14.8718, 'train_samples_per_second': 468.0, 'train_steps_per_second': 7.397, 'total_flos': 84634248922560.0, 'train_loss': 0.43623759746551516, 'epoch': 10.0})

In [17]:
trainer.save_model("./genrepredict")


In [67]:
from transformers import pipeline

classifier = pipeline('text-classification', model='./genrepredict')

result = classifier("i will kill you")
print(result)

[{'label': 'LABEL_1', 'score': 0.6755982041358948}]
