In [1]:
from transformers import BertModel, BertTokenizer, BertConfig, BertForSequenceClassification
import os
import pandas as pd
import torch
import transformers
import time
import datetime
import numpy as np
import random
import os
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

In [2]:
# Seed every random number generator this notebook relies on (Python's
# `random`, NumPy, PyTorch on CPU and PyTorch on all CUDA devices) with the
# same value so that repeated runs draw the same numbers.
seed_val = 42

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)


In [3]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Dataset registry. The functions in this notebook read `datasets[0]`, so the
# first entry is the active one. The position of a label inside 'classes' is
# the integer class id used when the label tensor is built.
datasets = [
    dict(
        name="Stanford treebank",
        prefix="stanford_treebank",
        train_path="/data/sam/stanford_treebank/sst_train.csv",
        dev_path="/data/sam/stanford_treebank/sst_dev.csv",
        test_path="/data/sam/stanford_treebank/sst_test.csv",
        classes=['neg', 'pos'],
    ),
    # Disabled alternative dataset, kept for reference:
    # dict(
    #     name="Reddit Dataset",
    #     prefix="reddit_dataset",
    #     train_path="/data/rajat/ex_ml/reddit/input/train.csv",
    #     dev_path="/data/rajat/ex_ml/reddit/input/dev.csv",
    #     test_path="/data/rajat/ex_ml/reddit/input/test.csv",
    #     classes=[0, 1],
    # ),
]

0

0      1
1      1
2      1
3      1
4      1
      ..
867    0
868    0
869    0
870    0
871    0
Name: classification, Length: 872, dtype: int64

In [28]:
def get_output_dir_name(parameters):
    """Build the relative directory name used to store a run's artefacts.

    The name combines the prefix of the active dataset (``datasets[0]``) with
    the learning rate and the number of epochs of the run.

    Args:
        parameters: Mapping with a ``'learning_rate'`` entry and, normally,
            an ``'epochs'`` entry.

    Returns:
        A string such as ``'./outputs_stanford_treebank_lr=2e-05_epochs=2'``.
    """
    # Take the epoch count from the parameters that describe the run so the
    # directory name cannot drift from them; fall back to the module-level
    # `epochs` only for parameter dicts that do not carry the entry.
    if 'epochs' in parameters:
        num_epochs = parameters['epochs']
    else:
        num_epochs = epochs
    return './outputs_{}_lr={}_epochs={}'.format(
        datasets[0]['prefix'], parameters['learning_rate'], num_epochs
    )

def read_data(dataset=None):
    """Load the train, dev and test CSV files of a dataset.

    Args:
        dataset: Mapping with ``'train_path'``, ``'dev_path'`` and
            ``'test_path'`` entries. Defaults to the active dataset,
            ``datasets[0]``.

    Returns:
        A ``(train, dev, test)`` tuple of DataFrames. The first CSV column is
        used as the index of each frame.
    """
    if dataset is None:
        dataset = datasets[0]
    train_df, dev_df, test_df = (
        pd.read_csv(dataset[path_key], index_col=0)
        for path_key in ('train_path', 'dev_path', 'test_path')
    )
    return train_df, dev_df, test_df
# Load the three splits once; later cells build their data loaders from
# `train_data`, `dev` and `test`. The training frame is called `train_data`
# because `train` is the name of the training function.
train_data, dev, test = read_data()

In [5]:
def encode(data, tokenizer=None, max_length=128, **kwargs):
    """Tokenize texts into fixed-length model input tensors.

    Args:
        data: Iterable of strings to tokenize.
        tokenizer: Object exposing ``encode_plus``. When omitted, the
            'bert-base-uncased' ``BertTokenizer`` is loaded on first use and
            reused afterwards (it is no longer loaded when the function is
            defined).
        max_length: Length every sequence is padded to.
        **kwargs: Accepted and ignored, so callers can forward extra options.

    Returns:
        A tuple ``(input_ids, token_type_ids, attention_mask)`` of
        ``torch.long`` tensors with one row per input text.

    Raises:
        Whatever the tokenizer raises. The offending text is printed first
        and the error is then propagated, so a failure can never produce
        tensors that are shorter than the label list they are paired with.
    """
    if tokenizer is None:
        tokenizer = getattr(encode, '_default_tokenizer', None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            encode._default_tokenizer = tokenizer

    input_ids = []
    token_type_ids = []
    attention_mask = []
    for text in data:
        try:
            tokenized = tokenizer.encode_plus(text,
                                              max_length=max_length,
                                              add_special_tokens=True,
                                              pad_to_max_length=True,
                                              padding_side='right',
                                              return_token_type_ids=True,
                                              return_attention_mask=True)
        except Exception as exc:
            # Keep the diagnostic print, but do not continue with partial data.
            print(exc, text)
            raise
        input_ids.append(tokenized['input_ids'])
        token_type_ids.append(tokenized['token_type_ids'])
        attention_mask.append(tokenized['attention_mask'])
    return (
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(token_type_ids, dtype=torch.long),
        torch.tensor(attention_mask, dtype=torch.long),
    )

In [18]:
def get_batches(df, batch_size=4, shuffle=True, **kwargs):
    """Turn a DataFrame of labelled texts into a DataLoader of tensors.

    Args:
        df: DataFrame with a 'text' column and a 'classification' column
            whose values are members of ``datasets[0]['classes']``.
        batch_size: Number of examples per batch.
        shuffle: When True (the default) batches are drawn in random order;
            pass False to iterate the rows in their original order, e.g. for
            evaluation.
        **kwargs: Forwarded to ``encode`` (for instance ``tokenizer``).

    Returns:
        A DataLoader yielding ``(input_ids, token_type_ids, attention_mask,
        labels)`` batches.

    Raises:
        ValueError: If a label is not listed in ``datasets[0]['classes']``.
    """
    class_names = datasets[0]['classes']
    texts = list(df['text'].values)
    # Labels are kept on the CPU like the encoded inputs; the training and
    # evaluation loops move every tensor of a batch to the device themselves.
    label_ids = torch.tensor(
        [class_names.index(label) for label in df['classification']],
        dtype=torch.long,
    )
    input_ids, token_type_ids, attention_mask = encode(texts, **kwargs)
    tensor_dataset = torch.utils.data.TensorDataset(input_ids, token_type_ids, attention_mask, label_ids)
    if shuffle:
        sampler = torch.utils.data.RandomSampler(tensor_dataset)
    else:
        sampler = torch.utils.data.SequentialSampler(tensor_dataset)
    return torch.utils.data.DataLoader(tensor_dataset, sampler=sampler, batch_size=batch_size)

In [7]:
def train(batch, epochs=2):
    """Fine-tune the module-level ``model`` on the batches of a data loader.

    Uses the module-level ``model``, ``optimizer``, ``scheduler``, ``device``
    and ``parameters['max_grad_norm']``. The loss of every 100th batch of an
    epoch is printed.

    Args:
        batch: Iterable of ``(input_ids, token_type_ids, attention_mask,
            labels)`` tensor tuples, e.g. a DataLoader.
        epochs: Number of passes over ``batch``.
    """
    model.train()
    for _ in range(epochs):
        for step, batch_cpu in enumerate(batch):
            input_ids, token_type_ids, attention_mask, labels = (t.to(device) for t in batch_cpu)
            outputs = model(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            # Index instead of unpacking a fixed number of values: how many
            # outputs the model returns depends on its configuration, but the
            # loss comes first whenever labels are supplied.
            loss = outputs[0]
            if step % 100 == 0:
                print("loss - {0}".format(loss.item()))
            # Clear gradients left from the previous step before backprop.
            model.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), parameters['max_grad_norm'])
            optimizer.step()
            scheduler.step()

In [8]:
#batch_dev = get_batches(dev, batch_size=1, tokenizer=tokenizer)
#batch_train = get_batches(train_data, batch_size=8, tokenizer=tokenizer)
#batch_test = get_batches(test, batch_size=1, tokenizer=tokenizer)

In [19]:
# One tokenizer instance is shared by all three loaders; save() later writes
# it next to the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Dev and test are served one example per batch, training in batches of 8.
batch_dev = get_batches(dev, batch_size=1, tokenizer=tokenizer)
batch_train = get_batches(train_data, batch_size=8, tokenizer=tokenizer)
batch_test = get_batches(test, batch_size=1, tokenizer=tokenizer)

epochs=2
# Hyper-parameters of the run. train() reads 'max_grad_norm' from this dict
# and save() passes the dict to get_output_dir_name().
parameters = {
    'learning_rate': 2e-5,
    # NOTE(review): warmup spans 1000 scheduler steps out of
    # len(batch_train) * epochs in total - confirm this share is intended.
    'num_warmup_steps': 1000,
    # train() steps the scheduler once per batch, so the schedule covers
    # every batch of every epoch.
    'num_training_steps': len(batch_train) * epochs,
    'max_grad_norm': 1,
    'epochs': epochs
}
# Two-label sequence classifier on top of the pretrained encoder; hidden
# states and attention weights are requested as additional model outputs.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_hidden_states=True, output_attentions=True)
#model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True, output_attentions=True)
model.to(device)
# NOTE(review): correct_bias=False presumably mirrors the optimizer of the
# original BERT implementation - confirm.
optimizer = transformers.AdamW(model.parameters(), lr=parameters['learning_rate'], correct_bias=False)
# Learning rate rises linearly during warmup and then decays linearly.
scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                         num_warmup_steps=parameters['num_warmup_steps'],
                                                         num_training_steps=parameters['num_training_steps'])


In [20]:
def evaluate(batch):
    """Run the module-level ``model`` over a data loader without gradients.

    The model is switched to evaluation mode and left in that mode.

    Args:
        batch: Iterable of ``(input_ids, token_type_ids, attention_mask,
            labels)`` tensor tuples, e.g. a DataLoader.

    Returns:
        A list with one 6-tuple per batch:
        ``(loss, logits, labels, input_ids, None, None)``, all tensors on the
        CPU. The last two slots are placeholders; hidden states and attention
        weights are not retained.
    """
    results = []
    model.eval()
    with torch.no_grad():
        for batch_cpu in batch:
            input_ids, token_type_ids, attention_mask, labels = (t.to(device) for t in batch_cpu)
            outputs = model(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            # Index instead of unpacking a fixed number of values: the count
            # of model outputs depends on its configuration, while loss and
            # logits come first whenever labels are supplied.
            loss, logits = outputs[0], outputs[1]
            results.append((
                loss.detach().cpu(),
                logits.detach().cpu(),
                labels.detach().cpu(),
                input_ids.detach().cpu(),
                None,
                None,
            ))
    return results

In [21]:
# Fine-tune the model on the training loader for the configured number of
# epochs; the loss of every 100th batch is printed below.
train(batch_train, epochs=epochs)

loss - 0.7048430442810059
loss - 0.6011022329330444
loss - 0.3259471654891968
loss - 0.09624850004911423
loss - 0.33943769335746765
loss - 0.1105036810040474
loss - 0.013818517327308655
loss - 0.45614898204803467
loss - 0.2383934110403061
loss - 0.005538195371627808
loss - 0.004141062498092651
loss - 0.03702165186405182
loss - 0.007148325443267822
loss - 0.8673495054244995
loss - 0.003480762243270874
loss - 0.0038371384143829346
loss - 0.005284488201141357
loss - 0.0041169822216033936


In [27]:
def get_prediction(results):
    """Flatten evaluation results into label lists and report the error count.

    Args:
        results: List of 6-tuples as produced by ``evaluate``; only the
            logits (second item) and labels (third item) are used.

    Returns:
        A tuple ``(true_labels, predictions)`` of flat lists of class ids, in
        the order the batches appear in ``results``.
    """
    predictions = []
    true_labels = []
    for result in results:
        _, logits, labels, _, _, _ = result
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())
    # Compare every example, not only the first one of each batch, so the
    # count is also right for batch sizes above one.
    wrong = sum(1 for truth, guess in zip(true_labels, predictions) if truth != guess)
    print("wrong: {}".format(wrong))
    return true_labels, predictions

## Evaluation for Reddit data (dev split, then test split)

In [100]:
# Score the dev loader; the per-batch results are consumed by the next cell.
results = evaluate(batch_dev)


In [101]:
# Turn the per-batch results into flat label lists and report dev metrics.
y_true, y_pred = get_prediction(results)
target_names = ["negative", "positive"]
# The confusion matrix gets its own name so that the report assigned to
# `classification` below does not replace it before it is shown.
conf_matrix = confusion_matrix(y_true, y_pred)
precision, recall, fbeta_scorefloat, support = precision_recall_fscore_support(y_true, y_pred)
classification = classification_report(y_true, y_pred, target_names=target_names, digits=4)
print(conf_matrix)
print(precision, recall, fbeta_scorefloat, support)
print(classification)

[1] [0] [CLS] treasure is fucking you [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[1] [0] [CLS] stop whining chris [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [102]:
# Score the test loader and print the per-class report with 4 decimals.
target_names = ["negative", "positive"]
results = evaluate(batch_test)
y_true, y_pred = get_prediction(results)
classification = classification_report(
    y_true,
    y_pred,
    target_names=target_names,
    digits=4,
)
print(classification)

[1] [0] [CLS] not to sound racist but seeing black people portrayed this was makes me so happy than how the news portrays black people [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[1] [0] [CLS] lol what a shit [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [31]:
def save():
    """Write the module-level ``model`` and ``tokenizer`` to the run directory.

    The directory name comes from ``get_output_dir_name(parameters)``. It is
    created when missing, and the path is printed before saving.
    """
    output_dir = get_output_dir_name(parameters)
    # exist_ok replaces the separate existence check, so it does not matter
    # whether the directory appears between checking and creating.
    os.makedirs(output_dir, exist_ok=True)
    print("Saving model to {}".format(output_dir))
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [108]:
# Persist the fine-tuned model and its tokenizer for this run.
save()

Saving model to {0} ./outputs_reddit_lr=2e-05_epochs=2


## Evaluation for Stanford Treebank (dev split, then test split)

In [25]:
# Score the dev loader and print the per-class report with 4 decimals.
target_names = ["negative", "positive"]
results = evaluate(batch_dev)
y_true, y_pred = get_prediction(results)
classification = classification_report(
    y_true,
    y_pred,
    target_names=target_names,
    digits=4,
)
print(classification)

wrong: 79
              precision    recall  f1-score   support

    negative     0.9051    0.9136    0.9093       428
    positive     0.9159    0.9077    0.9118       444

    accuracy                         0.9106       872
   macro avg     0.9105    0.9106    0.9105       872
weighted avg     0.9106    0.9106    0.9106       872



In [26]:
# Score the held-out test loader and print the per-class report.
target_names = ["negative", "positive"]
results = evaluate(batch_test)
y_true, y_pred = get_prediction(results)
classification = classification_report(
    y_true,
    y_pred,
    target_names=target_names,
    digits=4,
)
print(classification)

wrong: 152
              precision    recall  f1-score   support

    negative     0.9242    0.9090    0.9165       912
    positive     0.9102    0.9252    0.9176       909

    accuracy                         0.9171      1821
   macro avg     0.9172    0.9171    0.9171      1821
weighted avg     0.9172    0.9171    0.9171      1821



In [32]:
# Persist the fine-tuned model and its tokenizer; the target directory is
# printed below.
save()

Saving model to ./outputs_stanford_treebank_lr=2e-05_epochs=2


In [37]:
# Display the training split to inspect its columns.
train_data

Unnamed: 0,id,original_text,text,classification,rationale
0,0,The Rock is destined to be the 21st Century 's...,the rock is destined to be the 21st century 's...,pos,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,The gorgeously elaborate continuation of `` Th...,the gorgeously elaborate continuation of `` th...,pos,"[0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,Singer/composer Bryan Adams contributes a slew...,singer/composer bryan adams contributes a slew...,pos,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4,Yet the act is still charming here .,yet the act is still charming here .,pos,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
4,5,Whether or not you 're enlightened by any of D...,whether or not you 're enlightened by any of d...,pos,"[0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
6915,8539,A real snooze .,a real snooze .,neg,"[0.0, 0.5, 1.0, 0.0]"
6916,8540,No surprises .,no surprises .,neg,"[0.5, 0.5, 0.0]"
6917,8541,We 've seen the hippie-turned-yuppie plot befo...,we 've seen the hippie-turned-yuppie plot befo...,pos,"[0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, ..."
6918,8542,Her fans walked out muttering words like `` ho...,her fans walked out muttering words like `` ho...,neg,"[0.0, 0.5, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


In [38]:
# Display the dev split to inspect its columns.
dev

Unnamed: 0,id,original_text,text,classification,rationale
0,0,It 's a lovely film with lovely performances b...,it 's a lovely film with lovely performances b...,pos,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
1,2,And if you 're not nearly moved to tears by a ...,and if you 're not nearly moved to tears by a ...,pos,"[0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 1.0, 1.0, 1.0, ..."
2,3,"A warm , funny , engaging film .","a warm , funny , engaging film .",pos,"[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0]"
3,4,Uses sharp humor and insight into human nature...,uses sharp humor and insight into human nature...,pos,"[0.0, 0.5, 1.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, ..."
4,6,"Entertains by providing good , lively company .","entertains by providing good , lively company .",pos,"[0.5, 0.0, 0.0, 0.5, 0.0, 0.5, 0.0, 0.0]"
...,...,...,...,...,...
867,1095,... Designed to provide a mix of smiles and te...,... designed to provide a mix of smiles and te...,neg,"[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
868,1096,it seems to me the film is about the art of ri...,it seems to me the film is about the art of ri...,neg,"[0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
869,1097,It 's just disappointingly superficial -- a mo...,it 's just disappointingly superficial -- a mo...,neg,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
870,1098,The title not only describes its main characte...,the title not only describes its main characte...,neg,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [39]:
# Display the test split to inspect its columns.
test

Unnamed: 0,id,original_text,text,classification,rationale
0,1,If you sometimes like to go to the movies to h...,if you sometimes like to go to the movies to h...,pos,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, ..."
1,2,"Emerges as something rare , an issue movie tha...","emerges as something rare , an issue movie tha...",pos,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4,Offers that rare combination of entertainment ...,offers that rare combination of entertainment ...,pos,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0]"
3,5,Perhaps no picture ever made has more literall...,perhaps no picture ever made has more literall...,pos,"[0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, ..."
4,6,Steers turns in a snappy screenplay that curls...,steers turns in a snappy screenplay that curls...,pos,"[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...
1816,2205,An imaginative comedy/thriller .,an imaginative comedy/thriller .,pos,"[0.0, 1.0, 0.5, 0.5]"
1817,2206,"( A ) rare , beautiful film .","( a ) rare , beautiful film .",pos,"[0.5, 0.0, 0.5, 0.5, 0.0, 1.0, 0.0, 0.0]"
1818,2207,( An ) hilarious romantic comedy .,( an ) hilarious romantic comedy .,pos,"[0.5, 0.0, 0.5, 1.0, 0.0, 0.5, 0.0]"
1819,2208,Never ( sinks ) into exploitation .,never ( sinks ) into exploitation .,pos,"[0.5, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0]"
