In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from nltk.tokenize import TweetTokenizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
import random
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import re
import math

class MyDataset(Dataset):
    def __init__(self,dataset): 
        self.data = dataset
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        x = self.data[index][0]
        y = self.data[index][1]
        return x, y
    
def preprocess(token):
    if token.find('://') != -1:
        token = '@@URL'
    elif re.match('^\d+([,.]\d+)*$', token):
        token = '@@DIGIT'
    elif re.match('^\d{1,2}:\d{2}$', token):
        token = '@@TIME'
    elif re.match('^\d+/\d+(/\d+)?$', token):
        token = '@@DATE'
    else:
        token = str.lower(token)
    return token

# train = pd.read_csv('train.csv')
# train.fillna('<unknown>', inplace=True)
# train_df, val_df = train_test_split(train, test_size=0.1, train_size=None, shuffle=True)
train_df = pd.read_csv('train_df.csv')
val_df = pd.read_csv('val_df.csv')


pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 100)
tokenizer = TweetTokenizer()

device = 'cuda:6'

In [2]:
min_kwcount = 10
min_loccount = 10

le_kw = preprocessing.LabelEncoder().fit(train_df.keyword.tolist())
le_loc = preprocessing.LabelEncoder().fit(train_df.location.tolist())
loc = train_df.groupby('location').count().rename({'id':'cnt'}, axis=1)[['cnt']]
kw = train_df.groupby('keyword').count().rename({'id':'cnt'}, axis=1)[['cnt']]
loc = loc[loc['cnt']>min_loccount].index.tolist()
kw = kw[kw['cnt']>min_kwcount].index.tolist()

le_loc = preprocessing.LabelEncoder().fit(loc)
le_kw = preprocessing.LabelEncoder().fit(kw)

In [3]:
from transformers import AutoModel, AutoTokenizer
from TweetNormalizer import normalizeTweet

def create_dataset(df, MAX_LENGTH=53, le_kw=le_kw, le_loc=le_loc):
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")
    data = []
    txt = []
    loc = []
    kw = []
    tg = []
    
    for i in df.itertuples():
        text = i[4]
        l = i[3]
        k = i[2]

        line = normalizeTweet(text)
        input_ids = tokenizer.encode(line)
        if len(input_ids) > MAX_LENGTH:
            continue
        elif len(input_ids) < MAX_LENGTH:
            input_ids += [1] * (MAX_LENGTH-len(input_ids))

        txt.append(input_ids)
        
        if l in le_loc.classes_:
            loc.append(l)
        else:
            loc.append('<unknown>')

        if k in le_kw.classes_:
            kw.append(k)
        else:
            kw.append('<unknown>')
            
        tg.append(i[5])
            
    kw = le_kw.transform(kw)
    loc = le_loc.transform(loc)
    
    for i in range(len(txt)):
        data.append(((torch.tensor(txt[i]).long().to(device),
                     torch.tensor(kw[i]).long().to(device), torch.tensor(loc[i]).long().to(device)),\
                     torch.tensor(tg[i]).to(device)))
    return data

train_dataset = MyDataset(create_dataset(train_df))
val_dataset = MyDataset(create_dataset(val_df))

In [4]:
BATCH_SIZE = 64

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

def validate(model):
    model.eval()
    running_loss = 0
    right = 0
    total = 0
    with torch.no_grad():
        for it, (batch_x, batch_y) in enumerate(val_loader):
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            running_loss += loss.item()
            top_n, top_i = logits.topk(1)
            right += (top_i[:,0] == batch_y).sum()
            total += batch_y.shape[0]
        print('validation loss:', running_loss/len(val_loader))
        print('validation accuracy:', float(right/total))
        
    return running_loss/len(val_loader)

In [16]:
class Model(nn.Module):
    def __init__(self, hid_feats):
        super().__init__()
        
        self.bertweet = AutoModel.from_pretrained("vinai/bertweet-large")
        self.linear0 = nn.Linear(1024, hid_feats)
        self.embed_kw = nn.Embedding(len(le_kw.classes_), 8)
        self.embed_loc = nn.Embedding(len(le_loc.classes_), 2)

        self.linear1 = nn.Linear(hid_feats+10, hid_feats)
        self.linear2 = nn.Linear(hid_feats, 2)
        
    def forward(self, inputs):
        with torch.no_grad():
            seq = self.bertweet(inputs[0])['last_hidden_state'][:, 0, :]
        seq = self.linear0(seq)
        kw = self.embed_kw(inputs[1])
        loc = self.embed_loc(inputs[2])
        
        output = F.leaky_relu(self.linear1(torch.cat([kw, loc, seq], 1)))
        output = self.linear2(output)
        return output

In [None]:
model = Model(hid_feats=50).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=5e-2)
criterion = nn.CrossEntropyLoss()
te = []
val = []
tr = []

for epoch in range(51):
    print('epoch:', epoch)
    model.train()
    running_loss = 0
    for it, (batch_x, batch_y) in enumerate(train_loader):
        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        running_loss += loss.item()
        opt.zero_grad()
        loss.backward()
        opt.step()
        
    print('training loss:', math.sqrt(running_loss/len(train_loader)))
    tr.append(math.sqrt(running_loss/len(train_loader)))
    
    if True:
        val.append(validate(model))


loss_df = pd.DataFrame({'train':tr, 'val':val})
loss_df.plot()

Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

epoch: 0
training loss: 0.8124543443586176
validation loss: 0.5816100053489208
validation accuracy: 0.7023652791976929
epoch: 1
training loss: 0.750666686980552
validation loss: 0.5385225638747215
validation accuracy: 0.7463862895965576
epoch: 2
training loss: 0.724097254620267
validation loss: 0.5141331938405832
validation accuracy: 0.7509855031967163
epoch: 3
training loss: 0.6980373172152301
validation loss: 0.5202754127482573
validation accuracy: 0.7522996068000793
epoch: 4
training loss: 0.700171607044036
validation loss: 0.5558462378879389
validation accuracy: 0.7306175827980042
epoch: 5
training loss: 0.6886522748504896
validation loss: 0.5960165125628313
validation accuracy: 0.6754270792007446
epoch: 6
training loss: 0.6927375770930443
validation loss: 0.49717673535148305
validation accuracy: 0.7614980340003967
epoch: 7
training loss: 0.6951318191394437
validation loss: 0.4988584332168102
validation accuracy: 0.7674112915992737
epoch: 8
training loss: 0.6896525415241189
validat

In [35]:
pred = []
label = []
with torch.no_grad():
    for it, (batch_x, batch_y) in enumerate(val_loader):
        logits = model(batch_x)
        
        top_n, top_i = logits.topk(1)
        pred += top_i[:,0].cpu().numpy().tolist()
        label += batch_y.cpu().numpy().tolist()
result_df = pd.DataFrame({'pred':pred, 'label':label})

In [36]:
result_df

Unnamed: 0,pred,label
0,1,1
1,1,1
2,1,1
3,1,1
4,0,0
...,...,...
757,0,0
758,0,0
759,1,1
760,0,0


In [42]:
len(le.classes_), len(le_kw.classes_), len(le_loc.classes_)
        

(1356, 219, 30)

In [None]:
# from transformers import AutoModel, AutoTokenizer 

# bertweet = AutoModel.from_pretrained("vinai/bertweet-large")

# tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

# # INPUT TWEET IS ALREADY NORMALIZED!
# line = normalizeTweet(train.text[0])
# input_ids = torch.tensor([tokenizer.encode(line)])

# with torch.no_grad():
#     features = bertweet(input_ids)  # Models outputs are now tuples

# Ablation 1 fixed Bert parameters

In [9]:
model = Model(hid_feats=50).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=5e-2)
criterion = nn.CrossEntropyLoss()
te = []
val = []
tr = []

for epoch in range(21):
    print('epoch:', epoch)
    model.train()
    running_loss = 0
    for it, (batch_x, batch_y) in enumerate(train_loader):
        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        running_loss += loss.item()
        opt.zero_grad()
        loss.backward()
        opt.step()
        
    print('training loss:', math.sqrt(running_loss/len(train_loader)))
    tr.append(math.sqrt(running_loss/len(train_loader)))
    
    if True:
        val.append(validate(model))


loss_df = pd.DataFrame({'train':tr, 'val':val})
loss_df.plot()

NameError: name 'Model' is not defined