In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook,tnrange
from random import shuffle
import time
from torch.utils.data import Dataset,DataLoader,RandomSampler,TensorDataset

In [2]:
path = "/root/sentiment/emotions/input/emotions.csv"

In [3]:
save_path = "/root/sentiment/LM/"

In [4]:
max_len = 40
max_pred = 5
min_freq = 0

In [5]:
data = pd.read_csv(path)

In [6]:
data.text = data.text.apply(lambda x: x.split(' '))

In [7]:
word_freq = {}
for doc in data.text:
    for token in doc:
        try:
            word_freq[token]+=1
        except:
            word_freq[token]=1
            
w2i={"[PAD]":0,"[MASK]":1,"[UNK]":2}
for word,count in word_freq.items():
    if count>=min_freq:
        w2i[word]=len(w2i)
i2w = {i:w for w,i in w2i.items()}

In [8]:
w2i["[PAD]"],w2i["[MASK]"],w2i["[UNK]"]

(0, 1, 2)

In [9]:
data.text = data.text.apply(lambda x: [w2i[i] if i in w2i else w2i["[UNK]"] for i in x])

In [10]:
docs = data.text

In [11]:
def get_lm_data(docs):
    batch=[]
    for doc in docs:
        if len(doc)>5:
            doc = doc[:max_len]
            n_pred = min(max_pred,max(1,int(len(doc)*0.15)))
            cand_mask_pos = [i+1 for i,token in enumerate(doc[1:]) if token!=w2i["[UNK]"]]
            shuffle(cand_mask_pos)
            masked_tokens,masked_pos=[],[]
            for pos in cand_mask_pos[:n_pred]:
                masked_pos.append(pos)
                masked_tokens.append(doc[pos])
                doc[pos]=w2i["[MASK]"]
            n_pad = max_len - len(doc)
            doc.extend([w2i["[PAD]"]]*n_pad)
            n_pad = max_pred-n_pred
            masked_pos.extend([0]*n_pad)
            batch.append([doc,masked_pos,masked_tokens])
    return batch

In [12]:
class LabelConversion:
    def fit_transform(self,x):
        self.fit(x)
        return self.transform(x)
    
    def fit_by_list(self,x):
        self.labels={}
        for l in x:
            if l not in self.labels:
                self.labels[l]=len(self.labels)
        self.inverse_labels = {i:l for l,i in self.labels.items()}
    
    def fit(self,x):
        self.labels={}
        for l in x:
            if l not in self.labels:
                self.labels[l]=len(self.labels)
        self.inverse_labels = {i:l for l,i in self.labels.items()}
    def transform(self,x): 
        output=[]
        for i,l in enumerate(x):
            output.append(self.labels[l])
        return output
                          
    def inverse_transform(self,output):
        result=[]
        for label in output:
            result.append(self.inverse_labels[label])
        return result

In [13]:
lc=LabelConversion()
lc.fit(data.labels)

In [14]:
labels = lc.transform(data.labels)

In [15]:
def get_class_data(docs,labels):
    batch=[]
    for doc,label in zip(docs,labels):
        doc = doc[:max_len]
        n_pad = max_len - len(doc)
        doc.extend([w2i["[PAD]"]]*n_pad)
        batch.append([doc,label])
    return batch

In [16]:
class Encoder(nn.Module):
    def __init__(self,vs,emb_dim,hid_dim):
        super().__init__()
        self.emb = nn.Embedding(vs,emb_dim,padding_idx=0)
        self.enc = nn.LSTM(emb_dim,hid_dim,batch_first=True)
        
    def forward(self,x):
        x = self.emb(x)
        hiddens,(h,c) = self.enc(x)
        return hiddens

In [17]:
class Attention(nn.Module):
    def __init__(self,hid_dim):
        super().__init__()
        self.Wh = nn.Linear(hid_dim,hid_dim,bias=False)
        self.W = nn.Linear(hid_dim,1,bias=False)
        
    def forward(self,hiddens):
        x = self.Wh(hiddens)
        x = self.W(x)
        x = x.softmax(dim=1)
        c = x*hiddens
        return c,x

In [18]:
class Decoder(nn.Module):
    def __init__(self,hid_dim,vs,enc_weight=None):
        super().__init__()
        self.hid_dim = hid_dim
        self.fc = nn.Linear(hid_dim,vs)
        
    def forward(self,c,masked_pos):
        bs,mask_len = masked_pos.size()
        masked_index=(masked_pos!=0).view(-1)
        masked_pos = masked_pos[:,:,None].expand(-1,-1,self.hid_dim)
        output = torch.gather(c,1,masked_pos)
        output = output.view(bs*mask_len,-1)[masked_index]
        logits = self.fc(output)
        return logits

In [19]:
class LanguageModel(nn.Module):
    def __init__(self,encoder,decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self,x,masked_pos):
        c,s = self.encoder(x)
        logits = self.decoder(c,masked_pos)
        return logits

In [20]:
class Classifier(nn.Module):
    def __init__(self,hid_dim,int_dim,num_classes):
        super().__init__()
        self.layer = nn.Linear(hid_dim,int_dim)
        self.clf = nn.Linear(int_dim,num_classes)
        
    def forward(self,c):
        context = c.sum(1)
        hidden = self.layer(context)
        logits = self.clf(hidden)
        return logits

In [21]:
class ClassificationModel(nn.Module):
    def __init__(self,encoder,classifier):
        super().__init__()
        self.encoder = encoder
        self.classifier = classifier
        
    def forward(self,x):
        c,s = self.encoder(x)
        logits = self.classifier(c)
        return logits

In [22]:
emb_dim = 300
vs = len(w2i)
num_classes = len(lc.labels)

In [45]:
lm_encoder = Encoder(vs,emb_dim,emb_dim)
lm_attention = Attention(emb_dim)
decoder = Decoder(emb_dim,vs,lm_encoder.emb.weight)

lm_model = nn.Sequential(lm_encoder,lm_attention)

lang_model = LanguageModel(lm_model,decoder)

In [46]:
device="cpu"
if torch.cuda.is_available():
    device="cuda"

In [47]:
lang_model.to(device)
1

1

In [28]:
token_ids,masked_positions,targets = list(zip(*get_lm_data(docs)))

In [29]:
class LMDataset(Dataset):
    def __init__(self,token_ids,m_pos,y):
        self.x=torch.LongTensor(token_ids)
        self.masked_pos=torch.LongTensor(m_pos)
        self.y=y
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self,i):
        return self.x[i],self.masked_pos[i],self.y[i]

In [30]:
class LMDataLoader(DataLoader):
    def __init__(self,dataset,bs,**kwargs):
        super().__init__(dataset,batch_size=bs,**kwargs)
        
    def __iter__(self):
        for i in range(0,len(self.dataset),self.batch_size):
            yield self.dataset[i:i+self.batch_size]

In [31]:
lm_ds = LMDataset(token_ids,masked_positions,targets)

In [32]:
lm_sampler = RandomSampler(lm_ds)

In [33]:
bs = 128

In [34]:
lm_dl = LMDataLoader(lm_ds,bs=bs,sampler=lm_sampler)

In [48]:
epochs=10

In [49]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(lang_model.parameters(),lr=1e-3)

In [50]:
for epoch in tnrange(epochs,desc="Epochs"):
    e_loss = 0
    itr = 0
    t = tqdm_notebook(lm_dl,leave=False,total=len(lm_dl))
    for xb,masked_pos,yb in t:
        yb = torch.cat([torch.tensor(i) for i in yb])
        if device=='cuda':
            xb = xb.cuda()
            masked_pos= masked_pos.cuda()
            yb = yb.cuda()
        optimizer.zero_grad()
        logits=lang_model(xb,masked_pos)
        loss = loss_fn(logits,yb)
        loss.backward()
        optimizer.step()
        t.set_postfix(loss=loss.item())
        e_loss+=(loss.item()*len(xb))
        itr+=len(xb)
    print("Epoch: {} Loss: {}".format(epoch+1,e_loss/itr))

HBox(children=(IntProgress(value=0, description='Epochs', max=10, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 0 Loss: 6.216586980340428


HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 1 Loss: 5.403260256562978


HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 2 Loss: 5.111074184894331


HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 3 Loss: 4.92146635501794


HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 4 Loss: 4.762309688243205


HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 5 Loss: 4.620094018723997


HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 6 Loss: 4.487770601862929


HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 7 Loss: 4.361287543380702


HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 8 Loss: 4.240156300983761


HBox(children=(IntProgress(value=0, max=3089), HTML(value='')))

Epoch: 9 Loss: 4.124820207000221



In [51]:
torch.save(lang_model,path+"lm_model.pkl")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [52]:
import joblib

In [55]:
joblib.dump(lc,save_path + "label_obj.pkl")
joblib.dump(w2i,save_path + "w2i.pkl")
joblib.dump(i2w,save_path + "w2i.pkl")

['/root/sentiment/LM/w2i.pkl']

## Classification:

In [56]:
token_ids,targets = zip(*get_class_data(docs,labels))

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
xtrain,xtest,ytrain,ytest = train_test_split(token_ids,targets)

In [59]:
class ClassDataset(Dataset):
    def __init__(self,x,y):
        self.x = torch.LongTensor(x)
        self.y = torch.LongTensor(y)
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self,i):
        return self.x[i],self.y[i]

In [60]:
ds = ClassDataset(xtrain,ytrain)

In [61]:
sampler=RandomSampler(ds)

In [78]:
bs=128

In [79]:
dl = DataLoader(ds,batch_size=bs,sampler=sampler)

In [88]:
encoder = Encoder(vs,emb_dim,emb_dim)
attention = Attention(emb_dim)
classifier = Classifier(emb_dim,emb_dim,num_classes)

In [89]:
model = nn.Sequential(encoder,attention)

In [90]:
model.load_state_dict(lm_model.state_dict(),strict=False)

<All keys matched successfully>

In [91]:
clf_model = ClassificationModel(model,classifier)

In [92]:
clf_model.to(device)
1

1

In [93]:
epochs=5

In [94]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(clf_model.parameters(),lr=1e-3)

In [103]:
for epoch in tnrange(epochs,desc="Epochs"):
    e_loss = 0
    itr = 0
    t = tqdm_notebook(dl,leave=False,total=len(dl))
    for xb,yb in t:
        if device=='cuda':
            xb = xb.cuda()
            yb = yb.cuda()
        optimizer.zero_grad()
        logits=clf_model(xb)
        loss = loss_fn(logits,yb)
        loss.backward()
        optimizer.step()
        t.set_postfix(loss=loss.item())
        e_loss+=loss.item()*len(xb)
        itr+=len(xb)
    print("Epoch: {} Loss: {}".format(epoch+1,e_loss/itr))

HBox(children=(IntProgress(value=0, description='Epochs', max=5, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, max=2443), HTML(value='')))

Epoch: 1 Loss: 0.08747265762774693


HBox(children=(IntProgress(value=0, max=2443), HTML(value='')))

Epoch: 2 Loss: 0.08509520569857858


HBox(children=(IntProgress(value=0, max=2443), HTML(value='')))

Epoch: 3 Loss: 0.08421211934886262


HBox(children=(IntProgress(value=0, max=2443), HTML(value='')))

Epoch: 4 Loss: 0.0828182890452193


HBox(children=(IntProgress(value=0, max=2443), HTML(value='')))

Epoch: 5 Loss: 0.08173147901538087



In [104]:
test_ds = ClassDataset(xtest,ytest)
test_dl = DataLoader(test_ds,batch_size=bs)

In [105]:
t = tqdm_notebook(test_dl,leave=False,total=len(test_dl))
preds=[]
actual=[]
with torch.no_grad():
    for xb,yb in t:
        if device=='cuda':
            xb = xb.cuda()
            yb = yb.cuda()
        logits=clf_model(xb)
        preds.append(logits.cpu())
        actual.append(yb.cpu())

HBox(children=(IntProgress(value=0, max=815), HTML(value='')))

In [106]:
preds = torch.cat(preds)
actual = torch.cat(actual)

In [107]:
preds = preds.softmax(-1)

In [108]:
yhat = preds.argmax(1)

In [109]:
for k in range(len(lc.labels)):
    tp,tn,fp,fn = 0,0,0,0
    for i,j in zip(yhat,actual):
        if i==k and j==k:
            tp+=1
        if i!=k and j==k:
            fn+=1
        if i==k and j!=k:
            fp+=1
        if i!=k and j!=k:
            tn+=1
    try:
        p = tp/(tp+fp)
    except:
        p = 0
    try:
        r = tp/(tp+fn)
    except:
        r = 0
    print(lc.inverse_labels[k])
    print("Precision: {}\nRecall: {}\n*****".format(p,r))
    

sadness
Precision: 0.9811513684856334
Recall: 0.9567905338030978
*****
joy
Precision: 0.9742959119402538
Recall: 0.9171607657123496
*****
love
Precision: 0.7679495624353063
Recall: 0.9429231658001156
*****
anger
Precision: 0.9143064633260711
Recall: 0.9616025552006666
*****
fear
Precision: 0.8845809236784231
Recall: 0.906813627254509
*****
surprise
Precision: 0.8027073732718893
Recall: 0.7602291325695582
*****


In [110]:
torch.save(clf_model,path+"clf_model.pkl")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
