In [2]:
! pip install finbert-embedding==0.1.4



In [3]:
import pandas as pd
import numpy as np
import random
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

# Play around with Finbert library
https://pypi.org/project/finbert-embedding/

In [5]:
PATH = Path("/content/")

In [6]:
from finbert_embedding.embedding import FinbertEmbedding

In [7]:
finbert = FinbertEmbedding()

In [8]:
df = pd.read_csv("classification.csv",header=None, names=['sentence', 'label'])

In [9]:
df.head()

Unnamed: 0,sentence,label
0,"For example, we may record as regulatory asset...",c
1,"In connection with this transaction, all of ou...",c
2,The warrants were valued using the Black-Schol...,c
3,"In July of 2006, we entered into an operating ...",c
4,Certain regulatory assets do not result from c...,c


In [10]:
d = {v: i for i,v in enumerate(df.label.unique())}

In [11]:
d

{'c': 0, 'u': 1}

In [12]:
sentences = df.sentence.values
labels = df.label.values
labels_idx = [d[label] for label in labels]

In [13]:
word_embeddings_list = []
sentence_embeddings_list = []
for sentence in sentences:
    word_embeddings = finbert.word_vector(text = sentence)
    word_embeddings = np.vstack([emb.numpy() for emb in word_embeddings])
    word_embeddings_list.append(word_embeddings)
    sentence_embedding = finbert.sentence_vector(text = sentence)
    sentence_embedding = sentence_embedding.numpy()
    sentence_embeddings_list.append(sentence_embedding)

In [15]:
df[df.label == 'c'].count

<bound method DataFrame.count of                                               sentence label
0    For example, we may record as regulatory asset...     c
1    In connection with this transaction, all of ou...     c
2    The warrants were valued using the Black-Schol...     c
3    In July of 2006, we entered into an operating ...     c
4    Certain regulatory assets do not result from c...     c
..                                                 ...   ...
995  The 2000 volumes reflect the impact of additio...     c
996  The exercise price per share for each such opt...     c
997  As of June 30, 2006, the outstanding mortgage ...     c
998  The General Partner and the Commodity Broker p...     c
999  Clients pay for inclusion of their interviews ...     c

[870 rows x 2 columns]>

In [24]:
df[df.label == 'u'].count

<bound method DataFrame.count of                                               sentence label
14   As a result of its insurance policy coverage f...     u
16   A given automobile auction will employ both fu...     u
28   The FCC had tentatively concluded that it will...     u
34   The Partnership is currently evaluating the ca...     u
38   In connection with this sale, we were required...     u
..                                                 ...   ...
957  We believe the following accounting estimates ...     u
962  Retained control by our principal stockholders...     u
984  It is possible that Pinnacle may choose to ele...     u
988  Purported classes include individuals claiming...     u
992  Because of the imposition of the foregoing add...     u

[130 rows x 2 columns]>

In [30]:
df["sent_emb"] = word_embeddings_list
df["sent_avg_emb"] = sentence_embeddings_list
df["label"] = labels_idx

In [31]:
df

Unnamed: 0,sentence,label,sent_emb,sent_avg_emb
0,"For example, we may record as regulatory asset...",0,"[[-3.0068293, 2.1747842, 0.08535655, -1.333302...","[-0.07628794, 0.07385665, 0.22439034, 0.113049..."
1,"In connection with this transaction, all of ou...",0,"[[-7.2349834, 0.7282898, 0.44459042, -3.69149,...","[-0.2181889, 0.25167054, 0.21129918, 0.0914831..."
2,The warrants were valued using the Black-Schol...,0,"[[-4.7428164, -2.0988667, 0.5116024, -0.578443...","[-0.28901023, 0.14081945, 0.17879683, 0.128653..."
3,"In July of 2006, we entered into an operating ...",0,"[[-9.793489, -0.65220696, 0.13868378, 1.696403...","[-0.26589411, 0.29738006, 0.2962963, -0.096300..."
4,Certain regulatory assets do not result from c...,0,"[[-3.8283665, 4.249455, -0.1129528, -0.4282031...","[-0.24734896, 0.32598937, 0.28862736, 0.164214..."
...,...,...,...,...
995,The 2000 volumes reflect the impact of additio...,0,"[[-7.4123354, 0.45839158, -0.9591343, -0.77836...","[-0.65662414, 0.1849709, 0.38781762, -0.004934..."
996,The exercise price per share for each such opt...,0,"[[-1.9446385, -1.0628954, -0.9985801, -1.54941...","[-0.33660322, 0.20990066, 0.1513533, 0.1885641..."
997,"As of June 30, 2006, the outstanding mortgage ...",0,"[[-10.131733, -0.16516311, 1.8880061, -1.89035...","[-0.6541173, 0.2995038, 0.25971296, -0.0529209..."
998,The General Partner and the Commodity Broker p...,0,"[[-5.572422, 2.187811, -0.925772, -1.2603581, ...","[-0.2163732, 0.26890123, 0.09922831, -0.040265..."


In [32]:
max = 0
min = float("inf")
for i in word_embeddings_list:
    if i.shape[0] > max:
        max = i.shape[0]
    if i.shape[0] < min:
        min = i.shape[0]
print(max)
print(min)

172
6


In [33]:
# Add padding to sent_emb
padded_embeddings_list = []
for emb in word_embeddings_list:
    if emb.shape[0] != max:
        tmp = np.vstack((emb,np.zeros((max-emb.shape[0],768))))
        padded_embeddings_list.append(tmp)
    else:
        padded_embeddings_list.append(emb)

In [34]:
# Check
padded_embeddings_list[7].shape

(172, 768)

In [35]:
df["padded_sent"] = padded_embeddings_list

In [36]:
df

Unnamed: 0,sentence,label,sent_emb,sent_avg_emb,padded_sent
0,"For example, we may record as regulatory asset...",0,"[[-3.0068293, 2.1747842, 0.08535655, -1.333302...","[-0.07628794, 0.07385665, 0.22439034, 0.113049...","[[-3.006829261779785, 2.1747841835021973, 0.08..."
1,"In connection with this transaction, all of ou...",0,"[[-7.2349834, 0.7282898, 0.44459042, -3.69149,...","[-0.2181889, 0.25167054, 0.21129918, 0.0914831...","[[-7.234983444213867, 0.728289783000946, 0.444..."
2,The warrants were valued using the Black-Schol...,0,"[[-4.7428164, -2.0988667, 0.5116024, -0.578443...","[-0.28901023, 0.14081945, 0.17879683, 0.128653...","[[-4.74281644821167, -2.0988667011260986, 0.51..."
3,"In July of 2006, we entered into an operating ...",0,"[[-9.793489, -0.65220696, 0.13868378, 1.696403...","[-0.26589411, 0.29738006, 0.2962963, -0.096300...","[[-9.793489456176758, -0.6522069573402405, 0.1..."
4,Certain regulatory assets do not result from c...,0,"[[-3.8283665, 4.249455, -0.1129528, -0.4282031...","[-0.24734896, 0.32598937, 0.28862736, 0.164214...","[[-3.82836651802063, 4.249454975128174, -0.112..."
...,...,...,...,...,...
995,The 2000 volumes reflect the impact of additio...,0,"[[-7.4123354, 0.45839158, -0.9591343, -0.77836...","[-0.65662414, 0.1849709, 0.38781762, -0.004934...","[[-7.412335395812988, 0.45839157700538635, -0...."
996,The exercise price per share for each such opt...,0,"[[-1.9446385, -1.0628954, -0.9985801, -1.54941...","[-0.33660322, 0.20990066, 0.1513533, 0.1885641...","[[-1.9446384906768799, -1.06289541721344, -0.9..."
997,"As of June 30, 2006, the outstanding mortgage ...",0,"[[-10.131733, -0.16516311, 1.8880061, -1.89035...","[-0.6541173, 0.2995038, 0.25971296, -0.0529209...","[[-10.131732940673828, -0.16516311466693878, 1..."
998,The General Partner and the Commodity Broker p...,0,"[[-5.572422, 2.187811, -0.925772, -1.2603581, ...","[-0.2163732, 0.26890123, 0.09922831, -0.040265...","[[-5.572422027587891, 2.1878108978271484, -0.9..."


## Split data in a balanced way

In [40]:
train = df[df.label==0].iloc[:695,:]

In [41]:
valid = df[df.label==0].iloc[695:783,:]

In [42]:
test = df[df.label==0].iloc[783:,:]

In [43]:
print(train.shape,valid.shape,test.shape)

(695, 5) (88, 5) (87, 5)


In [44]:
train1 = df[df.label==1].iloc[:104,:]

In [45]:
valid1 = df[df.label==1].iloc[104:117,:]

In [46]:
test1 = df[df.label==1].iloc[117:,:]

In [47]:
print(train1.shape,valid1.shape,test1.shape)

(104, 5) (13, 5) (13, 5)


In [49]:
train_final = pd.concat([train, train1], ignore_index=True)
valid_final = pd.concat([valid, valid1], ignore_index=True)
test_final = pd.concat([test, test1], ignore_index=True)

In [50]:
print(train_final.shape,valid_final.shape,test_final.shape)

(799, 5) (101, 5) (100, 5)


In [51]:
train_final.head(3)

Unnamed: 0,sentence,label,sent_emb,sent_avg_emb,padded_sent
0,"For example, we may record as regulatory asset...",0,"[[-3.0068293, 2.1747842, 0.08535655, -1.333302...","[-0.07628794, 0.07385665, 0.22439034, 0.113049...","[[-3.006829261779785, 2.1747841835021973, 0.08..."
1,"In connection with this transaction, all of ou...",0,"[[-7.2349834, 0.7282898, 0.44459042, -3.69149,...","[-0.2181889, 0.25167054, 0.21129918, 0.0914831...","[[-7.234983444213867, 0.728289783000946, 0.444..."
2,The warrants were valued using the Black-Schol...,0,"[[-4.7428164, -2.0988667, 0.5116024, -0.578443...","[-0.28901023, 0.14081945, 0.17879683, 0.128653...","[[-4.74281644821167, -2.0988667011260986, 0.51..."


# Dataset

In [52]:
from torch.utils.data import TensorDataset, DataLoader

In [53]:
class EmbeddingDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return df.shape[0]

    def __getitem__(self, idx):
        # cannot return in the form of numpy array
        x1 = self.df["padded_sent"][idx]
        x2 = self.df["sent_avg_emb"][idx]
        y = self.df["label"][idx]
        return x1,x2,y

In [55]:
my_x = train_final["padded_sent"]
my_y = train_final["label"]

tensor_x = torch.Tensor(my_x) 
tensor_y = torch.Tensor(my_y)

train_ds = TensorDataset(tensor_x,tensor_y) 
train_dl = DataLoader(train_ds) 

In [56]:
my_x = valid_final["padded_sent"]
my_y = valid_final["label"]

tensor_x = torch.Tensor(my_x) 
tensor_y = torch.Tensor(my_y)

valid_ds = TensorDataset(tensor_x,tensor_y) 
valid_dl = DataLoader(valid_ds) 

In [57]:
my_x = test_final["padded_sent"]
my_y = test_final["label"]

tensor_x = torch.Tensor(my_x) 
tensor_y = torch.Tensor(my_y)

test_ds = TensorDataset(tensor_x,tensor_y) 
test_dl = DataLoader(test_ds) 

In [None]:
# train_ds = EmbeddingDataset(train_df)
# valid_ds = EmbeddingDataset(valid_df)
# test_ds = EmbeddingDataset(test_df)

# Model and Training

In [58]:
def update_optimizer(optimizer, lr):
    for i, param_group in enumerate(optimizer.param_groups):
        param_group["lr"] = lr

In [59]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [60]:
class GRUModel(torch.nn.Module) :
    def __init__(self, embedding_dim, hidden_dim):
        super(GRUModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        # self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        out_pack, ht = self.gru(x)
        return self.linear(ht[-1])

In [61]:
def train_epocs(model, optimizer, train_dl, valid_dl, epochs=10):
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        best_val_acc = 0
        for x, y in train_dl:
            x = x.long() #.cuda()
            y = y.float() #.cuda()
            y_pred = model(x.float())
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, valid_dl)
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            path = "{0}/models/model_acc_{1:.0f}.pth".format(PATH,100*val_acc) 
            save_model(model, path)
            print(path)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [62]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y in valid_dl:
        x = x.long() #.cuda()
        y = y.float().unsqueeze(1) #.cuda()
        y_hat = model(x.float())
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [63]:
batch_size = 10
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [64]:
model = GRUModel(embedding_dim=768, hidden_dim=15) #.cuda()

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [65]:
train_epocs(model, optimizer, train_dl, valid_dl, epochs=10)

/content/models/model_acc_87.pth
/content/models/model_acc_87.pth
train loss 0.388 val loss 0.385 and val accuracy 0.871
/content/models/model_acc_87.pth
/content/models/model_acc_87.pth
/content/models/model_acc_87.pth
/content/models/model_acc_87.pth
/content/models/model_acc_87.pth
train loss 0.390 val loss 0.384 and val accuracy 0.871
/content/models/model_acc_87.pth
/content/models/model_acc_87.pth
/content/models/model_acc_87.pth


In [None]:
train_epocs(model, optimizer, train_dl, valid_dl, epochs=30)

/content/models/model_acc_86.pth
/content/models/model_acc_92.pth
train loss 0.271 val loss 0.295 and val accuracy 0.919
/content/models/model_acc_91.pth
/content/models/model_acc_89.pth
/content/models/model_acc_87.pth
/content/models/model_acc_90.pth
/content/models/model_acc_90.pth
train loss 0.232 val loss 0.215 and val accuracy 0.899
/content/models/model_acc_91.pth
/content/models/model_acc_91.pth
/content/models/model_acc_91.pth
/content/models/model_acc_90.pth
/content/models/model_acc_91.pth
train loss 0.123 val loss 0.312 and val accuracy 0.909
/content/models/model_acc_91.pth
/content/models/model_acc_89.pth
/content/models/model_acc_90.pth
/content/models/model_acc_90.pth
/content/models/model_acc_90.pth
train loss 0.135 val loss 0.302 and val accuracy 0.899
/content/models/model_acc_91.pth
/content/models/model_acc_89.pth
/content/models/model_acc_88.pth
/content/models/model_acc_91.pth
/content/models/model_acc_92.pth
train loss 0.147 val loss 0.252 and val accuracy 0.919

In [66]:
load_model(model, "/content/models/model_acc_93.pth")

In [67]:
val_metrics(model, test_dl)

(0.11411478444933891, tensor(0.9700))

In [68]:
prediction = []

In [92]:
def test_metrics(model, test_dl):
    prediction = []
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y in test_dl:
        x = x.long() #.cuda()
        y = y.float().unsqueeze(1) #.cuda()
        y_hat = model(x.float())
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        prediction.append((y_pred.float() == y).float().squeeze(1).numpy())
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return prediction,correct/total

In [95]:
result,measure = test_metrics(model, test_dl)

In [96]:
measure

tensor(0.9700)

In [84]:
result_np = np.hstack(result)

In [89]:
where_0 = np.where(result_np == 0)
where_1 = np.where(result_np == 1)

result_np[where_0] = 1
result_np[where_1] = 0

In [86]:
from sklearn.metrics import classification_report

In [90]:
print(classification_report(my_y, result_np))

              precision    recall  f1-score   support

           0       0.89      0.99      0.93        87
           1       0.67      0.15      0.25        13

    accuracy                           0.88       100
   macro avg       0.78      0.57      0.59       100
weighted avg       0.86      0.88      0.85       100

