In [None]:
#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
import re

In [None]:
import pandas as pd
# Load the tab-separated training set and report how many rows it has.
data = pd.read_csv(
    '/content/task_1_information_extraction_train.tsv',
    sep='\t',
)
print(len(data.index))

In [None]:
# Drop rows whose NOTES value is null, then renumber the index from 0
# so later positional lookups line up with the row labels.
data = data.loc[data['NOTES'].notna()].reset_index(drop=True)

In [None]:
# Event type: map every distinct EVENT_TYPE label to a 0-based integer id.
# Ids follow the order in which labels first appear in the column.
# unique() is evaluated once instead of twice per loop iteration.
zero_numbering_event = {
    cls: i for i, cls in enumerate(data['EVENT_TYPE'].unique())
}

# Integer-encoded target column used for training.
data['EVENT'] = data['EVENT_TYPE'].apply(lambda x: zero_numbering_event[x])

In [None]:
# Sub-event type: map every distinct SUB_EVENT_TYPE label to a 0-based integer id.
# Ids follow the order in which labels first appear in the column.
# unique() is evaluated once instead of twice per loop iteration.
zero_numbering_event_sub = {
    cls: i for i, cls in enumerate(data['SUB_EVENT_TYPE'].unique())
}

# Integer-encoded target column used for training.
data['SUB_EVENT'] = data['SUB_EVENT_TYPE'].apply(lambda x: zero_numbering_event_sub[x])

In [None]:
# Only the tokenizer of this pipeline is used below.
# spaCy 3 removed shortcut names such as 'en' and raises OSError for them, so
# fall back to a blank English pipeline, which still provides the English
# tokenizer (assumed to tokenize like the packaged model -- TODO confirm).
try:
    tok = spacy.load('en')
except OSError:
    tok = spacy.blank('en')

# Compiled once at cell execution instead of on every tokenize() call.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
_PUNCT_DIGIT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # punctuation, digits, CR/TAB/LF

def tokenize (text):
    """Split `text` into a list of lower-cased token strings.

    Runs of non-ASCII characters are replaced by a single space; every
    punctuation character, digit, carriage return, tab and newline is
    replaced by a space; the lower-cased result is then passed to the spaCy
    tokenizer and the text of each token is returned.

    Args:
        text: the raw string to tokenize.

    Returns:
        A list of token strings as produced by `tok.tokenizer`.
    """
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_RE.sub(" ", ascii_only.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [None]:
# Token frequencies over all notes (counts) and the token list of each note
# keyed by its row position (docs).
counts = Counter()
docs = {}
for i, note in enumerate(data['NOTES']):
    # Tokenize once and reuse the result; tokenization dominates this loop.
    tokens = tokenize(note)
    docs[i] = tokens
    counts.update(tokens)

In [None]:
# Build the vocabulary: id 0 is the empty/padding entry, id 1 is "UNK",
# and every token seen in `counts` gets the next free id in iteration order.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {word: index for index, word in enumerate(words)}

In [None]:
def encode_sentence(text, vocab2index, N=70):
    """Turn `text` into a fixed-size vector of vocabulary ids.

    The text is tokenized with `tokenize`; each token is looked up in
    `vocab2index`, falling back to the id stored under "UNK". The ids are
    truncated to the first `N` entries or right-padded with zeros up to `N`.

    Args:
        text: string to encode.
        vocab2index: mapping from token string to integer id.
        N: length of the returned vector.

    Returns:
        A tuple `(encoded, length)` where `encoded` is an integer numpy array
        of size `N` and `length` is the number of real (non-padding) ids
        written into it, i.e. `min(N, number of tokens)`.
    """
    token_ids = np.array(
        [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    )
    length = min(N, len(token_ids))
    encoded = np.zeros(N, dtype=int)
    encoded[:length] = token_ids[:length]
    return encoded, length

In [None]:
# Each cell holds a 2-element object array: [padded id vector, true length].
# dtype=object is required because the two elements have different shapes;
# without it NumPy >= 1.24 raises ValueError for the ragged input.
data['encoded'] = data['NOTES'].apply(lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))

In [None]:
class ReviewsDataset(Dataset):
    """Dataset pairing encoded samples with their labels.

    `X` is indexed as `X[idx][0]` (a numpy array of ids) and `X[idx][1]`
    (passed through unchanged as the third item); `Y` holds one label per
    sample.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # The number of samples is taken from the label container.
        return len(self.y)

    def __getitem__(self, idx):
        """Return `(ids as int32 tensor, label, X[idx][1])` for sample `idx`."""
        sample = self.X[idx]
        token_ids = torch.from_numpy(sample[0].astype(np.int32))
        return token_ids, self.y[idx], sample[1]

In [None]:
X = list(data['encoded'])
y_event = list(data['EVENT'])
y_sub_event = list(data['SUB_EVENT'])
from sklearn.model_selection import train_test_split

# Fixed seed so the 70/30 split -- and every metric reported from it -- is
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Event task: 70% train / 30% validation.
X_train_event, X_valid_event, y_train_event, y_valid_event = train_test_split(
    X, y_event, test_size=0.3, random_state=SPLIT_SEED)

# Sub-event task: same inputs, sub-event labels.
X_train_sub_event, X_valid_sub_event, y_train_sub_event, y_valid_sub_event = train_test_split(
    X, y_sub_event, test_size=0.3, random_state=SPLIT_SEED)

In [None]:
# Wrap the event-task split in Dataset objects.
train_ds_event, valid_ds_event = (
    ReviewsDataset(X_train_event, y_train_event),
    ReviewsDataset(X_valid_event, y_valid_event),
)

# Wrap the sub-event-task split in Dataset objects.
train_ds_sub_event, valid_ds_sub_event = (
    ReviewsDataset(X_train_sub_event, y_train_sub_event),
    ReviewsDataset(X_valid_sub_event, y_valid_sub_event),
)

In [None]:
def train_model(train_dl,val_dl, model, epochs=10, lr=0.001):
    """Train `model` with Adam and cross-entropy loss.

    Each batch from `train_dl` is unpacked as `(x, y, l)`; `x` and `y` are
    cast to long and the model is called as `model(x, l)`. After every epoch
    the model is scored on `val_dl` via `validation_metrics`; the metrics are
    printed on even-numbered epochs only.

    Args:
        train_dl: iterable of training batches `(x, y, l)`.
        val_dl: iterable of validation batches, passed to `validation_metrics`.
        model: module to optimise; only parameters with `requires_grad` are updated.
        epochs: number of passes over `train_dl`.
        lr: Adam learning rate.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        seen = 0
        for x, y, l in train_dl:
            inputs = x.long()
            targets = y.long()
            logits = model(inputs, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(logits, targets)
            loss.backward()
            optimizer.step()
            batch_n = targets.shape[0]
            # Weight the batch-mean loss by batch size to get a per-sample average.
            running_loss += loss.item()*batch_n
            seen += batch_n
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if epoch % 2 == 0:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (running_loss/seen, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    """Score `model` on every batch of `valid_dl`.

    The model is switched to eval mode and run with autograd disabled, so no
    computation graph is built during evaluation.

    Args:
        model: module called as `model(x, l)` and returning per-class scores.
        valid_dl: iterable of batches `(x, y, l)`.

    Returns:
        A tuple `(loss, accuracy, rmse)`:
        loss is the sample-weighted mean cross-entropy (float);
        accuracy is the fraction of correct argmax predictions (0-dim tensor);
        rmse is the sample-weighted mean of the per-batch root-mean-squared
        difference between predicted and true class ids.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.long()
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            correct += (pred == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
            # Per-batch RMSE over class ids, computed on explicit numpy arrays
            # rather than handing torch tensors to an external metric function.
            diff = pred.cpu().numpy() - y.cpu().numpy()
            sum_rmse += np.sqrt(np.mean(diff ** 2))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [None]:
class LSTM_fixed_len(torch.nn.Module) :
    """Embedding -> dropout -> bidirectional LSTM -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table (index 0 is padding).
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction; also the linear input size.
        classes: number of output scores per sample.

    NOTE(review): the classifier reads only `ht[-1]`. Per the PyTorch LSTM
    docs that is the final state of the reverse direction for a single-layer
    bidirectional LSTM, so the forward direction's final state is unused --
    confirm this is intended.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, classes) :
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.linear = nn.Linear(in_features=hidden_dim, out_features=classes)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x, l):
        """Return class scores for the id batch `x`; `l` is accepted but not used."""
        embedded = self.dropout(self.embeddings(x))
        _, (final_hidden, _) = self.lstm(embedded)
        return self.linear(final_hidden[-1])

In [None]:
# Batch size for all loaders and vocabulary size for the embedding table.
batch_size = 5000
vocab_size = len(words)

# Event task: shuffle the training batches, keep validation order fixed.
train_dl_event = DataLoader(dataset=train_ds_event, batch_size=batch_size, shuffle=True)
val_dl_event = DataLoader(dataset=valid_ds_event, batch_size=batch_size)

# Sub-event task: same loader settings.
train_dl_sub_event = DataLoader(dataset=train_ds_sub_event, batch_size=batch_size, shuffle=True)
val_dl_sub_event = DataLoader(dataset=valid_ds_sub_event, batch_size=batch_size)

In [None]:

# Number of distinct encoded classes for each task (event first, then sub-event).
for label_column in ('EVENT', 'SUB_EVENT'):
    print(len(data[label_column].unique()))


In [None]:
# Size each output layer from the label mapping instead of a hard-coded class
# count, so the models stay correct if the training data's label set changes.
n_event_classes = len(zero_numbering_event)
n_sub_event_classes = len(zero_numbering_event_sub)

# Arguments: vocabulary size, embedding dim 50, hidden dim 50, number of classes.
model_event =  LSTM_fixed_len(vocab_size, 50, 50, n_event_classes)
model_sub_event =  LSTM_fixed_len(vocab_size, 50, 50, n_sub_event_classes)

In [None]:
# Train the event classifier for 50 epochs with learning rate 0.01.
train_model(
    train_dl=train_dl_event,
    val_dl=val_dl_event,
    model=model_event,
    epochs=50,
    lr=0.01,
)

In [None]:
# The checkpoint is written under the Google Drive mount point, so make sure
# Drive is mounted before saving; otherwise the target directory does not exist
# on a fresh runtime. Mounting an already-mounted Drive is a no-op.
from google.colab import drive
drive.mount('/content/gdrive')

# Persist only the learned weights (state_dict) of the event model.
path = "/content/gdrive/My Drive/Event.pt"
torch.save(model_event.state_dict(), path)

In [None]:
# Train the sub-event classifier for 50 epochs with learning rate 0.01.
train_model(
    train_dl=train_dl_sub_event,
    val_dl=val_dl_sub_event,
    model=model_sub_event,
    epochs=50,
    lr=0.01,
)

In [None]:
# Persist only the learned weights (state_dict) of the sub-event model.
path = "/content/gdrive/My Drive/Sub_Event.pt"
torch.save(obj=model_sub_event.state_dict(), f=path)

In [None]:
# Mount Google Drive at /content/gdrive, the directory the model checkpoints
# in this notebook are written under.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
batch_size = 5000
vocab_size = len(words)

# Load the weights saved after training. Without this the test models would
# keep their random initialisation and the predictions would be meaningless.
event_state = torch.load("/content/gdrive/My Drive/Event.pt")
sub_event_state = torch.load("/content/gdrive/My Drive/Sub_Event.pt")

# The output-layer size is read from each checkpoint so the rebuilt model
# always matches the shape of the weights being loaded.
model_event_test =  LSTM_fixed_len(vocab_size, 50, 50, event_state['linear.weight'].shape[0])
model_event_test.load_state_dict(event_state)
model_event_test.eval()  # disable dropout for inference

model_sub_event_test =  LSTM_fixed_len(vocab_size, 50, 50, sub_event_state['linear.weight'].shape[0])
model_sub_event_test.load_state_dict(sub_event_state)
model_sub_event_test.eval()  # disable dropout for inference

In [None]:
# Load the held-out TSV and apply the same cleaning as the training data:
# drop rows with a null NOTES value and renumber the index from 0.
testdata = pd.read_csv('/content/task_1_information_extraction_valid.tsv', sep = '\t')
testdata = testdata[testdata['NOTES'].notna()]
testdata = testdata.reset_index(drop=True)

# Encode with the training vocabulary. Each cell holds a 2-element object
# array: [padded id vector, true length]. dtype=object is required because the
# two elements have different shapes; NumPy >= 1.24 raises ValueError otherwise.
testdata['encoded'] = testdata['NOTES'].apply(lambda x: np.array(encode_sentence(x, vocab2index), dtype=object))


In [None]:
# Display the first rows of the test frame, including the new 'encoded' column.
testdata.head()

In [None]:
# Split every encoded cell into its id vector and its true length.
excerpts_test = testdata['encoded']
l_test = [pair[1] for pair in excerpts_test]

# Stack the id vectors into one 2-D array before converting to a tensor;
# building a tensor from a Python list of ndarrays is slow and triggers a
# PyTorch warning. Result: int64 tensor with one row per test note.
X_test = torch.from_numpy(np.stack([pair[0] for pair in excerpts_test])).long()

In [None]:
# Inference only: eval mode turns dropout off so predictions are deterministic,
# and no_grad avoids building an autograd graph over the whole test set.
model_event_test.eval()
model_sub_event_test.eval()
with torch.no_grad():
    y_event_test = model_event_test(X_test, l_test)
    y_sub_event_test = model_sub_event_test(X_test, l_test)


In [None]:
def Final(y_hat):
    """Return, for each element of `y_hat`, the index of its largest value.

    Args:
        y_hat: iterable of tensors (e.g. the rows of a 2-D score tensor).

    Returns:
        A list of Python ints, one argmax index per element of `y_hat`.
    """
    return [torch.argmax(scores).item() for scores in y_hat]

In [None]:
# Convert the raw score tensors of both tasks into predicted class ids.
Final_Events_test, Final_Sub_Events_test = (
    Final(y_event_test),
    Final(y_sub_event_test),
)

In [None]:
# Show how often each class id was predicted, for both tasks.
for heading, predictions in (
    ("Predicted Events test : ", Final_Events_test),
    ("Predicted Sub Events test : ", Final_Sub_Events_test),
):
    print(heading, Counter(predictions))

In [None]:
# Encode the test labels with the integer ids assigned from the training data.
# A label that never occurred in training raises KeyError.
testdata['EVENT'] = testdata['EVENT_TYPE'].apply(zero_numbering_event.__getitem__)
testdata['SUB_EVENT'] = testdata['SUB_EVENT_TYPE'].apply(zero_numbering_event_sub.__getitem__)

In [None]:
# Show the true class-id distribution of the test set, for both tasks.
for heading, label_column in (
    ("Actual Events : ", 'EVENT'),
    ("Actual Sub Events : ", 'SUB_EVENT'),
):
    print(heading, Counter(testdata[label_column]))

In [None]:
from sklearn.metrics import f1_score

# Macro-averaged F1 (unweighted mean of the per-class F1 scores) for each task.
events_f1 = f1_score(y_true=testdata['EVENT'], y_pred=Final_Events_test, average='macro')
print("Events F1 Score :", events_f1)
sub_events_f1 = f1_score(y_true=testdata['SUB_EVENT'], y_pred=Final_Sub_Events_test, average='macro')
print("Sub Events test F1 Score :", sub_events_f1)