In [None]:
# Make every expression statement in a cell display its value, instead of
# only the last one.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

Imports and fixed random seeds

In [None]:
import os

import torch
import random
import numpy as np
import pandas as pd
import pickle 
import string

import gensim
import gensim.downloader as gloader

import time 

# typing
from typing import Dict


# Seed the three random number generators used in this notebook (PyTorch,
# the standard library and NumPy) with the same fixed value.
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [None]:
print("Current work directory: {}".format(os.getcwd()))

# Every downloaded or generated file is stored in ./data, relative to the
# current working directory.
data_folder = os.path.join(os.getcwd(),"data")

Download the data (dataset and GloVe embeddings)

In [None]:
#source of this code -> https://gist.github.com/hantoine/c4fc70b32c2d163f604a8dc2a050d5f6 

from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile


def download_and_unzip_dataset():
    """Download the NLTK dependency treebank archive and extract it into ``data_folder``.

    The download only happens when ``<data_folder>/dependency_treebank`` does
    not exist yet; otherwise a message is printed and nothing else is done.
    """
    dataset_folder = os.path.join(data_folder,"dependency_treebank")

    url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

    if not os.path.exists(dataset_folder):
        print('downloading and extracting dataset to :',dataset_folder)
        with urlopen(url) as response:
            archive_bytes = BytesIO(response.read())
        # Use a context manager so the archive handle is always closed, and
        # avoid naming the local after the stdlib ``zipfile`` module.
        with ZipFile(archive_bytes) as archive:
            archive.extractall(path=data_folder)
    else :
        print("the dataset has been already downloaded")

In [None]:
#encode dataset in pandas dataframe 

def encode_dataset(dataset_name: str) -> tuple[pd.DataFrame, set, set]:
    """Parse the dependency treebank into a dataframe with one row per sentence.

    Every file in ``<data_folder>/dependency_treebank`` is read as a
    tab-separated table whose first two columns are the word and its tag (the
    third column is discarded). Fully empty rows mark sentence boundaries.
    The resulting dataframe is also written to ``<data_folder>/<dataset_name>.csv``.

    Args:
        dataset_name: base name (without extension) of the CSV copy.

    Returns:
        A tuple ``(df_final, unique_tags, unique_words)``: the per-sentence
        dataframe (columns ``split``, ``doc_id``, ``sentence_num``, ``words``,
        ``tags``, ``num_tokens``) and the sets of all distinct tags and words.
    """
    print("Encoding dataset as pandas dataframe...")

    dataset_folder = os.path.join(data_folder,"dependency_treebank")

    dataframe_rows = []             # one dict per sentence; becomes the final dataframe
    unique_tags = set()
    unique_words = set()

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    for doc in sorted(os.listdir(dataset_folder)):
        # The document number is parsed from characters 5..7 of the file name.
        doc_num = int(doc[5:8])
        doc_path = os.path.join(dataset_folder,doc)
        # Documents up to 100 are training data, 101-150 validation, the rest test.
        split = 'train' if doc_num<=100 else ('val' if doc_num<=150  else 'test')

        with open(doc_path, mode='r', encoding='utf-8') as file:
            # NOTE(review): read_csv's default NA markers (e.g. "NA", "null")
            # would turn a token spelled that way into NaN and the dropna below
            # would then discard it - confirm the corpus has no such tokens.
            df = pd.read_csv(file,sep='\t',header=None,skip_blank_lines=False)

        df = df.rename(columns={0:'word',1:"TAG",2:"remove"}).drop("remove",axis=1)

        # Blank input lines are all-NaN rows: their running count is the
        # sentence id of every following token.
        df["group_num"] = df.isnull().all(axis=1).cumsum()
        df = df.dropna().reset_index(drop=True)

        unique_tags.update(df['TAG'].unique())     # collect all distinct tags
        unique_words.update(df['word'].unique())   # collect all distinct words

        # One output row per sentence of the document.
        for n, (_, sentence) in enumerate(df.groupby('group_num')):
            dataframe_rows.append({
                "split" : split,
                "doc_id" : doc_num,
                "sentence_num" : n,
                "words": sentence['word'].tolist(),
                "tags":  sentence['TAG'].tolist(),
                "num_tokens": len(sentence['word'])
            })

    dataframe_path = os.path.join(data_folder, dataset_name)
    df_final = pd.DataFrame(dataframe_rows)
    df_final.to_csv(dataframe_path + ".csv")                      # saved as csv for manual inspection

    print("Encoding completed!")

    return  df_final, unique_tags, unique_words

In [None]:
from collections import OrderedDict

#build the dictionaries that will be used for the embedding matrix and one hot encoding of TAGS
#starting from 1, 0 is reserved to PAD

def build_dict(words : list[str], tags : list[str]):
    """Build the word<->id and tag<->id lookup tables and pickle them.

    Ids are assigned in iteration order starting from 1. The four
    dictionaries are written, as a list, to ``<data_folder>/dictionaries.pkl``.

    Returns:
        ``(word2int, int2word, tag2int, int2tag)`` as ``OrderedDict`` objects.
    """
    def _number_from_one(items):
        # Forward (item -> id) and backward (id -> item) tables for one vocabulary.
        forward, backward = OrderedDict(), OrderedDict()
        for position, item in enumerate(items, start=1):
            forward[item] = position
            backward[position] = item
        return forward, backward

    word2int, int2word = _number_from_one(words)
    tag2int, int2tag = _number_from_one(tags)

    print('saving dictionaries as pickle files')
    with open(os.path.join(data_folder,'dictionaries.pkl'), 'wb') as f:
        pickle.dump([word2int,int2word,tag2int,int2tag], f)

    return word2int,int2word,tag2int,int2tag

In [None]:
def build_tokenized_dataframe(word2int: Dict,tag2int: Dict, df : pd.DataFrame):
    """Replace every word and tag of ``df`` with its integer id.

    Args:
        word2int: mapping word -> id.
        tag2int: mapping tag -> id.
        df: dataframe with columns ``words`` and ``tags`` (lists of strings)
            plus ``split`` and ``num_tokens``.

    Returns:
        A new dataframe with a default 0..n-1 index and the columns ``split``,
        ``num_tokens``, ``words_token`` and ``tags_token``; row ``i``
        corresponds to the ``i``-th row of ``df``.

    Raises:
        KeyError: if a word or tag is missing from the given mapping.
    """
    print('Initiating tokenization of words and tags in dataframe')
    tokenized_rows = [
        {'words_token': [word2int[word] for word in words],
         'tags_token': [tag2int[tag] for tag in tags]}
        for words, tags in zip(df['words'], df['tags'])
    ]

    # Naming the columns keeps them present even when df has no rows.
    tokenized_df = pd.DataFrame(tokenized_rows, columns=['words_token', 'tags_token'])

    # Copy by position: inserting the Series itself would align on index
    # labels and produce NaN when df does not carry the default 0..n-1 index.
    tokenized_df.insert(0,'split',df['split'].to_numpy())
    tokenized_df.insert(1,'num_tokens',df['num_tokens'].to_numpy())

    print('Tokenization completed')

    return tokenized_df

def check_dataframe_tokenization(tokenized_df, normal_df, int2word, int2tag) :
    """Check that decoding the tokenized dataframe gives back the original, then save it.

    Each row of ``tokenized_df`` is mapped back through ``int2word`` /
    ``int2tag`` and compared with the row at the same position of ``normal_df``.

    Returns:
        ``True`` when every row matches; the tokenized dataframe is then
        pickled to ``<data_folder>/token_dataset.pkl``. ``False`` at the first
        mismatch, in which case nothing is saved.
    """
    original_words = normal_df['words']
    original_tags = normal_df['tags']

    for n, (w_t, t_t) in enumerate(zip(tokenized_df['words_token'],tokenized_df['tags_token'])):
        # Positional access: the comparison must not depend on normal_df's index labels.
        if original_words.iloc[n] != [int2word[word_token] for word_token in w_t]:
            print('words tokenization gone wrong')
            return False
        if original_tags.iloc[n] != [int2tag[tag_token] for tag_token in t_t]:
            print('tags tokenization gone wrong')
            return False

    print('all right with dataset tokenization')
    print('saving tokenized dataframe')
    path = os.path.join(data_folder, "token_dataset")
    tokenized_df.to_pickle(path+'.pkl')

    # Report success explicitly so the result can be used in a condition
    # (the failure paths return False).
    return True


In [None]:
def download_glove_emb(embedding_dimension: int = 300):
    """Load the ``glove-wiki-gigaword-<dim>`` vectors through ``gensim.downloader``.

    Args:
        embedding_dimension: vector size that selects the model name; the
            default 300 is the value that used to be hard-coded.

    Returns:
        The object returned by ``gensim.downloader.load`` for that model.
    """
    print('downloading glove embeddings ')
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    emb_model = gloader.load(download_path)

    return emb_model

In [None]:
def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors, unique_words: list[str], lower: bool):
    """Print how many dataset words have no vector in ``embedding_model``.

    Args:
        embedding_model: object indexable by word; a failed lookup marks the
            word as out-of-vocabulary (OOV).
        unique_words: the distinct words of the dataset.
        lower: when true, words are lower-cased (and de-duplicated) first.
    """
    words = {x.lower() for x in unique_words} if lower else unique_words

    oov_words = []
    for word in words:
        try:
            embedding_model[word]
        # Same lookup failures that build_embedding_matrix treats as OOV; a
        # bare except would also swallow KeyboardInterrupt.
        except (KeyError, TypeError):
            oov_words.append(word)

    # Guard the percentage against an empty vocabulary.
    oov_percentage = (float(len(oov_words)) / len(words))*100 if len(words) else 0.0

    print("Total number of unique words in dataset:",len(words))
    print("Total OOV terms: {0} ({1:.2f}%)".format(len(oov_words), oov_percentage))
    # random.sample raises ValueError when asked for more items than exist.
    print("Some OOV terms:",random.sample(oov_words,min(15, len(oov_words))))


In [None]:
def check_value_distribution_glove(glove: gensim.models.keyedvectors.KeyedVectors):
    """Print the largest and the smallest component found in any vector of ``glove``.

    Vectors are read by integer position, from 0 to ``len(glove) - 1``.
    """
    highest_per_vector = []
    lowest_per_vector = []
    for position in range(len(glove)):
        vector = glove[position]
        highest_per_vector.append(np.max(vector))
        lowest_per_vector.append(np.min(vector))

    max_v = np.max(highest_per_vector)
    min_v = np.min(lowest_per_vector)

    print('Max value inside glove embeddings:',max_v)
    print('Min value inside glove embeddings:',min_v)

def build_embedding_matrix(emb_model: gensim.models.keyedvectors.KeyedVectors,
                           word2int: Dict[str, int]) -> np.ndarray:
    """Build the embedding matrix whose row ``i`` is the vector of the word with id ``i``.

    Words that ``emb_model`` cannot look up get a random vector drawn
    uniformly from [-0.05, 0.05). The matrix has ``len(word2int) + 1`` rows;
    rows that no id points to (row 0, if ids start at 1) stay all-zero.
    The result is also saved to ``<data_folder>/emb_matrix.npy``.

    Returns:
        The float32 matrix of shape ``(len(word2int) + 1, vector size)``.
    """
    check_value_distribution_glove(emb_model)

    # The vector size is taken from the first vector of the model.
    vector_size = len(emb_model[0])
    row_count = len(word2int) + 1
    embedding_matrix = np.zeros((row_count, vector_size), dtype=np.float32)

    for token, row in word2int.items():
        try:
            vector = emb_model[token]
        except (KeyError, TypeError):
            # No pretrained vector could be looked up: use a small random one.
            vector = np.random.uniform(low=-0.05, high=0.05, size=vector_size)
        embedding_matrix[row] = vector

    print('Saving emb matrix to pickle file')
    np.save(os.path.join(data_folder, "emb_matrix"),embedding_matrix,allow_pickle=True)

    print("Embedding matrix shape: {}".format(embedding_matrix.shape))

    return embedding_matrix

In [None]:
#check that the tokenized dataframe and the index of embeddings matrix correspond 

def check_id_corr(int2word : Dict[int,str],glove: gensim.models.keyedvectors.KeyedVectors, matrix, dataframe ):
    """Verify that token ids index the right rows of the embedding matrix.

    For every token id in ``dataframe['words_token']`` the matching row of
    ``matrix`` must equal the vector ``glove`` returns for the decoded word.
    Words ``glove`` cannot look up are only counted and reported.

    Raises:
        AssertionError: if a matrix row differs from the pretrained vector.
    """
    # A set is enough: only the number of distinct OOV words is reported.
    oov_words_ = set()

    for token_sentence in dataframe['words_token']:
        for token in token_sentence:
            word = int2word[token]
            try:
                expected = glove[word]
            # Only lookup failures mean "out of vocabulary"; a bare except
            # would also hide unrelated errors and KeyboardInterrupt.
            except (KeyError, TypeError):
                oov_words_.add(word)
                continue
            assert np.array_equal(matrix[token], expected), \
                "embedding row {} does not match the vector of {!r}".format(token, word)

    print('Double check OOV number:',len(oov_words_))

In [None]:
# Run the whole preprocessing pipeline unless all three files that load_data()
# reads already exist. Testing only for the data folder would skip the
# pipeline forever after a run that failed half-way.
_required_artifacts = [os.path.join(data_folder, file_name)
                       for file_name in ('emb_matrix.npy', 'token_dataset.pkl', 'dictionaries.pkl')]

if not all(os.path.exists(artifact) for artifact in _required_artifacts):
    print('This is the first run! Data still not present')

    os.makedirs(data_folder, exist_ok=True)

    download_and_unzip_dataset()

    df, unique_tags, unique_words = encode_dataset("dataset")

    # Sets of strings iterate in an order that can change between interpreter
    # runs; sorting makes the assigned ids (and the random vectors given to
    # OOV words under the fixed seed) reproducible.
    sorted_words = sorted(unique_words, key=str)
    sorted_tags = sorted(unique_tags, key=str)

    word2int,int2word,tag2int,int2tag = build_dict(sorted_words,sorted_tags)

    tokenized_df = build_tokenized_dataframe(word2int,tag2int,df)

    check_dataframe_tokenization(tokenized_df,df, int2word, int2tag)

    glove_embeddings = download_glove_emb()

    check_OOV_terms(glove_embeddings, sorted_words,False)

    embedding_matrix = build_embedding_matrix(glove_embeddings, word2int)

    check_id_corr(int2word,glove_embeddings,embedding_matrix,tokenized_df)

In [None]:
def load_data():
    """Load the preprocessed artefacts from ``data_folder``.

    Reads ``emb_matrix.npy``, ``token_dataset.pkl`` and ``dictionaries.pkl``.

    Returns:
        ``(emb_matrix, token_dataset, word2int, int2word, tag2int, int2tag)``,
        or six ``None`` values when any of the three files is missing.
    """
    emb_matrix_path = os.path.join(data_folder,'emb_matrix.npy')
    token_dataset_path = os.path.join(data_folder,'token_dataset.pkl')
    dictionaries_path = os.path.join(data_folder,'dictionaries.pkl')

    required_paths = (emb_matrix_path, token_dataset_path, dictionaries_path)
    if not all(os.path.exists(path) for path in required_paths):
        print('searched data is not present in folder')
        # Every returned name needs a value here, otherwise the return
        # statement fails with UnboundLocalError.
        return None, None, None, None, None, None

    # NOTE: pickle-based loading can execute arbitrary code; only load files
    # written by this notebook.
    print('loading embedding matrix')
    emb_matrix = np.load(emb_matrix_path,allow_pickle=True)
    print('loading tokenized dataset')
    token_dataset = pd.read_pickle(token_dataset_path)
    print('loading dictionaries')
    with open(dictionaries_path, 'rb') as f:
        word2int,int2word,tag2int,int2tag = pickle.load(f)

    print('all data loaded')

    return emb_matrix, token_dataset, word2int, int2word, tag2int, int2tag

# Bind the saved artefacts to module-level names; later cells read
# emb_matrix, token_dataset and the dictionaries as globals.
emb_matrix, token_dataset, word2int,int2word,tag2int,int2tag = load_data()

# Preview the first rows of the tokenized dataset.
token_dataset.head()

In [None]:
#pytoch import 

import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn

from torch.utils.data import Dataset
from torchtext.legacy.data import BucketIterator

#scikit-learn
from sklearn.metrics import f1_score

Create the model

In [None]:
def create_emb_layer(weights_matrix: np.ndarray, pad_idx : int):
    """Wrap a pretrained weight matrix in a frozen ``nn.Embedding`` layer.

    Args:
        weights_matrix: 2-D array with one row per token id.
        pad_idx: id passed to the layer as ``padding_idx``.

    Returns:
        ``(emb_layer, embedding_dim)``: the layer and the width of the matrix.
    """
    weights = torch.Tensor(weights_matrix)
    # Unpacking into exactly two names also rejects non 2-D input.
    row_count, embedding_dim = weights.shape
    emb_layer = nn.Embedding.from_pretrained(embeddings=weights, freeze=True, padding_idx=pad_idx)

    return emb_layer, embedding_dim

class custom_model(nn.Module):
    """Bidirectional recurrent tagger on top of a frozen pretrained embedding layer.

    Pipeline: embedding -> bidirectional LSTM or GRU -> optional intermediate
    linear layer -> linear layer producing one score per tag for every token.
    """

    def __init__(self, emb_matrix : np.ndarray, hidden_dim: int, tag_output_dim: int, pad_idx: int, num_lstm : int, double_dense : bool, use_GRU: bool) :
        """
        Args:
            emb_matrix: pretrained embedding weights, one row per token id.
            hidden_dim: hidden size of the recurrent layer (per direction).
            tag_output_dim: number of output scores per token.
            pad_idx: token id used for padding.
            num_lstm: number of stacked recurrent layers.
            double_dense: add a ``2*hidden_dim -> hidden_dim`` linear layer
                before the output layer.
            use_GRU: use a GRU instead of an LSTM.
        """
        super().__init__()

        self.embedding, embedding_dim = create_emb_layer(emb_matrix,pad_idx)

        recurrent_cls = nn.GRU if use_GRU else nn.LSTM
        self.rnn = recurrent_cls(embedding_dim, hidden_dim, batch_first=True, num_layers=num_lstm, bidirectional=True)

        # A bidirectional layer concatenates both directions.
        recurrent_out_dim = hidden_dim * 2

        if double_dense:
            self.middle_dense = nn.Linear(recurrent_out_dim, hidden_dim)
            self.hidden2tag = nn.Linear(hidden_dim, tag_output_dim)
        else:
            self.middle_dense = None
            self.hidden2tag = nn.Linear(recurrent_out_dim, tag_output_dim)

    def forward(self, sentences):
        """Return the tag scores for every position of a batch of token-id sequences."""
        features, _ = self.rnn(self.embedding(sentences))
        if self.middle_dense is not None:
            features = self.middle_dense(features)
        return self.hidden2tag(features)

In [None]:
class DataframeDataset(Dataset):
    """Dataset view over the ``words_token`` / ``tags_token`` columns of a dataframe."""

    def __init__(self, dataframe: pd.DataFrame):
        self.X, self.y = dataframe['words_token'], dataframe['tags_token']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Tensor indices are converted to plain Python values before indexing.
        key = idx.tolist() if torch.is_tensor(idx) else idx
        return {'words': self.X[key],'tags': self.y[key]}

def create_dataloaders(b_s : int):     #b_s = batch_size
    """Create the train / validation / test bucket iterators from ``token_dataset``.

    The global ``token_dataset`` is split on its ``split`` column and every
    part is wrapped in a ``DataframeDataset``. All three iterators use the
    batch size ``b_s`` and the sentence length as sort key.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``.
    """
    datasets = tuple(
        DataframeDataset(token_dataset[token_dataset['split'] == split_name].reset_index())
        for split_name in ('train', 'val', 'test')
    )

    # Group similar length text sequences together in batches.
    train_dataloader,val_dataloader,test_dataloader = BucketIterator.splits(
        datasets,
        batch_sizes=(b_s,) * 3,
        sort_key=lambda example: len(example['words']),
        repeat=True, sort=False, shuffle=True, sort_within_batch=True)

    return train_dataloader,val_dataloader,test_dataloader


def check_data_loaders(train_dataloader,val_dataloader,test_dataloader):
    """Print diagnostics about the three bucket iterators.

    For every iterator, reports the largest difference in sentence length
    found inside a single batch. Then decodes and prints the first sentence
    of the first training batch using the global ``int2word`` / ``int2tag``.
    """
    from operator import itemgetter

    split_names = ('train', 'val', 'test')
    loaders = (train_dataloader,val_dataloader,test_dataloader)

    for s, dataloader in zip(split_names, loaders):

        dataloader.create_batches() # Create batches - needs to be called before each loop.

        max_diff = -1
        for batch in dataloader.batches:
            lengths = [len(example['words']) for example in batch]
            diff = np.max(lengths) - np.min(lengths)
            if diff > max_diff:
                max_diff = diff

        print('in',s+'_dataloader the maximum difference in number of tokens between two sentences in the same batch is:',max_diff)

    print('\n')

    # Print the first sentence of the first train batch.
    train_dataloader.create_batches()
    for batch in train_dataloader.batches:
        first_example = next(iter(batch), None)
        if first_example is not None:
            print(*first_example['words'])
            print('random sentence from train_dataloader:')
            print(itemgetter(*first_example['words'])(int2word))
            print(itemgetter(*first_example['tags'])(int2tag))
        break

#tr_d, va_d, te_d = create_dataloaders(64)     #TODO only for testing

In [None]:
#check_data_loaders(tr_d,va_d,te_d)     #TODO only for testing (keep in a different cell to avoid recreating everytime dataloaders)

In [None]:
#return a torch tensor which contains the indexes of the tags that we don't want to evaluate (punctuation)
def get_to_be_masked_tags():
    """Return the tag ids that are excluded from evaluation.

    The ids of the listed punctuation/symbol tags are looked up in the global
    ``tag2int`` and the pad id 0 is appended at the end.

    Returns:
        A ``torch.LongTensor`` with the ids to mask.
    """
    punctuation_tags = ('$', '``', '.', ',', '#', 'SYM', ':', "''")

    token_punctuations = []
    for tag in punctuation_tags:
        token_punctuations.append(tag2int[tag])

    print('the indexes of punct tags:',token_punctuations) # ids of the punctuation tags
    # Decode the ids again as a visual sanity check.
    print([int2tag[token_int] for token_int in token_punctuations])

    return torch.LongTensor([*token_punctuations, 0]) # 0 is the pad token

# Ids excluded from the metrics; reshape_and_mask reads this module-level tensor.
to_mask = get_to_be_masked_tags()

#return two tensors : the predicted labels and the true labels, both removing unwanted classes 
def reshape_and_mask(predictions: torch.Tensor,targets: torch.LongTensor):
    """Turn scores into predicted labels and drop the positions to be ignored.

    Args:
        predictions: 2-D tensor of scores, one row per position.
        targets: 1-D tensor with the true label of every position.

    Returns:
        ``(predicted_labels, true_labels)`` restricted to the positions whose
        true label is not contained in the global ``to_mask``.
    """
    keep = torch.isin(targets, to_mask, invert=True)
    predicted_labels = predictions.argmax(dim=1)

    return predicted_labels[keep],targets[keep]

def acc_and_f1(y_pred: torch.LongTensor, y_true: torch.LongTensor):
    """Compute accuracy and macro-averaged F1 of predicted against true labels.

    Returns:
        ``(acc, f1)``: the share of equal positions (a tensor) and the value
        returned by ``f1_score(..., average='macro')``.
    """
    matches = y_pred.eq(y_true).sum()
    acc = matches/y_true.shape[0]

    f1 = f1_score(y_true,y_pred,average='macro')

    return acc,f1

In [None]:
def train_loop(model: nn.Module, iterator : BucketIterator, optimizer: optim.Optimizer, criterion, pad_idx : int):
    """Train ``model`` for one epoch over the batches of ``iterator``.

    Args:
        model: network mapping a padded batch of token ids to per-token scores.
        iterator: bucket iterator whose examples are dicts with ``words`` and
            ``tags`` id lists.
        optimizer: optimizer updating the model parameters.
        criterion: loss called as ``criterion(scores, targets)`` on flattened tensors.
        pad_idx: value used to pad both words and tags.

    Returns:
        ``(epoch_loss, epoch_acc, epoch_f1)``: the mean batch loss, and the
        accuracy / macro F1 over all non-masked positions of the epoch.

    Raises:
        ValueError: if the iterator yields no batch.
    """
    model.train()

    # Feed the batches on the device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0
    n_batches = 0
    # Collected per batch and concatenated once at the end, instead of
    # re-allocating a growing tensor at every step.
    pred_chunks, targ_chunks = [], []

    iterator.create_batches()

    for batch in iterator.batches:

        batch_X = [torch.LongTensor(example['words']) for example in batch]
        batch_y = [torch.LongTensor(example['tags']) for example in batch]

        padded_X = rnn.pad_sequence(batch_X, batch_first = True, padding_value = pad_idx).to(device)
        padded_y = rnn.pad_sequence(batch_y, batch_first = True, padding_value = pad_idx).to(device)

        # Resets the gradients of every model parameter; a second
        # optimizer.zero_grad() on the same parameters adds nothing.
        model.zero_grad(set_to_none=True)

        predictions = model(padded_X)
        predictions = predictions.view(-1,predictions.shape[-1])
        targets = padded_y.view(-1)

        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        # The metrics work on detached CPU copies: the mask tensor used by
        # reshape_and_mask is created on the CPU.
        pred, targ = reshape_and_mask(predictions.detach().cpu(),targets.cpu())
        pred_chunks.append(pred)
        targ_chunks.append(targ)

        running_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        raise ValueError("train_loop: the iterator produced no batches")

    epoch_loss = running_loss/n_batches
    epoch_acc, epoch_f1 = acc_and_f1(torch.cat(pred_chunks),torch.cat(targ_chunks))

    return epoch_loss,epoch_acc,epoch_f1


def eval_loop(model: nn.Module, iterator: BucketIterator, criterion, pad_idx):
    """Evaluate ``model`` over the batches of ``iterator`` without updating it.

    Args:
        model: network mapping a padded batch of token ids to per-token scores.
        iterator: bucket iterator whose examples are dicts with ``words`` and
            ``tags`` id lists.
        criterion: loss called as ``criterion(scores, targets)`` on flattened tensors.
        pad_idx: value used to pad both words and tags.

    Returns:
        ``(epoch_loss, epoch_acc, epoch_f1, tot_pred, tot_targ)``: the mean
        batch loss, accuracy and macro F1 over the non-masked positions, and
        the predicted / true labels of those positions.

    Raises:
        ValueError: if the iterator yields no batch.
    """
    model.eval()

    # Feed the batches on the device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0
    n_batches = 0
    # Collected per batch and concatenated once at the end.
    pred_chunks, targ_chunks = [], []

    iterator.create_batches()

    with torch.no_grad():

        for batch in iterator.batches:

            batch_X = [torch.LongTensor(example['words']) for example in batch]
            batch_y = [torch.LongTensor(example['tags']) for example in batch]

            padded_X = rnn.pad_sequence(batch_X, batch_first = True, padding_value = pad_idx).to(device)
            padded_y = rnn.pad_sequence(batch_y, batch_first = True, padding_value = pad_idx).to(device)

            predictions = model(padded_X)

            predictions = predictions.view(-1, predictions.shape[-1])
            targets = padded_y.view(-1)

            loss = criterion(predictions, targets)

            # The mask tensor used by reshape_and_mask is created on the CPU.
            pred, targ = reshape_and_mask(predictions.cpu(),targets.cpu())
            pred_chunks.append(pred)
            targ_chunks.append(targ)

            running_loss += loss.item()
            n_batches += 1

    if n_batches == 0:
        raise ValueError("eval_loop: the iterator produced no batches")

    tot_pred = torch.cat(pred_chunks)
    tot_targ = torch.cat(targ_chunks)

    epoch_loss = running_loss/n_batches
    epoch_acc, epoch_f1 = acc_and_f1(tot_pred,tot_targ)

    return epoch_loss,epoch_acc,epoch_f1,tot_pred,tot_targ


def train_and_eval(n_epochs,model,optimizer,criterion,t_d,v_d,pad_idx,model_path='models/model.pt'):
    """Train for ``n_epochs`` epochs, validating after each one.

    Whenever the validation F1 improves, the model weights are saved to
    ``model_path`` and the validation predictions of that epoch are kept.

    Args:
        n_epochs: number of epochs to run.
        model, optimizer, criterion: forwarded to ``train_loop`` / ``eval_loop``.
        t_d: training iterator.
        v_d: validation iterator.
        pad_idx: padding id forwarded to the loops.
        model_path: where the best weights are saved (default: the previously
            hard-coded ``models/model.pt``).

    Returns:
        ``(best_pred, best_targ)``: validation predictions and targets of the
        best epoch, or ``(None, None)`` if no epoch improved on the initial
        score (for instance when ``n_epochs`` is 0).
    """
    best_val_f1 = -1
    # Defined up front so the return below cannot hit an unbound name.
    best_pred, best_targ = None, None

    for epoch in range(n_epochs):

        start_time = time.time()

        train_epoch_loss, train_epoch_acc, train_epoch_f1 = train_loop(model, t_d, optimizer, criterion, pad_idx)
        val_epoch_loss, val_epoch_acc, val_epoch_f1, tot_pred, tot_targ = eval_loop(model, v_d, criterion, pad_idx)

        tot_epoch_time = time.time() - start_time

        if val_epoch_f1 > best_val_f1:
            best_val_f1 = val_epoch_f1
            best_pred, best_targ = tot_pred, tot_targ
            model_dir = os.path.dirname(model_path)
            if model_dir:
                os.makedirs(model_dir, exist_ok=True)
            torch.save(model.state_dict(), model_path)

        print(f'Epoch: {epoch+1:02} | Epoch Time: {tot_epoch_time:.4f}')
        print(f'\tTrain Loss: {train_epoch_loss:.3f} | Train Acc: {train_epoch_acc*100:.2f}% | Train F1: {train_epoch_f1:.2f}')
        print(f'\t Val. Loss: {val_epoch_loss:.3f} | Val. Acc: {val_epoch_acc*100:.2f}% | Val. F1: {val_epoch_f1:.2f}')

    return best_pred, best_targ


In [None]:
#HYPERPARAMETERS AND OBJECTS 

# `'cuda' if False else 'cpu'` always evaluates to 'cpu', so this selects the
# CPU; the trailing comment keeps the auto-detection condition that is disabled.
device = torch.device('cuda' if False else 'cpu')   #torch.cuda.is_available()

BATCH_SIZE = 64
PAD_IDX = 0              # padding id, used by the embedding layer, the padding of batches and the loss
LSTM_LAYER = 1           # number of stacked recurrent layers (passed as num_lstm)
DOUBLE_DENSE = False     # True adds an intermediate linear layer before the output layer
USE_GRU = False          # True selects a GRU, False an LSTM
LR = 0.5                 # learning rate of the SGD optimizer below
HIDDEN_STATE = 128       # hidden size of the recurrent layer
OUTPUT_DIM = len(tag2int)+1   # one output per tag in tag2int plus one extra for index PAD_IDX
N_EPOCHS = 30


# Positions whose target equals PAD_IDX do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)
criterion = criterion.to(device)

model = custom_model(emb_matrix,HIDDEN_STATE,OUTPUT_DIM,PAD_IDX,LSTM_LAYER,DOUBLE_DENSE,USE_GRU)
model = model.to(device)

optimizer = optim.SGD(model.parameters(),lr = LR)

train_dataloader,val_dataloader,test_dataloader = create_dataloaders(BATCH_SIZE)

In [None]:
# Train and validate; pred / targ receive the two values returned by train_and_eval.
pred, targ = train_and_eval(N_EPOCHS,model,optimizer,criterion,train_dataloader,val_dataloader,PAD_IDX)

Scratch tests for trying things out

In [None]:
#TESTS
# Scratch cell: replays the metric steps (embed -> flatten -> argmax -> mask ->
# accuracy) on a tiny hand-made example to inspect shapes and values.
w_matrix = np.array([[0,0,0,0],[1,5,5,7],[2,4,7,8],[3,5,7,6],[6,3,5,4],[3,4,6,3]])
m = torch.Tensor(w_matrix)
l = nn.Embedding.from_pretrained(m, freeze=True, padding_idx = 0)


a = torch.LongTensor([[2,1,0],[3,2,3],[2,5,0]])    #input
c = torch.LongTensor([[2,3,0],[3,2,3],[1,4,0]])    #target
print('a shape',a.shape)
print('c shape',c.shape)


a1 = l(a)
print('a1 shape',a1.shape)


# Flatten the batch and sequence dimensions, as the training loop does.
a2 = a1.view(-1,a1.shape[-1])
c1 = c.view(-1)
print('a2 shape',a2.shape)
print('c1 shape',c1.shape)      # label fixed: this line prints the shape of c1, not a1


a3= a2.argmax(dim=1)
print('a3 shape',a3.shape)

print('a1',a1)
print('a2',a2)
print('a3',a3)
print('c1',c1)



# Keep only the positions whose target is neither 0 nor 1 (integer ids, so
# the mask values are given as a LongTensor like the targets).
non_mask_elements = torch.isin(c1,torch.LongTensor([0,1]),invert=True)
non_mask_elements
a4= a3[non_mask_elements]
c2= c1[non_mask_elements]
print('a4',a4)
print('c2',c2)

# Accuracy over the kept positions.
correct = a4.eq(c2)
b = correct.sum()/c2.shape[0]
b
b+0.1


In [None]:
# Scratch check: concatenating onto an empty LongTensor works, which is how
# the training and evaluation loops accumulate their predictions.
tot_pred , tot_targ = torch.LongTensor(),torch.LongTensor()
tot_pred = torch.cat((tot_pred,c2))
torch.cat((tot_pred,a4))

In [None]:
# #return accuracy leaving out unwanted classes (pad_idx and punctuation)
# def acc_masked(predictions,targets):

#     max_preds = predictions.argmax(dim=1)
#     non_masked_elements = torch.isin(targets,to_mask,invert=True)
#     correct = max_preds[non_masked_elements].eq(targets[non_masked_elements])
    
#     return correct.sum()/targets[non_masked_elements].shape[0] 

In [None]:
# Scratch check: smallest id, largest id and number of entries of int2tag.
min(int2tag.keys())
max(int2tag.keys())
len(int2tag)


# l = functional.log_softmax(a1,dim=1)
# l.shape
# l

In [None]:
# Scratch check: pad the first training batch and print the resulting shapes.
train_dataloader.create_batches()

for batch_id, batch in enumerate(train_dataloader.batches):
    
    
    # Shortest and longest sentence of the batch (only used by the
    # commented-out print below).
    min_len = np.min([len(torch.Tensor(example['words'])) for example in batch])
    max_len = np.max([len(torch.Tensor(example['words'])) for example in batch])
    
    batch_X = [torch.LongTensor(example['words']) for example in batch]
    batch_y = [torch.LongTensor(example['tags']) for example in batch]

    padded_X = rnn.pad_sequence(batch_X, batch_first = True, padding_value = PAD_IDX)
    padded_y = rnn.pad_sequence(batch_y, batch_first = True, padding_value = PAD_IDX)

    print(padded_X.shape)
    print(padded_y.shape)
    # print(padded_X.is_cuda)
    
    # print(type(batch_X))
    # print(type(padded_X))
    # print(min_len, max_len)
    # print(padded_X.shape)
    # print(padded_y.shape)

    # prediction = model(batch)
    # Only the first batch is inspected.
    break

