In [None]:
# Display the output of every expression in a cell, not only the last one

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

Imports and fixed random seeds

In [None]:
import os
import sys

import torch
import random
import numpy as np
import pandas as pd
import pickle 

import gensim
import gensim.downloader as gloader

# typing
from typing import Dict


# Seed the torch, Python and NumPy random generators with the same fixed value.
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [None]:
# Show where the notebook is running and derive the "data" directory next to it.
print(f"Current work directory: {os.getcwd()}")
data_folder = os.path.join(os.getcwd(), "data")

Download data (dataset and GloVe embeddings)

In [None]:
#source of this code -> https://gist.github.com/hantoine/c4fc70b32c2d163f604a8dc2a050d5f6 

from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile


def download_and_unzip_dataset():
    """Download the dependency treebank archive and extract it into ``data_folder``.

    Nothing is downloaded when ``<data_folder>/dependency_treebank`` already
    exists. Relies on the module-level ``data_folder`` path.
    """
    dataset_folder = os.path.join(data_folder,"dependency_treebank")

    url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

    if not os.path.exists(dataset_folder):
        print('downloading and extracting dataset to :',dataset_folder)
        with urlopen(url) as response:
            archive_bytes = BytesIO(response.read())
        # The context manager closes the archive even if extraction fails.
        with ZipFile(archive_bytes) as archive:
            archive.extractall(path=data_folder)
    else :
        print("the dataset has been already downloaded")

In [None]:
#encode dataset in pandas dataframe 

def encode_dataset(dataset_name: str) -> tuple[pd.DataFrame, set, set]:
    """Parse the treebank documents into a dataframe with one row per sentence.

    Every file of ``<data_folder>/dependency_treebank`` is read as a
    tab-separated table (word, tag, a third column that is discarded) in which
    blank lines separate sentences. The number found at characters 5-7 of the
    file name selects the split: ``<= 100`` train, ``<= 150`` val, else test.

    Args:
        dataset_name: base name (no extension) of the CSV copy written to
            ``data_folder`` for inspection.

    Returns:
        ``(df_final, unique_tags, unique_words)``: the sentence dataframe
        (columns split, doc_id, sentence_num, words, tags, num_tokens) and the
        sets of distinct tags and words that were seen.
    """
    print("Encoding dataset as pandas dataframe...")

    dataset_folder = os.path.join(data_folder,"dependency_treebank")

    dataframe_rows = []     # one dict per sentence, turned into a dataframe at the end
    unique_tags = set()
    unique_words = set()

    # os.listdir gives no ordering guarantee; sorting keeps the row order of the
    # final dataframe the same on every machine and run.
    for doc in sorted(os.listdir(dataset_folder)):
        try:
            doc_num = int(doc[5:8])
        except ValueError:
            # Not a numbered treebank document (e.g. a hidden system file).
            continue
        doc_path = os.path.join(dataset_folder,doc)
        split = 'train' if doc_num<=100 else ('val' if doc_num<=150  else 'test')

        with open(doc_path, mode='r', encoding='utf-8') as file:
            doc_df = pd.read_csv(file,sep='\t',header=None,skip_blank_lines=False)

        doc_df = doc_df.rename(columns={0:'word',1:"TAG",2:"remove"}).drop("remove",axis=1)

        # Blank lines are all-NaN rows: their running count labels each sentence.
        doc_df["group_num"] = doc_df.isnull().all(axis=1).cumsum()
        # NOTE(review): pandas' default NA parsing turns tokens such as "NA" or
        # "null" into NaN, so this dropna would discard them too - confirm the
        # corpus contains no such token.
        doc_df = doc_df.dropna().reset_index(drop=True)

        unique_tags.update(doc_df['TAG'].unique())
        unique_words.update(doc_df['word'].unique())

        # groupby yields the sentences in increasing group number, i.e. file order.
        for n, (_, sentence) in enumerate(doc_df.groupby('group_num')):
            dataframe_rows.append({
                "split" : split,
                "doc_id" : doc_num,
                "sentence_num" : n,
                "words": sentence['word'].tolist(),
                "tags":  sentence['TAG'].tolist(),
                "num_tokens": len(sentence['word'])
            })

    dataframe_path = os.path.join(data_folder, dataset_name)
    df_final = pd.DataFrame(dataframe_rows)
    df_final.to_csv(dataframe_path + ".csv")                      #save as csv to inspect

    print("Encoding completed!")

    return  df_final, unique_tags, unique_words

In [None]:
from collections import OrderedDict

#build the dictionaries that will be used for the embedding matrix and one hot encoding of TAGS
#starting from 1, 0 is reserved to PAD

def build_dict(words : list[str], tags : list[str]): 
    """Create the word/tag <-> integer lookup tables and pickle them.

    Ids are assigned in iteration order starting at 1. The four mappings are
    dumped together, as a list, to ``<data_folder>/dictionaries.pkl``.

    Returns:
        ``(word2int, int2word, tag2int, int2tag)`` as ``OrderedDict`` objects.
    """
    numbered_words = list(enumerate(words, start=1))
    word2int = OrderedDict((word, number) for number, word in numbered_words)
    int2word = OrderedDict(numbered_words)

    numbered_tags = list(enumerate(tags, start=1))
    tag2int = OrderedDict((tag, number) for number, tag in numbered_tags)
    int2tag = OrderedDict(numbered_tags)

    print('saving dictionaries as pickle files')
    target = os.path.join(data_folder,'dictionaries.pkl')
    with open(target, 'wb') as handle:
        pickle.dump([word2int,int2word,tag2int,int2tag], handle)

    return word2int,int2word,tag2int,int2tag

In [None]:
def build_tokenized_dataframe(word2int: Dict,tag2int: Dict, df : pd.DataFrame):
    """Replace the words and tags of every sentence with their integer ids.

    Args:
        word2int: mapping word -> id.
        tag2int: mapping tag -> id.
        df: dataframe with the columns 'words', 'tags', 'split' and 'num_tokens'.

    Returns:
        A new dataframe with the columns split, num_tokens, words_token and
        tags_token, one row per row of ``df`` in the same order, indexed 0..n-1.

    Raises:
        KeyError: if a word or tag is missing from the mappings.
    """
    print('Initiating tokenization of words and tags in dataframe')
    tokenized_rows = [
        {
            'words_token': [word2int[word] for word in words],
            'tags_token': [tag2int[tag] for tag in tags],
        }
        for words, tags in zip(df['words'], df['tags'])
    ]

    # Explicit columns keep the schema stable even when df has no rows.
    tokenized_df = pd.DataFrame(tokenized_rows, columns=['words_token', 'tags_token'])

    # Insert raw values: inserting the Series would align on index labels and
    # produce NaN whenever df does not carry a default 0..n-1 index.
    tokenized_df.insert(0,'split',df['split'].to_numpy())
    tokenized_df.insert(1,'num_tokens',df['num_tokens'].to_numpy())

    print('Tokenization completed')

    return tokenized_df

def check_dataframe_tokenization(tokenized_df, normal_df, int2word, int2tag) :
    """Verify that decoding the token ids gives back the original sentences.

    Rows are compared by position. When every row matches, the tokenized
    dataframe is pickled to ``<data_folder>/token_dataset.pkl``.

    Args:
        tokenized_df: dataframe with the columns 'words_token' and 'tags_token'.
        normal_df: dataframe with the columns 'words' and 'tags'.
        int2word: mapping id -> word.
        int2tag: mapping id -> tag.

    Returns:
        True when all rows round-trip (and the file was written), False otherwise.
    """
    if len(tokenized_df) != len(normal_df):
        print('words tokenization gone wrong')
        return False

    # zip walks both frames positionally, so the check does not depend on the
    # index labels of either dataframe.
    paired_rows = zip(tokenized_df['words_token'], tokenized_df['tags_token'],
                      normal_df['words'], normal_df['tags'])
    for word_tokens, tag_tokens, words, tags in paired_rows:
        if words != [int2word[word_token] for word_token in word_tokens]:
            print('words tokenization gone wrong')
            return False
        if tags != [int2tag[tag_token] for tag_token in tag_tokens]:
            print('tags tokenization gone wrong')
            return False

    print('all right with dataset tokenization')
    print('saving tokenized dataframe')
    path = os.path.join(data_folder, "token_dataset")
    tokenized_df.to_pickle(path+'.pkl')

    # Success used to return None, which is falsy just like the failure value.
    return True


In [None]:
def download_glove_emb(embedding_dimension: int = 300):
    """Load the ``glove-wiki-gigaword-<dimension>`` vectors via gensim's downloader.

    Args:
        embedding_dimension: size of the vectors to fetch; it is substituted
            into the model name, so it must be a dimension that gensim-data
            publishes for this model family (defaults to 300).

    Returns:
        The object returned by ``gensim.downloader.load`` for that model name.
    """
    print('downloading glove embeddings ')
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    emb_model = gloader.load(download_path)

    return emb_model

In [None]:
def check_OOV_terms(embedding_model: "gensim.models.keyedvectors.KeyedVectors", unique_words: list[str], lower: bool):
    """Print how many dataset words have no vector in ``embedding_model``.

    Args:
        embedding_model: object supporting ``embedding_model[word]`` lookups.
        unique_words: the distinct words of the dataset.
        lower: when True the words are lower-cased (and de-duplicated) before
            the lookup.

    Prints the number of words checked, the OOV count with its percentage and
    a sample of up to 15 OOV words.
    """
    words = set([x.lower() for x in unique_words]) if lower else unique_words

    oov_words = []
    for word in words:
        try:
            embedding_model[word]
        except (KeyError, TypeError):
            # Same "missing word" exceptions handled when building the matrix.
            oov_words.append(word)

    total_words = len(words)
    # Guard the percentage against an empty vocabulary.
    oov_percentage = (float(len(oov_words)) / total_words) * 100 if total_words else 0.0

    print("Total number of unique words in dataset:",total_words)
    print("Total OOV terms: {0} ({1:.2f}%)".format(len(oov_words), oov_percentage))
    # random.sample raises when asked for more items than are available.
    print("Some OOV terms:",random.sample(oov_words,min(15, len(oov_words))))


In [None]:
def check_value_distribution_glove(glove: gensim.models.keyedvectors.KeyedVectors):
    """Print the largest and the smallest component over all vectors of ``glove``.

    Vectors are read by integer position, from 0 to ``len(glove) - 1``.
    """
    vector_count = len(glove)

    per_vector_max = [np.max(glove[position]) for position in range(vector_count)]
    max_v = np.max(per_vector_max)

    per_vector_min = [np.min(glove[position]) for position in range(vector_count)]
    min_v = np.min(per_vector_min)

    print('Max value inside glove embeddings:',max_v)
    print('Min value inside glove embeddings:',min_v)

def build_embedding_matrix(emb_model: gensim.models.keyedvectors.KeyedVectors,
                           word2int: Dict[str, int],
                           lower: bool = False) -> np.ndarray:
    """Build the id -> vector matrix for the dataset vocabulary and save it.

    Row ``idx`` holds the vector of the word mapped to ``idx``. Rows with no
    word (such as row 0) stay zero. Words without a pretrained vector get a
    uniform random vector in [-0.05, 0.05). The matrix is written to
    ``<data_folder>/emb_matrix.npy``.

    Args:
        emb_model: pretrained vectors, looked up with ``emb_model[word]``.
        word2int: mapping word -> row index.
        lower: when True, a word missing from ``emb_model`` is retried in
            lower case before falling back to a random vector. Defaults to
            False, which keeps the exact-match-only behaviour.

    Returns:
        A float32 array of shape ``(len(word2int) + 1, vector size)``.
    """
    check_value_distribution_glove(emb_model)

    # Read the dimensionality from the model rather than from a sample vector.
    embedding_dimension = emb_model.vector_size
    embedding_matrix = np.zeros((len(word2int)+1, embedding_dimension), dtype=np.float32)

    for word, idx in word2int.items():
        embedding_vector = None
        # Exact form first, then (optionally) the lower-cased form.
        candidates = [word]
        if lower and isinstance(word, str) and word.lower() != word:
            candidates.append(word.lower())

        for candidate in candidates:
            try:
                embedding_vector = emb_model[candidate]
                break
            except (KeyError, TypeError):
                continue

        if embedding_vector is None:
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector

    print('Saving emb matrix to pickle file')
    path = os.path.join(data_folder, "emb_matrix")
    np.save(path,embedding_matrix,allow_pickle=True)    # np.save appends ".npy"

    print("Embedding matrix shape: {}".format(embedding_matrix.shape))

    return embedding_matrix

In [None]:
#check that the tokenized dataframe and the index of embeddings matrix correspond 

def check_id_corr(int2word : Dict[int,str],glove: "gensim.models.keyedvectors.KeyedVectors", matrix, dataframe ):
    """Check that every token id selects the pretrained vector of its word.

    For each distinct id in ``dataframe['words_token']`` the row
    ``matrix[id]`` must equal ``glove[int2word[id]]``. Words that ``glove``
    does not contain are only counted.

    Args:
        int2word: mapping id -> word.
        glove: pretrained vectors, looked up with ``glove[word]``.
        matrix: embedding matrix indexed by token id.
        dataframe: dataframe whose 'words_token' column holds lists of ids.

    Raises:
        AssertionError: if a row of ``matrix`` differs from the vector of its word.
    """
    # Every occurrence of an id reads the same row, so one check per id suffices.
    distinct_tokens = {
        token
        for token_sentence in dataframe['words_token']
        for token in token_sentence
    }

    oov_words_ = set()

    for token in distinct_tokens:
        word = int2word[token]
        try:
            expected = glove[word]
        except (KeyError, TypeError):
            oov_words_.add(word)
            continue
        # Raised explicitly so the check also runs when asserts are disabled.
        if not np.array_equal(matrix[token], expected):
            raise AssertionError(
                "embedding row {} does not match the vector of {!r}".format(token, word)
            )

    print('Double check OOV number:',len(oov_words_))

In [None]:
# Run the preparation pipeline whenever one of the files it produces is missing.
# Checking only for the folder would skip the pipeline forever after a first
# run that created the folder but failed before writing every file.
if not all(
    os.path.exists(os.path.join(data_folder, artefact))
    for artefact in ("emb_matrix.npy", "token_dataset.pkl", "dictionaries.pkl")
):
    print('This is the first run! Data still not present')

    os.makedirs(data_folder, exist_ok=True)

    download_and_unzip_dataset()
    df, unique_tags, unique_words = encode_dataset("dataset")
    word2int,int2word,tag2int,int2tag = build_dict(unique_words,unique_tags)
    tokenized_df = build_tokenized_dataframe(word2int,tag2int,df)
    check_dataframe_tokenization(tokenized_df,df, int2word, int2tag)
    glove_embeddings = download_glove_emb()
    check_OOV_terms(glove_embeddings, unique_words,False)
    embedding_matrix = build_embedding_matrix(glove_embeddings, word2int)
    check_id_corr(int2word,glove_embeddings,embedding_matrix,tokenized_df)

In [None]:
def load_data():
    """Load the embedding matrix, tokenized dataset and dictionaries from ``data_folder``.

    Returns:
        ``(emb_matrix, token_dataset, word2int, int2word, tag2int, int2tag)``.
        All six values are None when any of the three files is missing.
    """
    emb_matrix_path = os.path.join(data_folder,'emb_matrix.npy')
    token_dataset_path = os.path.join(data_folder,'token_dataset.pkl')
    dictionaries_path = os.path.join(data_folder,'dictionaries.pkl')

    required_paths = (emb_matrix_path, token_dataset_path, dictionaries_path)
    if not all(os.path.exists(path) for path in required_paths):
        print('searched data is not present in folder')
        # Every returned name gets a value; previously the dictionaries were
        # left unbound here and the return statement raised UnboundLocalError.
        return None, None, None, None, None, None

    # NOTE: the loads below unpickle data; only point them at files produced
    # by this notebook.
    print('loading embedding matrix')
    emb_matrix = np.load(emb_matrix_path,allow_pickle=True)
    print('loading tokenized dataset')
    token_dataset = pd.read_pickle(token_dataset_path)
    print('loading dictionaries')
    with open(dictionaries_path, 'rb') as f:
        word2int,int2word,tag2int,int2tag = pickle.load(f)

    print('all data loaded')

    return emb_matrix, token_dataset, word2int, int2word, tag2int, int2tag

# Load the saved artefacts into the module-level names used by the cells below.
emb_matrix, token_dataset, word2int,int2word,tag2int,int2tag = load_data()

# Preview the first rows of the tokenized dataset.
token_dataset.head()


CREATE MODEL

In [None]:
import torch.nn as nn
import torch.nn.functional as F

def create_emb_layer(weights_matrix: np.ndarray, pad_idx : int, non_trainable=True):
    """Create an ``nn.Embedding`` initialised with the given pretrained weights.

    Args:
        weights_matrix: 2-D array of shape (num_embeddings, embedding_dim).
        pad_idx: index of the padding token, passed on as ``padding_idx``.
        non_trainable: when True (default) the weights are frozen.

    Returns:
        ``(emb_layer, num_embeddings, embedding_dim)``.
    """
    # Copy into a float32 tensor so that training the layer can never write
    # back into the caller's array.
    matrix = torch.as_tensor(weights_matrix, dtype=torch.float32).clone()
    num_embeddings, embedding_dim = matrix.shape

    # from_pretrained is a classmethod that RETURNS a new layer. Calling it on
    # an existing instance and discarding the result leaves random weights.
    emb_layer = nn.Embedding.from_pretrained(matrix, freeze=non_trainable, padding_idx=pad_idx)

    return emb_layer, num_embeddings, embedding_dim

class custom_model(nn.Module):
    """Tagger made of an embedding layer, a bidirectional LSTM and a linear layer."""

    def __init__(self, emb_matrix : np.ndarray, hidden_dim: int, tag_output_dim: int, pad_idx: int, num_lstm : int) :
        """Build the layers.

        Args:
            emb_matrix: embedding weights handed to ``create_emb_layer``.
            hidden_dim: hidden size of each LSTM direction.
            tag_output_dim: number of output scores per token.
            pad_idx: index of the padding token.
            num_lstm: number of stacked LSTM layers.
        """
        super().__init__()

        self.embedding, num_embeddings, embedding_dim = create_emb_layer(emb_matrix,pad_idx)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True, num_layers = num_lstm, bidirectional = True)

        # The two LSTM directions are concatenated, hence hidden_dim * 2 inputs.
        self.hidden2tag = nn.Linear(hidden_dim * 2 , tag_output_dim)

    def forward(self, sentence):
        """Return per-token log-probabilities over the tags.

        The output has the shape of ``sentence`` plus a trailing tag dimension.
        """
        embeds = self.embedding(sentence)
        lstm_out, _  = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)
        # Normalise over the LAST axis (the tags). With batch_first input,
        # dim=1 is the sequence axis and would mix scores of different tokens.
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

In [None]:
# Scratch cell: look at two rows of the tokenized dataset, then at the type and
# length of the 'words_token' column, and turn one sentence's ids into a tensor.
token_dataset.iloc[[0,2]]
a = token_dataset['words_token']
type(a)
len(a)
torch.Tensor((a[2]))

In [None]:
from torch.utils.data import Dataset
from torchtext.legacy.data import BucketIterator

class DataframeDataset(Dataset):
    """Dataset view over the 'words_token' and 'tags_token' columns of a dataframe."""

    def __init__(self, dataframe: pd.DataFrame):
        """Keep references to the two token columns of ``dataframe``."""
        self.X = dataframe['words_token']
        self.y = dataframe['tags_token']

    def __len__(self):
        """Number of sentences."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``{'words': ..., 'tags': ...}`` for the sentence at position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Positional access: works whatever the dataframe's index labels are,
        # and raises IndexError past the end (label lookup raises KeyError),
        # which is what lets a plain ``for`` loop over the dataset terminate.
        return {'words': self.X.iloc[idx],'tags': self.y.iloc[idx]}

# One dataframe per split; reset_index() gives each a fresh 0..n-1 index.
train_df, val_df, test_df = (
    token_dataset[token_dataset['split'] == split_name].reset_index()
    for split_name in ('train', 'val', 'test')
)

train_dataset, val_dataset, test_dataset = (
    DataframeDataset(split_frame) for split_frame in (train_df, val_df, test_df)
)

# Group similar length text sequences together in batches.
train_dataloader,val_dataloader,test_dataloader = BucketIterator.splits(
    (train_dataset, val_dataset, test_dataset),
    batch_sizes=(64, 64, 64),
    sort_key=lambda x: len(x['words']),
    repeat=True,
    sort=False,
    shuffle=True,
    sort_within_batch=True,
)


In [None]:

def check_data_loaders():
    """Print a sanity report on the three module-level bucket iterators.

    For the train, val and test iterators it prints the largest difference in
    sentence length found inside a single batch. It then decodes and prints
    the first sentence of the first train batch with ``int2word`` / ``int2tag``.
    """
    named_loaders = (
        ('train', train_dataloader),
        ('val', val_dataloader),
        ('test', test_dataloader),
    )

    for split_name, dataloader in named_loaders:

        dataloader.create_batches() # Create batches - needs to be called before each loop.

        max_diff = -1
        for batch in dataloader.batches:
            # Plain names here: the originals rebound the builtins min and max.
            lengths = [len(example['words']) for example in batch]
            max_diff = max(max_diff, max(lengths) - min(lengths))

        print('in',split_name+'_dataloader the maximum difference in number of tokens between two sentences in the same batch is:',max_diff)

    print('\n')
    #print the first sentence of the first train batch
    train_dataloader.create_batches()
    first_batch = next(iter(train_dataloader.batches), None)
    example = next(iter(first_batch), None) if first_batch is not None else None
    if example is not None:
        print(type(example['words']))
        print('random sentence from train_dataloader:')
        # Tuples are built directly so that 1-token sentences still print as a
        # tuple and empty ones do not raise (itemgetter needs >= 1 key).
        print(tuple(int2word[token] for token in example['words']))
        print(tuple(int2tag[token] for token in example['tags']))


check_data_loaders()



In [None]:
# model = custom_model(emb_matrix_tensor,128,45,0,1)
# train_dataloader.create_batches()
# for bartch_id, batch in enumerate(train_dataloader.batches):
#     batch_text = [example['words'] for example in batch]
#     print(batch_text)
#     # prediction = model(batch)
#     break 


In [None]:
import gensim

