In [None]:
# Display the value of every top-level expression in a cell, not only the last one.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

Imports and fixed random seeds

In [None]:
import os
import sys

import torch
import random
import numpy as np
import pandas as pd
import pickle 

# typing
from typing import Dict


# Seed every random number generator used in this notebook (PyTorch, the
# standard library and NumPy) with the same value so runs are repeatable.
SEED = 0

torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

Download the data (dataset and GloVe embeddings)

In [None]:
#source of this code -> https://gist.github.com/hantoine/c4fc70b32c2d163f604a8dc2a050d5f6 

from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

# Everything this notebook downloads or generates is stored under ./data,
# relative to the directory the kernel was started in.
print("Current work directory: {}".format(os.getcwd()))

data_folder = os.path.join(os.getcwd(), "data")
dataset_folder = os.path.join(data_folder, "dependency_treebank")

# exist_ok=True replaces the separate exists() check, so there is no window
# between checking for the folder and creating it.
os.makedirs(data_folder, exist_ok=True)

dataset_url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

def download_and_unzip(url):
    """Download the zip archive at ``url`` and extract it into ``data_folder``.

    Nothing is downloaded when ``dataset_folder`` already exists; only the
    existence of that folder is checked, not its content.

    Args:
        url: Address of the zip archive to fetch.
    """
    if os.path.exists(dataset_folder):
        print("the dataset has been already downloaded")
        return

    print("downloading to",dataset_folder)
    # The whole archive is buffered in memory because ZipFile needs a
    # seekable file object, which the HTTP response is not.
    with urlopen(url) as response:
        archive_bytes = BytesIO(response.read())

    # Context manager so the archive handle is closed after extraction.
    with ZipFile(archive_bytes) as archive:
        archive.extractall(path=data_folder)


# Fetch the treebank archive (skipped when the dataset folder already exists).
download_and_unzip(dataset_url)


In [None]:

#encode dataset in pandas dataframe 

def encode_dataset(dataset_name: str) -> tuple[pd.DataFrame, set, set]:
    """Parse every document in ``dataset_folder`` into one sentence-level dataframe.

    Each file is read as a tab-separated table: column 0 is the word, column 1
    the tag and column 2 is discarded. Blank lines separate sentences. The
    document number taken from the file name decides the split: up to 100 is
    ``train``, 101-150 is ``val``, anything above is ``test``.

    The result is also written to ``<data_folder>/<dataset_name>.csv`` so it
    can be inspected by hand.

    Args:
        dataset_name: File name (without extension) used for the CSV dump.

    Returns:
        A tuple ``(df_final, unique_tags, unique_words)`` where ``df_final``
        has one row per sentence with the columns ``split``, ``doc_id``,
        ``sentence_num``, ``words`` (list of words) and ``tags`` (list of
        tags), and the two sets hold every distinct tag and word seen.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    dataframe_rows = []
    unique_tags = set()
    unique_words = set()

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the final dataframe reproducible across machines and runs.
    for doc in sorted(os.listdir(dataset_folder)):
        # NOTE(review): assumes file names carry the document number at
        # characters 5-7 (e.g. "wsj_0001.dp") - confirm against the archive.
        doc_num = int(doc[5:8])
        doc_path = os.path.join(dataset_folder, doc)

        with open(doc_path, mode='r', encoding='utf-8') as file:
            doc_df = pd.read_csv(
                file,
                sep='\t',
                header=None,
                skip_blank_lines=False,   # blank lines are the sentence separators
                dtype=str,                # never reinterpret tokens as numbers
                quoting=csv.QUOTE_NONE,   # a '"' token is a word, not a CSV quote
                keep_default_na=False,    # keep tokens such as "NA", "null", "None"
                na_values=[''],           # only truly empty fields count as missing
            )

        doc_df = doc_df.rename(columns={0: 'word', 1: 'TAG', 2: 'remove'}).drop('remove', axis=1)

        # A fully empty row marks a sentence boundary: the running count of
        # such rows gives every token the index of the sentence it belongs to.
        doc_df['group_num'] = doc_df.isnull().all(axis=1).cumsum()
        doc_df = doc_df.dropna().reset_index(drop=True)

        unique_tags.update(doc_df['TAG'].unique())
        unique_words.update(doc_df['word'].unique())

        split = 'train' if doc_num <= 100 else ('val' if doc_num <= 150 else 'test')

        # One output row per sentence of the document.
        for n, (_, sentence) in enumerate(doc_df.groupby('group_num')):
            dataframe_rows.append({
                "split": split,
                "doc_id": doc_num,
                "sentence_num": n,
                "words": sentence['word'].tolist(),
                "tags": sentence['TAG'].tolist(),
            })

    dataframe_path = os.path.join(data_folder, dataset_name)
    df_final = pd.DataFrame(dataframe_rows)
    df_final.to_csv(dataframe_path + ".csv")   # saved as CSV only for inspection

    return df_final, unique_tags, unique_words


# Parse the downloaded documents; the call also returns the tag set and the
# vocabulary collected while reading them.
print("Encoding dataset as pandas dataframe...")
df, unique_tags, unique_words = encode_dataset("dataset")
print("Encoding completed!")


In [None]:
from collections import OrderedDict

#build the dictionaries that will be used for the embedding matrix and one hot encoding of TAGS

def build_dict(words : list[str], tags : list[str]):
    """Build the word<->id and tag<->id lookup tables.

    Ids are assigned by enumeration order. Lists (or any ordered iterable)
    keep the order they are given in. Sets are sorted first: their iteration
    order is not defined and, for strings, changes between interpreter
    sessions, which would make the ids disagree with artefacts saved to disk
    by an earlier session.

    Args:
        words: Vocabulary; a list or a set of words.
        tags: Tag inventory; a list or a set of tags.

    Returns:
        ``(word2int, int2word, tag2int, int2tag)`` as ``OrderedDict`` objects.
    """
    def _stable_order(items):
        # key=str keeps sorting well-defined even if a non-string slips in.
        if isinstance(items, (set, frozenset)):
            return sorted(items, key=str)
        return list(items)

    ordered_words = _stable_order(words)
    ordered_tags = _stable_order(tags)

    word2int = OrderedDict((word, i) for i, word in enumerate(ordered_words))
    int2word = OrderedDict(enumerate(ordered_words))

    tag2int = OrderedDict((tag, i) for i, tag in enumerate(ordered_tags))
    int2tag = OrderedDict(enumerate(ordered_tags))

    return word2int,int2word,tag2int,int2tag

# Lookup tables built from the vocabulary and the tag set of the encoded dataset.
word2int,int2word,tag2int,int2tag = build_dict(unique_words,unique_tags)


In [None]:
def build_tokenized_dataframe(word2int: Dict,tag2int: Dict, df : pd.DataFrame):
    """Replace every word and tag of ``df`` with its integer id.

    Args:
        word2int: Mapping from word to id.
        tag2int: Mapping from tag to id.
        df: Dataframe with the columns ``split``, ``words`` (list of words
            per row) and ``tags`` (list of tags per row).

    Returns:
        A new dataframe with one row per row of ``df`` and the columns
        ``split``, ``words_token`` and ``tags_token``.

    Raises:
        KeyError: If a word or tag is missing from the given mapping.
    """
    tokenized_rows = [
        {
            'words_token': [word2int[word] for word in words],
            'tags_token': [tag2int[tag] for tag in tags],
        }
        for words, tags in zip(df['words'], df['tags'])
    ]

    # Naming the columns keeps them present even when ``df`` has no rows.
    tokenized_df = pd.DataFrame(tokenized_rows, columns=['words_token', 'tags_token'])

    # Insert plain values rather than the Series: a Series would be aligned on
    # index labels and produce NaN whenever ``df`` does not have a 0..n-1 index.
    tokenized_df.insert(0, 'split', df['split'].tolist())

    return tokenized_df

# Integer-encoded version of the sentence dataframe.
tokenized_df = build_tokenized_dataframe(word2int,tag2int,df)

def check_tokenization(tokenized_df, normal_df) -> bool:
    """Check that the token ids decode back to the original words and tags.

    The n-th row of ``tokenized_df`` is decoded through the module-level
    ``int2word`` / ``int2tag`` tables and compared with the ``words`` and
    ``tags`` stored under index label ``n`` of ``normal_df``.

    Returns:
        ``True`` when every row round-trips; ``False`` (after printing which
        of the two columns failed) at the first mismatch.
    """
    id_columns = zip(tokenized_df['words_token'], tokenized_df['tags_token'])

    for row_label, (word_ids, tag_ids) in enumerate(id_columns):
        original_words = normal_df.loc[row_label, 'words']
        if not original_words == [int2word[word_id] for word_id in word_ids]:
            print('words tokenization gone wrong')
            return False

        original_tags = normal_df.loc[row_label, 'tags']
        if not original_tags == [int2tag[tag_id] for tag_id in tag_ids]:
            print('tags tokenization gone wrong')
            return False

    return True

# Persist the tokenized dataframe only if decoding its ids reproduces the
# original sentences; nothing is written (and nothing is reported) otherwise.
if check_tokenization(tokenized_df,df):

    print('all right with dataset tokenization')
    path = os.path.join(data_folder, "token_dataset")
    tokenized_df.to_pickle(path+'.pkl')  # ends up as <data_folder>/token_dataset.pkl

In [None]:
import gensim
import gensim.downloader as gloader

def download_glove_emb(embedding_dimension: int = 300):
    """Load the ``glove-wiki-gigaword-<dimension>`` model through gensim's downloader.

    Args:
        embedding_dimension: Size of the GloVe vectors to load; defaults to
            300, the value this notebook uses.
            NOTE(review): only the sizes published by the gensim downloader
            are valid - confirm before passing another value.

    Returns:
        Whatever ``gloader.load`` returns for that model name.
    """
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    return gloader.load(download_path)

# 300-dimensional GloVe vectors, used for the OOV analysis and the embedding matrix.
glove_embeddings = download_glove_emb()

In [None]:
def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors,words: list[str]):
    """Return the words that have no vector in ``embedding_model``.

    Args:
        embedding_model: Embedding model, looked up with ``model[word]``.
        words: Words to look up (any iterable of words).

    Returns:
        List of the out-of-vocabulary words, in the iteration order of
        ``words``.
    """
    oov_words = []

    for word in words:
        try:
            embedding_model[word]
        # Only a failed lookup means "out of vocabulary"; the same two
        # exceptions are treated as a miss in build_embedding_matrix. Any
        # other error is a real problem and must not be swallowed.
        except (KeyError, TypeError):
            oov_words.append(word)

    return oov_words

# Set to True only to see how lower-casing changes the OOV count.
# TODO: must stay False in the final version.
lower = False

if lower:
    words = set([x.lower() for x in unique_words])
else:
    words = unique_words

oov_words = check_OOV_terms(glove_embeddings, words)

print("Total number of unique words in dataset:",len(words))

# Guard against an empty vocabulary so the report never divides by zero.
oov_percentage = (float(len(oov_words)) / len(words)) * 100 if words else 0.0
print("Total OOV terms: {0} ({1:.2f}%)".format(len(oov_words), oov_percentage))

# random.sample raises ValueError when asked for more items than the list
# holds, so cap the sample size at the number of OOV words.
print("Some OOV terms:",random.sample(oov_words, min(15, len(oov_words))))


In [None]:
def check_value_distribution_glove(glove: gensim.models.keyedvectors.KeyedVectors):
    """Print the largest and the smallest component found in the embedding table.

    Args:
        glove: Embedding model whose vectors are inspected.
    """
    # One reduction over the whole vector table instead of a Python-level
    # loop that fetches and reduces every row separately.
    vectors = glove.vectors

    max_v = np.max(vectors)
    min_v = np.min(vectors)

    print('Max value inside glove embeddings:',max_v)
    print('Min value inside glove embeddings:',min_v)

def build_embedding_matrix(emb_model: gensim.models.keyedvectors.KeyedVectors,
                           word2int: Dict[str, int]) -> np.ndarray:
    """Build the embedding matrix for the vocabulary and save it to disk.

    Row ``idx`` holds the vector that ``emb_model`` returns for the word
    mapped to ``idx``. Words the model cannot look up get a vector drawn
    uniformly from [-0.05, 0.05) with NumPy's global random generator.

    Side effects: prints the value range of the model's vectors and writes
    the matrix to ``<data_folder>/emb_matrix.npy``.

    Args:
        emb_model: Embedding model, looked up with ``model[word]``.
        word2int: Mapping from word to row index.

    Returns:
        ``float32`` array of shape ``(len(word2int), vector size)``.
    """
    check_value_distribution_glove(emb_model)

    # Ask the model for its dimensionality instead of measuring the vector
    # stored at integer index 0.
    embedding_dimension = emb_model.vector_size
    embedding_matrix = np.zeros((len(word2int), embedding_dimension), dtype=np.float32)

    for word, idx in word2int.items():
        try:
            embedding_vector = emb_model[word]
        except (KeyError, TypeError):
            # Out-of-vocabulary word: small random vector.
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector

    print('Saving emb matrix to .npy file')
    path = os.path.join(data_folder, "emb_matrix")
    # np.save appends the ".npy" extension; a plain float array needs no pickling.
    np.save(path, embedding_matrix, allow_pickle=False)

    return embedding_matrix

# Build (and save to disk) the embedding matrix aligned with the word2int ids.
embedding_matrix = build_embedding_matrix(glove_embeddings, word2int)

print("Embedding matrix shape: {}".format(embedding_matrix.shape))

In [None]:
# Sanity check: the matrix read back from disk must equal the one in memory.
emb_matrix_path = os.path.join(data_folder,'emb_matrix.npy')
# NOTE(review): allow_pickle=True is not needed to read a plain numeric array
# and permits code execution if the file is ever replaced by an untrusted one.
matrix = np.load(emb_matrix_path,allow_pickle=True)
np.array_equal(matrix,embedding_matrix)

In [None]:
#check that the tokenized dataframe and the index of embeddings matrix correspond 

def check_id_corr(int2word : Dict[int,str],glove: gensim.models.keyedvectors.KeyedVectors ):
    """Cross-check the saved token dataset against the saved embedding matrix.

    For every distinct token id found in ``token_dataset.pkl`` the row of
    ``emb_matrix.npy`` with that id must equal the vector ``glove`` returns
    for the word ``int2word`` maps the id to. Words that ``glove`` cannot
    look up are skipped and counted.

    Args:
        int2word: Mapping from token id to word.
        glove: Embedding model, looked up with ``model[word]``.

    Returns:
        Number of distinct words in the dataset that ``glove`` does not know.

    Raises:
        FileNotFoundError: If either saved file is missing.
        AssertionError: If a matrix row differs from the model's vector.
    """
    emb_matrix_path = os.path.join(data_folder,'emb_matrix.npy')
    token_dataset_path = os.path.join(data_folder,'token_dataset.pkl')

    # Fail with an explicit message instead of reaching the return statement
    # with the result variable never assigned.
    missing = [p for p in (emb_matrix_path, token_dataset_path) if not os.path.exists(p)]
    if missing:
        raise FileNotFoundError(
            "cannot cross-check ids, missing file(s): {}".format(", ".join(missing))
        )

    matrix = np.load(emb_matrix_path)
    # Unpickling can execute arbitrary code: only load files written by this notebook.
    dataframe = pd.read_pickle(token_dataset_path)

    # Each id needs to be verified once, however often it occurs in the corpus.
    unique_tokens = {token for token_sentence in dataframe['words_token'] for token in token_sentence}

    oov_words_double = set()
    for token in unique_tokens:
        matrix_row = matrix[token]
        word = int2word[token]
        try:
            model_vector = glove[word]
        except (KeyError, TypeError):
            oov_words_double.add(word)
            continue
        assert np.array_equal(matrix_row, model_vector), \
            "embedding mismatch for token {} ({!r})".format(token, word)

    return len(oov_words_double)
                
            

# Second, independent count of the OOV words, computed from the files saved
# on disk; it should match the number reported by the OOV analysis.
double_check_oov = check_id_corr(int2word,glove_embeddings)

print('Double check OOV number:',double_check_oov)



In [None]:
import gensim

