In [None]:
# Show the value of every top-level expression in a cell, not only the last one.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

Import libraries and fix the random seeds

In [None]:
import os
import sys

import torch
import random
import numpy as np

# One seed shared by every random number generator used in this notebook,
# so that a fresh "Restart & Run All" reproduces the same numbers.
SEED = 0

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

Download the data (dataset and GloVe embeddings)

In [None]:
#source of this code -> https://gist.github.com/hantoine/c4fc70b32c2d163f604a8dc2a050d5f6 

from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

# Remote zip archive of the dependency treebank hosted in the nltk_data repository.
dataset_url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

# Everything is stored under <current working directory>/data;
# the corpus itself goes into its own "dependency_treebank" sub-folder.
working_dir = os.getcwd()
print(f"Current work directory: {working_dir}")

data_folder = os.path.join(working_dir, "data")
dataset_folder = os.path.join(data_folder, "dependency_treebank")

# Create the data folder only when it is not there yet.
data_folder_missing = not os.path.exists(data_folder)
if data_folder_missing:
    os.makedirs(data_folder)

def download_and_unzip(url):
    """Download a zip archive and extract it into ``data_folder``.

    The download is skipped when the module-level ``dataset_folder`` already
    exists, so re-running the cell does not fetch the archive again.

    Args:
        url: Address of the zip archive; it is opened with ``urlopen``.

    Returns:
        None. The archive content is written below ``data_folder``.
    """
    # Guard clause: nothing to do when the target folder is already present.
    if os.path.exists(dataset_folder):
        print("the dataset has been already downloaded")
        return

    print("downloading to",dataset_folder)
    # The whole archive is read into memory before extraction.
    with urlopen(url) as response:
        payload = BytesIO(response.read())

    # The context manager closes the archive handle even if extraction fails.
    with ZipFile(payload) as archive:
        archive.extractall(path=data_folder)


# Fetch and extract the treebank archive; the function skips the download when dataset_folder already exists.
download_and_unzip(dataset_url)


In [None]:
import gensim
import gensim.downloader as gloader

def download_glove_emb(embedding_dimension=300):
    """Load the pre-trained GloVe embedding model through gensim's downloader.

    The previous version guarded the load with ``not os.path.exists(data_folder)``,
    which is always false once the data folder has been created, and it never
    returned the model, so nothing was ever loaded. No existence check is done
    here: the load is delegated entirely to ``gloader.load`` (see the
    gensim.downloader documentation for its download/caching behaviour).

    Args:
        embedding_dimension: Vector size used to build the model name
            ``glove-wiki-gigaword-<embedding_dimension>``; it must correspond
            to a model that the gensim downloader provides.

    Returns:
        The object returned by ``gloader.load`` for that model name.
    """
    model_name = "glove-wiki-gigaword-{}".format(embedding_dimension)
    return gloader.load(model_name)


# Bind the model at notebook level so that later cells (and the print below) can use it.
emb_model = download_glove_emb()
print(emb_model)

In [None]:
import os
import pandas as pd

#encode dataset in pandas dataframe 

def encode_dataset(dataset_name: str) -> "tuple[pd.DataFrame, set]":
    """Encode every document of the treebank into one sentence-per-row DataFrame.

    Each file in the module-level ``dataset_folder`` is read as a tab-separated
    table: column 0 is the token, column 1 its tag, column 2 is discarded.
    Blank lines mark sentence boundaries. The result is also written to
    ``<data_folder>/<dataset_name>.csv`` (for inspection) and ``.pkl``.

    Args:
        dataset_name: Base name, without extension, of the two saved files.

    Returns:
        A ``(dataframe, unique_tags)`` tuple. ``dataframe`` has one row per
        sentence with the columns ``split``, ``doc_id``, ``sentence_num``,
        ``words`` and ``tags``; ``unique_tags`` is the set of all tags seen.
    """
    import csv  # local import: only the QUOTE_NONE constant is needed

    dataframe_rows = []   # one dict per sentence, turned into a DataFrame once at the end
    unique_tags = set()

    # os.listdir() gives no ordering guarantee; sort so that the row order is reproducible.
    for doc in sorted(os.listdir(dataset_folder)):
        # assumes names shaped like "wsj_0001.dp", where characters 5..7 hold the document number
        doc_num = int(doc[5:8])
        doc_path = os.path.join(dataset_folder, doc)

        # Documents 1-100 are training data, 101-150 validation, the rest test.
        if doc_num <= 100:
            split = 'train'
        elif doc_num <= 150:
            split = 'val'
        else:
            split = 'test'

        with open(doc_path, mode='r', encoding='utf-8') as file:
            doc_df = pd.read_csv(
                file,
                sep='\t',
                header=None,
                skip_blank_lines=False,      # blank lines are the sentence separators
                dtype=str,                   # keep numeric-looking tokens such as "61" as text
                quoting=csv.QUOTE_NONE,      # a '"' token is a word, not the start of a quoted field
                keep_default_na=False,       # tokens like "null", "NA" or "N/A" are real words ...
                na_values=[""],              # ... only empty fields (blank lines) count as missing
            )

        doc_df = (
            doc_df
            .rename(columns={0: 'word', 1: "TAG", 2: "remove"})
            .drop("remove", axis=1)
            # every fully-empty row starts a new sentence: the running count is the sentence group
            .assign(group_num=lambda d: d.isnull().all(axis=1).cumsum())
            .dropna()
            .reset_index(drop=True)
        )

        unique_tags.update(doc_df['TAG'].unique())

        # groupby iterates the groups in increasing group_num, i.e. in document order
        for n, (_, sentence) in enumerate(doc_df.groupby('group_num')):
            dataframe_rows.append({
                "split": split,
                "doc_id": doc_num,
                "sentence_num": n,
                "words": sentence['word'].tolist(),
                "tags": sentence['TAG'].tolist(),
            })

    dataframe_path = os.path.join(data_folder, dataset_name)
    df_final = pd.DataFrame(dataframe_rows)
    df_final.to_csv(dataframe_path + ".csv")       # human-readable copy
    df_final.to_pickle(dataframe_path + ".pkl")    # copy that preserves the list columns

    return df_final, unique_tags


# Build the sentence-level dataframe and the set of distinct tags;
# encode_dataset also saves the dataframe as "dataset.csv" and "dataset.pkl" inside data_folder.
print("Encoding dataset...")
df, unique_tags = encode_dataset("dataset")
print("Encoding completed!")


In [37]:
# Sanity check: first rows of the encoded dataframe, number of distinct tags, and the tags themselves.
# The first two values are displayed (not only the last) because ast_node_interactivity is set to 'all'.
df.head()
len(unique_tags)
print(unique_tags)

Unnamed: 0,split,doc_id,sentence_num,words,tags
0,train,1,0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ..."
1,train,1,1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ..."
2,train,2,0,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP..."
3,train,3,0,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS..."
4,train,3,1,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V..."


45

{"''", 'RB', 'MD', 'PRP', ',', 'JJS', 'WP', 'CC', 'POS', 'FW', '``', 'EX', 'RBS', 'VBG', 'RP', 'SYM', 'JJ', 'CD', 'VBZ', 'NNPS', 'UH', 'JJR', '-RRB-', 'NNS', 'WDT', 'RBR', 'NNP', '.', 'WRB', 'VBP', 'NN', ':', '-LRB-', 'PRP$', 'WP$', 'DT', 'TO', 'VB', 'VBD', 'IN', 'VBN', '$', '#', 'PDT', 'LS'}
