## 1. Longitud de título (en tokens) por documento por especialidad 

In [54]:
import os
import pandas as pd
from nltk import word_tokenize

In [61]:
def obtain_list_tokens(phrase):
    """Tokenize a Spanish phrase and adjust a few symbols after tokenization.

    Two adjustments are applied on top of ``word_tokenize``:

    * a standalone ``%``, ``$`` or ``€`` token is glued onto the token that
      precedes it (``['50', '%']`` becomes ``['50%']``); a symbol in first
      position has nothing to attach to and is kept as is;
    * every leading ``¿`` / ``¡`` is split off into its own token
      (``'¿Qué'`` becomes ``'¿', 'Qué'``).

    Args:
        phrase: Text to tokenize.

    Returns:
        list: The adjusted tokens.  The splitting step never emits an
        empty-string token.
    """
    attachable = ('%', '$', '€')
    opening_marks = ('¿', '¡')

    # Single pass: attach the symbol to whatever was emitted last, if anything.
    merged = []
    for token in word_tokenize(phrase, language='spanish'):
        if token in attachable and merged:
            merged[-1] += token
        else:
            merged.append(token)

    result = []
    for token in merged:
        # Peel opening marks one at a time so that '¿¡Hola' is fully split and
        # a bare '¿' does not leave an empty-string token behind.
        while len(token) > 1 and token[0] in opening_marks:
            result.append(token[0])
            token = token[1:]
        result.append(token)

    return result

In [1]:
def get_long_of_title(df):
    """Map every document id in ``df`` to the token count of its title.

    Args:
        df: DataFrame with an ``id`` and a ``title`` column.

    Returns:
        dict: ``{id: number_of_tokens}``, e.g. ``{73363625: 30}``.  When an id
        appears in several rows, the count of the last such row is kept.
    """
    return {
        record['id']: len(obtain_list_tokens(record['title']))
        for _, record in df.iterrows()
    }

In [72]:
def read_df_specialties_title_abstract(path_df_specialty_title_abstract, path_output_file):
    """Write one ``<id>\\t<title length in tokens>`` line per distinct document.

    Every regular file found in ``path_df_specialty_title_abstract`` is read
    with ``pd.read_csv`` and passed to ``get_long_of_title``.  An id that shows
    up in more than one file is written only once, with the length obtained
    from the first file (in sorted name order) that contains it.

    Args:
        path_df_specialty_title_abstract: Directory holding the per-specialty
            CSV files.
        path_output_file: Destination text file; overwritten if it exists.
    """
    # A set keeps the "already written?" check O(1); a list made the whole
    # pass quadratic in the number of documents.
    seen_ids = set()

    # ``with`` guarantees the file is closed even if a CSV fails to parse.
    with open(path_output_file, "w", encoding="utf-8") as fout:
        # os.listdir returns entries in arbitrary order; sort so the output is
        # reproducible from run to run.
        for entry in sorted(os.listdir(path_df_specialty_title_abstract)):
            specialty_df = os.path.join(path_df_specialty_title_abstract, entry)

            # Skip sub-directories such as Jupyter's ``.ipynb_checkpoints``,
            # which pd.read_csv cannot open.
            if not os.path.isfile(specialty_df):
                continue

            # The DataFrame provides the id and the title of each document.
            df = pd.read_csv(specialty_df)

            for idfile, long in get_long_of_title(df).items():
                # Only the first occurrence of an id is written.
                if idfile in seen_ids:
                    continue
                fout.write(str(idfile) + '\t' + str(long) + '\n')
                seen_ids.add(idfile)

In [73]:
# Run section 1: read every file in the dataframes directory (presumably one
# CSV per specialty) and write "<id>\t<title length in tokens>" lines to
# len_titles.txt in the current working directory.
path_df_specialty_title_abstract = './dataframes/df_specialty_title_abstract'
path_output_file = 'len_titles.txt'
read_df_specialties_title_abstract(path_df_specialty_title_abstract, path_output_file)

## 2. Crear fichero de vocabulario

In [87]:
import os
import pandas as pd
from nltk import word_tokenize

In [88]:
def obtain_list_tokens(phrase):
    """Tokenize a Spanish phrase and adjust a few symbols after tokenization.

    Two adjustments are applied on top of ``word_tokenize``:

    * a standalone ``%``, ``$`` or ``€`` token is glued onto the token that
      precedes it (``['50', '%']`` becomes ``['50%']``); a symbol in first
      position has nothing to attach to and is kept as is;
    * every leading ``¿`` / ``¡`` is split off into its own token
      (``'¿Qué'`` becomes ``'¿', 'Qué'``).

    Args:
        phrase: Text to tokenize.

    Returns:
        list: The adjusted tokens.  The splitting step never emits an
        empty-string token.
    """
    attachable = ('%', '$', '€')
    opening_marks = ('¿', '¡')

    # Single pass: attach the symbol to whatever was emitted last, if anything.
    merged = []
    for token in word_tokenize(phrase, language='spanish'):
        if token in attachable and merged:
            merged[-1] += token
        else:
            merged.append(token)

    result = []
    for token in merged:
        # Peel opening marks one at a time so that '¿¡Hola' is fully split and
        # a bare '¿' does not leave an empty-string token behind.
        while len(token) > 1 and token[0] in opening_marks:
            result.append(token[0])
            token = token[1:]
        result.append(token)

    return result

In [94]:
def get_tokens(df):
    """Collect the tokens of every title in ``df`` into one flat list.

    Args:
        df: DataFrame with a ``title`` column.

    Returns:
        list: Tokens of all titles in row order, repetitions included.
    """
    return [
        token
        for _, record in df.iterrows()
        for token in obtain_list_tokens(record['title'])
    ]

In [104]:
def read_df_specialties_title_abstract(path_df_specialty_title_abstract, path_output_file):
    """Write the vocabulary of all titles, one distinct term per line.

    Every regular file found in ``path_df_specialty_title_abstract`` is read
    with ``pd.read_csv`` and tokenized through ``get_tokens``.  Each term is
    written the first time it is seen, so the output keeps first-occurrence
    order (files are visited in sorted name order).  The number of distinct
    terms and the total number of tokens are printed at the end.

    Args:
        path_df_specialty_title_abstract: Directory holding the per-specialty
            CSV files.
        path_output_file: Destination text file; overwritten if it exists.
    """
    # A set makes the membership test O(1).  With a list, every token was
    # compared against the whole vocabulary collected so far.
    seen_terms = set()
    tam_total_vocabulary = 0

    # Explicit UTF-8 so accented terms are written the same way on every
    # platform; ``with`` closes the file even if a CSV fails to parse.
    with open(path_output_file, "w", encoding="utf-8") as fout:
        # os.listdir returns entries in arbitrary order; sort so the output is
        # reproducible from run to run.
        for entry in sorted(os.listdir(path_df_specialty_title_abstract)):
            specialty_df = os.path.join(path_df_specialty_title_abstract, entry)

            # Skip sub-directories such as Jupyter's ``.ipynb_checkpoints``,
            # which pd.read_csv cannot open.
            if not os.path.isfile(specialty_df):
                continue

            df = pd.read_csv(specialty_df)
            tokens = get_tokens(df)
            tam_total_vocabulary += len(tokens)

            for term in tokens:
                # Only the first occurrence of a term is written.
                if term in seen_terms:
                    continue
                fout.write(str(term) + '\n')
                seen_terms.add(term)

    print("Longitud del vocabulario (sin repetidos):", len(seen_terms))
    print("Longitud del vocabulario (con repetidos):", tam_total_vocabulary)

In [105]:
# Run section 2: tokenize the titles of every file in the dataframes directory
# and write each distinct term, one per line, to vocabulary.txt in the current
# working directory.
path_df_specialty_title_abstract = './dataframes/df_specialty_title_abstract'
path_output_file = 'vocabulary.txt'
read_df_specialties_title_abstract(path_df_specialty_title_abstract, path_output_file)

Longitud del vocabulario (sin repetidos): 115129
Longitud del vocabulario (con repetidos): 6468959


## 3. Crear estructura para contener los n-gramas (unigramas, bigramas y trigramas) y el id del fichero

In [3]:
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
from nltk import word_tokenize, sent_tokenize
import pandas as pd
import os
import string
import pickle

In [4]:
def change_to_lowercase(term):
    """Return ``term`` lower-cased; ``None`` is passed through unchanged."""
    return None if term is None else term.lower()

In [5]:
def obtain_list_tokens(phrase):
    """Tokenize a Spanish phrase and adjust a few symbols after tokenization.

    Two adjustments are applied on top of ``word_tokenize``:

    * a standalone ``%``, ``$`` or ``€`` token is glued onto the token that
      precedes it (``['50', '%']`` becomes ``['50%']``); a symbol in first
      position has nothing to attach to and is kept as is;
    * every leading ``¿`` / ``¡`` is split off into its own token
      (``'¿Qué'`` becomes ``'¿', 'Qué'``).

    Args:
        phrase: Text to tokenize.

    Returns:
        list: The adjusted tokens.  The splitting step never emits an
        empty-string token.
    """
    attachable = ('%', '$', '€')
    opening_marks = ('¿', '¡')

    # Single pass: attach the symbol to whatever was emitted last, if anything.
    merged = []
    for token in word_tokenize(phrase, language='spanish'):
        if token in attachable and merged:
            merged[-1] += token
        else:
            merged.append(token)

    result = []
    for token in merged:
        # Peel opening marks one at a time so that '¿¡Hola' is fully split and
        # a bare '¿' does not leave an empty-string token behind.
        while len(token) > 1 and token[0] in opening_marks:
            result.append(token[0])
            token = token[1:]
        result.append(token)

    return result

In [6]:
def obtain_list_sentences(phrase):
    """Split ``phrase`` into sentences using NLTK's Spanish sentence model.

    The language is passed explicitly because ``sent_tokenize`` otherwise
    falls back to its English model, while word tokenization in this notebook
    is done with ``language='spanish'``.

    Args:
        phrase: Text to split.

    Returns:
        list: The sentences found in ``phrase``, in order.
    """
    return sent_tokenize(phrase, language='spanish')

In [7]:
def generate_dic_specialty(df):
    """Index the title n-grams of one specialty by the documents containing them.

    Args:
        df: DataFrame with an ``id`` and a ``title`` column.

    Returns:
        dict: A dictionary with two entries.

        ``'terms'`` maps each term to the list of ids of the documents whose
        title contains it, one entry per occurrence (so an id can repeat).
        Unigram keys are plain lower-cased strings; bigram and trigram keys
        are 2- and 3-tuples of lower-cased tokens in which ``None`` marks the
        padding around a sentence.

        ``'docs'`` is the list of distinct document ids seen in ``df``.
    """
    punctuation = set(string.punctuation)
    terms_index = {}
    doc_ids = []

    for _, record in df.iterrows():
        doc_id = record['id']
        doc_ids.append(doc_id)

        for sentence in obtain_list_sentences(record['title']):
            # Lower-case once; the same list feeds all three n-gram orders.
            lowered = [change_to_lowercase(tok) for tok in obtain_list_tokens(sentence)]

            # Unigrams: single-character punctuation and all-digit tokens are
            # left out.  The key is the bare string, not a 1-tuple.
            for word in lowered:
                if word in punctuation or word.isdigit():
                    continue
                terms_index.setdefault(word, []).append(doc_id)

            # Bigrams: built over every token (no filtering), padded with None
            # on both sides of the sentence.
            for first, second in bigrams(lowered, pad_right=True, pad_left=True):
                terms_index.setdefault((first, second), []).append(doc_id)

            # Trigrams: same treatment as bigrams.
            for first, second, third in trigrams(lowered, pad_right=True, pad_left=True):
                terms_index.setdefault((first, second, third), []).append(doc_id)

    return {'terms': terms_index, 'docs': list(set(doc_ids))}
        

In [10]:
def read_df_specialties_title_abstract(path_df_specialty_title_abstract,
                                       path_output_pickle='dic_specialties.pkl'):
    """Build the per-specialty n-gram index and pickle it to disk.

    Every regular file found in ``path_df_specialty_title_abstract`` is read
    with ``pd.read_csv`` and turned into a specialty dictionary by
    ``generate_dic_specialty``.  The results are gathered in one dictionary
    keyed by specialty name (the file name up to its ``.csv`` suffix), which
    is then serialized with ``pickle``.

    Args:
        path_df_specialty_title_abstract: Directory holding the per-specialty
            CSV files.
        path_output_pickle: Where to write the pickled dictionary.  Defaults
            to ``'dic_specialties.pkl'`` in the current working directory.
    """
    dic_final = {}

    # os.listdir returns entries in arbitrary order; sort so the dictionary is
    # built in the same order on every run.
    for specialty_name_csv in sorted(os.listdir(path_df_specialty_title_abstract)):
        specialty_df = os.path.join(path_df_specialty_title_abstract, specialty_name_csv)

        # Skip sub-directories such as Jupyter's ``.ipynb_checkpoints``,
        # which pd.read_csv cannot open.
        if not os.path.isfile(specialty_df):
            continue

        # The listdir entry already is the bare file name, so no path
        # splitting on "/" is needed (that broke with Windows separators).
        specialty_name = specialty_name_csv.split(".csv")[0]

        df = pd.read_csv(specialty_df)
        print("Specialty: ", specialty_name_csv , ' --- Longitud del DF:', len(df))

        dic_final[specialty_name] = generate_dic_specialty(df)

    with open(path_output_pickle, 'wb') as f:
        pickle.dump(dic_final, f)

## 4. Obtenemos los ids de documentos de cada especialidad

In [2]:
import os
import pickle

In [3]:
def deserialize_object(path):
    """Load and return the object pickled at ``path``.

    Prints a confirmation line with the file name once loading succeeds.

    Args:
        path: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only call this on pickles produced by a trusted source.
    # ``with`` closes the handle even when unpickling raises.
    with open(path, "rb") as pickle_in:
        obj = pickle.load(pickle_in)
    # basename copes with both "/" and the platform's own separator.
    print("Cargado el objeto", os.path.basename(path))
    return obj

In [24]:
def read_diccionary_specialties(dic_spe, path_out_file):
    """Write the document ids of every specialty to a tab-separated text file.

    Each output line is ``<specialty>\\t<str() of its 'docs' list>``.  A
    confirmation line is printed after the file has been written.

    Args:
        dic_spe: Dictionary mapping a specialty name to a dictionary that has
            a ``'docs'`` entry.
        path_out_file: Destination text file; overwritten if it exists.
    """
    # Explicit UTF-8 so specialty names with accents are written the same way
    # on every platform; ``with`` closes the file even if a write fails.
    with open(path_out_file, "w", encoding="utf-8") as fout:
        for specialty, dic in dic_spe.items():
            fout.write(specialty + '\t' + str(dic['docs']) + '\n')
    print("Fichero escrito: ", path_out_file)

In [5]:
# Run section 4: load the pickled specialties dictionary from the current
# working directory; the ids file path is used by the call in the next cell.
path_dic = 'dic_specialties.pkl'
path_out_file = 'idfiles_by_specialty.txt'
dic_spe = deserialize_object(path_dic)

Cargado el objeto dic_specialties.pkl


In [25]:
# Write one "<specialty>\t<list of document ids>" line per specialty.
read_diccionary_specialties(dic_spe, path_out_file)

Fichero escrito:  idfiles_by_specialty.txt
