## 1. Longitud de título (en tokens) por documento por especialidad 

In [54]:
import os
import pandas as pd
from nltk import word_tokenize

In [61]:
def obtain_list_tokens(phrase):
    """Tokenize a Spanish phrase and normalise a few token shapes.

    Steps:
      1. Tokenize with NLTK's ``word_tokenize`` (language ``'spanish'``).
      2. Glue a ``%``, ``$`` or ``€`` token onto the token before it
         (``"75", "%"`` -> ``"75%"``). A symbol with nothing before it
         is left untouched.
      3. Split a leading ``¿`` / ``¡`` from the text attached to it
         (``"¿Cuándo"`` -> ``"¿", "Cuándo"``).

    Args:
        phrase: Text to tokenize.

    Returns:
        list[str]: The normalised tokens. Step 3 never emits an empty
        string: a token that is just ``¿`` or ``¡`` stays a single token.
    """
    joinable_symbols = ('%', '$', '€')
    opening_marks = ('¿', '¡')

    # Single pass: a symbol is appended to the last token already collected,
    # which is exactly "the token before it" after any earlier merges.
    merged_tokens = []
    for token in word_tokenize(phrase, language='spanish'):
        if token in joinable_symbols and merged_tokens:
            merged_tokens[-1] += token
        else:
            merged_tokens.append(token)

    news_tokens = []
    for token in merged_tokens:
        if token.startswith(opening_marks):
            news_tokens.append(token[0])
            # Only emit the remainder when there is one; otherwise a lone
            # mark would add an empty token and inflate the token count.
            if len(token) > 1:
                news_tokens.append(token[1:])
        else:
            news_tokens.append(token)

    return news_tokens

In [1]:
def get_long_of_title(df):
    """Map each document id in ``df`` to the number of tokens in its title.

    Args:
        df: DataFrame that has at least the columns ``id`` and ``title``.

    Returns:
        dict: ``{id: title_length_in_tokens}``, e.g. ``{73363625: 30}``.
        If an id occurs in several rows, the value from the last row is kept.
    """
    lengths_by_id = {}

    # Walk the frame row by row; the positional index is not needed.
    for _, record in df.iterrows():
        doc_id, title = record['id'], record['title']
        lengths_by_id[doc_id] = len(obtain_list_tokens(title))

    return lengths_by_id

In [72]:
def read_df_specialties_title_abstract(path_df_specialty_title_abstract, path_output_file):
    """Write the title length (in tokens) of every distinct document id.

    Every entry of ``path_df_specialty_title_abstract`` is read as a CSV and
    passed to ``get_long_of_title``. For each id not written before, one line
    ``<id><TAB><length>`` is appended to the output file.

    Args:
        path_df_specialty_title_abstract: Directory whose entries are the
            CSV files to read.
        path_output_file: Path of the text file to create (overwritten).

    Returns:
        None.
    """
    # A set gives O(1) "already written?" checks; the ids are dictionary keys
    # already, so they are hashable.
    seen_ids = set()

    # `with` closes the output file even if reading one of the CSVs fails.
    with open(path_output_file, "w") as fout:
        for specialty_file in os.listdir(path_df_specialty_title_abstract):
            specialty_path = os.path.join(path_df_specialty_title_abstract, specialty_file)

            # Read the dataframe (id and title) and measure every title.
            df = pd.read_csv(specialty_path)

            for idfile, long in get_long_of_title(df).items():
                # An id that was already written from an earlier file is skipped.
                if idfile in seen_ids:
                    continue
                fout.write(str(idfile) + '\t' + str(long) + '\n')
                seen_ids.add(idfile)

In [73]:
# Input directory (its entries are read as CSV files) and the output text file.
path_df_specialty_title_abstract = './dataframes/df_specialty_title_abstract'
path_output_file = 'len_titles.txt'
# Writes one "<id><TAB><title length in tokens>" line per distinct document id.
read_df_specialties_title_abstract(path_df_specialty_title_abstract, path_output_file)

## 3. Crear estructura para contener los n-gramas (unigramas, bigramas y trigramas) y el id del fichero

In [2]:
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
from nltk import word_tokenize
import spacy
# Spanish spaCy pipeline shared by the sentence-splitting and lemma helpers below.
nlp = spacy.load('es_core_news_sm')

import pandas as pd
import os
import string
import pickle
from nltk.util import ngrams
from nltk.corpus import stopwords
# Spanish stop words from NLTK; the helpers below test membership with `in`.
spa_stopwords = stopwords.words('spanish')

import re
from nltk.tokenize import WordPunctTokenizer
# Used to break a single token such as "1837-1907" into "1837", "-", "1907".
tokenize_intratokens = WordPunctTokenizer()

In [3]:
def change_to_lowercase(term):
    """Return ``term`` lower-cased; ``None`` is passed through unchanged."""
    return None if term is None else term.lower()

In [4]:
def obtain_list_tokens(phrase):
    """Tokenize a Spanish phrase and split two kinds of compound tokens.

    Steps:
      1. Tokenize with NLTK's ``word_tokenize`` (language ``'spanish'``).
      2. Split a leading ``¿`` / ``¡`` from the text attached to it
         (``"¿Cuándo"`` -> ``"¿", "Cuándo"``).
      3. Split tokens that start with a year-range pattern (four digits,
         a hyphen, four digits) using ``tokenize_intratokens``
         (``"1837-1907"`` -> ``"1837", "-", "1907"``). Tokens such as
         ``"ed-d69"`` or ``"31/02/2018"`` do not match and are kept whole.

    Gluing ``%``, ``$`` and ``€`` to the previous token (``75%``, ``65€``)
    is disabled in this version of the function.

    Args:
        phrase: Text to tokenize.

    Returns:
        list[str]: The resulting tokens. Step 2 never emits an empty string:
        a token that is just ``¿`` or ``¡`` stays a single token.
    """
    opening_marks = ('¿', '¡')

    marks_split = []
    for token in word_tokenize(phrase, language='spanish'):
        if token.startswith(opening_marks):
            marks_split.append(token[0])
            # Only emit the remainder when there is one, so a lone mark
            # does not produce an empty token.
            if len(token) > 1:
                marks_split.append(token[1:])
        else:
            marks_split.append(token)

    news_tokens = []
    for token in marks_split:
        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        # re.match anchors only at the start of the token (prefix match).
        if re.match(r"\d{4}-\d{4}", token):
            news_tokens.extend(tokenize_intratokens.tokenize(token))
        else:
            news_tokens.append(token)

    return news_tokens

In [16]:
def obtain_sentence_with_lemma(phrase):
    """Return ``phrase`` with every token replaced by its lemma.

    The sentence is rebuilt token by token (lemma followed by the token's
    trailing whitespace) rather than with successive ``str.replace`` calls.
    ``str.replace`` substitutes every occurrence of the token text, including
    occurrences inside other words and inside lemmas inserted earlier.

    Args:
        phrase: Text to lemmatise with the spaCy pipeline ``nlp``.

    Returns:
        str: The lemmatised text with the original spacing between tokens.
    """
    doc = nlp(phrase)
    return "".join(token.lemma_ + token.whitespace_ for token in doc)

In [6]:
def set_custom_boundaries(doc):
    """Flag the token that follows each ``;``, ``!`` or ``?`` as a sentence start.

    Args:
        doc: Sequence of tokens exposing ``text``, ``i`` and ``is_sent_start``;
            it is modified in place.

    Returns:
        The same ``doc`` object.
    """
    boundary_marks = (";", "!", "?")

    # The final token is left out of the scan: nothing follows it.
    for current in doc[:-1]:
        if current.text in boundary_marks:
            following = doc[current.i + 1]
            following.is_sent_start = True

    return doc

# Register the custom boundary rule so it runs before the parser.
# The guard keeps this cell re-runnable: adding a component whose name is
# already in the pipeline raises an error.
# NOTE(review): passing the function object is the spaCy 2.x calling style.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe(set_custom_boundaries, before="parser")

def obtain_list_sentences(phrase):
    """Split ``phrase`` into sentences using the spaCy pipeline ``nlp``.

    Args:
        phrase: Text to segment.

    Returns:
        list[str]: The text of each sentence, in document order.
    """
    sentences = []
    for span in nlp(phrase).sents:
        sentences.append(span.text)
    return sentences

In [7]:
def how_many_stopword(tupla):
    """Count how many items of ``tupla`` appear in ``spa_stopwords``.

    Args:
        tupla: Iterable of tokens (for example an n-gram tuple).

    Returns:
        int: Number of items that are stop words; repeated items count
        once per occurrence.
    """
    return sum(1 for word in tupla if word in spa_stopwords)

In [33]:
# Punctuation reference: ASCII punctuation plus the extra marks listed in the literal.
# NOTE: this is a str, so `token in punctuations` is a substring test. A
# multi-character token matches only if it appears contiguously in this string
# (e.g. "``" does), and the empty string always matches.
punctuations = string.punctuation + '«»¿¡``·‒–—―‘’‚“”„®©'
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~«»¿¡``·‒–—―‘’‚“”„®©


In [9]:
def how_many_punctuation(tupla):
    """Count how many items of ``tupla`` are punctuation tokens.

    An item counts when it is contained in the ``punctuations`` string or is
    one of the multi-character tokens ``...`` and ``''``.

    Args:
        tupla: Iterable of tokens (for example an n-gram tuple).

    Returns:
        int: Number of punctuation items, one per occurrence.
    """
    extra_marks = ('...', "''")
    return sum(
        1 for item in tupla
        if item in punctuations or item in extra_marks
    )

In [36]:
def generate_dic_specialty(df, list_terms_total):
    """Index the 1-, 2- and 3-grams found in the titles of one specialty.

    Each title is split into sentences and each sentence into lower-cased
    tokens (empty tokens are dropped). For every n-gram that passes the
    filters below, the document id is appended to that n-gram's list, once
    per occurrence.

    Filters:
      * unigram: kept unless it is punctuation (``how_many_punctuation``,
        which also covers ``...`` and ``''`` as the other filters do).
      * bigram: kept only if it has no stop word and no punctuation.
      * trigram: kept only if it has fewer than two stop words, its third
        word is not a stop word, and neither its first nor its third word
        is punctuation.

    4-grams and 5-grams are not generated.

    Args:
        df: DataFrame with at least the columns ``id`` and ``title``.
        list_terms_total: Running list of kept unigrams; it is extended in
            place and also returned.

    Returns:
        tuple: ``(dic_specialty, list_terms_total)`` where ``dic_specialty``
        is ``{'terms': {ngram: [ids...]}, 'docs': [distinct ids]}``.
        Unigram keys are plain strings; bigram and trigram keys are tuples.
    """
    dic_terms = {}
    list_docs_specialty = []

    for _, row in df.iterrows():
        file_id = row['id']
        list_docs_specialty.append(file_id)

        for sentence in obtain_list_sentences(row['title']):
            # Lower-case once up front so every n-gram size sees the same tokens.
            tokens = [
                change_to_lowercase(token)
                for token in obtain_list_tokens(sentence)
                if len(token) > 0
            ]

            # UNIGRAMS: each occurrence is recorded exactly once, and only
            # when the token is not punctuation.
            for token in tokens:
                if how_many_punctuation((token,)) == 0:
                    list_terms_total.append(token)
                    dic_terms.setdefault(token, []).append(file_id)

            # BIGRAMS: no stop word and no punctuation allowed.
            for tup in bigrams(tokens):
                if how_many_stopword(tup) == 0 and how_many_punctuation(tup) == 0:
                    dic_terms.setdefault(tup, []).append(file_id)

            # TRIGRAMS: at most one stop word, and never in last position;
            # punctuation is rejected at both ends but allowed in the middle.
            for tup in trigrams(tokens):
                first_word, _middle_word, last_word = tup
                if (how_many_stopword(tup) < 2
                        and last_word not in spa_stopwords
                        and how_many_punctuation((first_word, last_word)) == 0):
                    dic_terms.setdefault(tup, []).append(file_id)

    dic_specialty = {
        'terms': dic_terms,
        'docs': list(set(list_docs_specialty)),
    }

    return dic_specialty, list_terms_total
        

In [37]:
def read_df_specialties_title_abstract(path_df_specialty_title_abstract,
                                       path_output_pickle='dic_specialties_1_3_gramas.pkl',
                                       path_output_vocab='vocabulario.txt'):
    """Build the n-gram dictionary of every specialty and save the results.

    Every entry of ``path_df_specialty_title_abstract`` is read as a CSV and
    passed to ``generate_dic_specialty``. The per-specialty dictionaries are
    pickled together, and the accumulated unigram list is written one term
    per line.

    Args:
        path_df_specialty_title_abstract: Directory whose entries are the
            CSV files to read.
        path_output_pickle: Where to pickle ``{specialty_name: dic_specialty}``.
            Defaults to the file name this function has always written.
        path_output_vocab: Where to write the unigram list. Defaults to the
            file name this function has always written.

    Returns:
        None. Progress and corpus sizes are printed.
    """
    dic_final = {}
    list_terms_total = []  # Words in the corpus

    # Walk the list of specialties.
    for index, specialty_name_csv in enumerate(os.listdir(path_df_specialty_title_abstract)):
        # The directory entry is already the bare file name, so there is no
        # need to re-split the joined path on a hard-coded "/" separator.
        specialty_path = os.path.join(path_df_specialty_title_abstract, specialty_name_csv)
        specialty_name = specialty_name_csv.split(".csv")[0]

        df = pd.read_csv(specialty_path)
        print(index, "- specialty: ", specialty_name_csv , ' --- Longitud del DF:', len(df))

        dic_specialty, list_terms_total = generate_dic_specialty(df, list_terms_total)
        dic_final[specialty_name] = dic_specialty

    with open(path_output_pickle, 'wb') as f:
        pickle.dump(dic_final, f)

    # Explicit UTF-8: the terms come from Spanish text, and the platform
    # default encoding is not guaranteed to represent every character.
    with open(path_output_vocab, "w", encoding="utf-8") as f:
        for term in list_terms_total:
            f.write(str(term) + '\n')

    print("Nº de palabras en el corpus:", len(list_terms_total))
    print("Nº de palabras diferentes en el corpus:", len(set(list_terms_total)))

In [38]:
# Input directory; its entries are read as CSV files.
path_df_specialty_title_abstract = './dataframes/df_specialty_title_abstract'
# Builds the n-gram dictionary, then writes the pickle and the vocabulary file.
read_df_specialties_title_abstract(path_df_specialty_title_abstract)

0 - specialty:  H02.403.340_general_practice.csv  --- Longitud del DF: 219
1 - specialty:  H02.403.429.515_medical_oncology.csv  --- Longitud del DF: 24946
2 - specialty:  H02.403.330_forensic_medicine.csv  --- Longitud del DF: 371
3 - specialty:  H02.403.810.468_ophthalmology.csv  --- Longitud del DF: 4697
4 - specialty:  H02.403.810.788_surgery_plastic.csv  --- Longitud del DF: 1174
5 - specialty:  H02.403.429.480_infectious_disease_medicine.csv  --- Longitud del DF: 5283
6 - specialty:  H02.403.600_neurology.csv  --- Longitud del DF: 23912
7 - specialty:  H02.403.763_reproductive_medicine.csv  --- Longitud del DF: 1627
8 - specialty:  H02.403.690_psychiatry.csv  --- Longitud del DF: 3071
9 - specialty:  H02.403.429.730_rheumatology.csv  --- Longitud del DF: 3428
10 - specialty:  H02.403.740_radiology.csv  --- Longitud del DF: 2084
11 - specialty:  H02.403.044.500_immunochemistry.csv  --- Longitud del DF: 7490
12 - specialty:  H02.403.429.580_nephrology.csv  --- Longitud del DF: 7601

## 4. Obtenemos los ids de documentos de cada especialidad

In [2]:
import os
import pickle

In [3]:
def deserialize_object(path):
    """Load and return the object pickled at ``path``.

    Args:
        path: Path of the pickle file to read.

    Returns:
        The unpickled object. A confirmation line with the last
        ``/``-separated component of ``path`` is printed after loading.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files you trust.
    # `with` closes the handle even if unpickling fails.
    with open(path, "rb") as pickle_in:
        obj = pickle.load(pickle_in)
    print("Cargado el objeto", path.split("/")[- 1])
    return obj

In [24]:
def read_diccionary_specialties(dic_spe, path_out_file):
    """Write the document ids of every specialty to a text file.

    One line is written per specialty: the specialty name, a tab, and the
    ``str()`` of its ``'docs'`` list.

    Args:
        dic_spe: Mapping ``{specialty: {'docs': [...], ...}}``.
        path_out_file: Path of the text file to create (overwritten).

    Returns:
        None. A confirmation line is printed once the file is written.

    Raises:
        KeyError: If a specialty entry has no ``'docs'`` key.
    """
    # `with` closes the file even if an entry is missing its 'docs' key.
    with open(path_out_file, "w") as fout:
        for specialty, dic in dic_spe.items():
            list_doc_specialty = dic['docs']
            fout.write(specialty + '\t' + str(list_doc_specialty) + '\n')
    print("Fichero escrito: ", path_out_file)

In [5]:
# Pickle to load and text file to generate.
# NOTE(review): the n-gram cell in this notebook writes
# 'dic_specialties_1_3_gramas.pkl', not 'dic_specialties.pkl' — confirm which
# file this cell is meant to read.
path_dic = 'dic_specialties.pkl'
path_out_file = 'idfiles_by_specialty.txt'
dic_spe = deserialize_object(path_dic)

Cargado el objeto dic_specialties.pkl


In [25]:
# Writes one "<specialty><TAB><list of document ids>" line per specialty.
read_diccionary_specialties(dic_spe, path_out_file)

Fichero escrito:  idfiles_by_specialty.txt
