# 1. Crear dataframe por especialidad
### Directorio: df_specialty_title_abstract
### Columnas dataframe: (id, title_spa, abstract_spa) 

In [2]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [9]:
# Input directory that is walked for specialty XML files.
path_pubmed_xml_specialties = '../02_download_pubmed/specialties_case_report_xml/'
# Output directory where one CSV per specialty is written.
path_df_specialty_title_abstract = './dataframes/df_specialty_title_abstract_case_report'

In [4]:
def get_lists_pmid_title_abstract(root):
    """Collect PMID, Spanish title and Spanish abstract for each article.

    Iterates over the ``PubmedArticle`` children of ``root``. For each one it
    reads the ``PMID``, the ``VernacularTitle`` of the ``Article`` and the
    ``AbstractText`` sections of every ``OtherAbstract`` whose ``Language``
    attribute is ``'spa'``. Articles whose title is missing or shorter than
    two characters are skipped.

    Text is gathered with ``itertext()`` so that inline markup inside a title
    or abstract (for example ``<i>`` or ``<sup>``) does not truncate it, which
    is what reading ``.text`` alone would do.

    Args:
        root: Parsed XML element containing ``PubmedArticle`` children.

    Returns:
        Three parallel lists ``(pmids, titles, abstracts)``. The abstract is
        an empty string when no Spanish abstract is present; otherwise its
        sections are concatenated, each followed by a single space.
    """
    list_pmid = []
    list_title = []
    list_abstract = []

    # Fully upper-case titles encode accented vowels as an apostrophe followed
    # by the plain vowel; these pairs restore the accented character.
    accent_fixes = (
        ("'A", "Á"),
        ("'E", "É"),
        ("'I", "Í"),
        ("'O", "Ó"),
        ("'U", "Ú"),
    )

    for pubmed_article in root.findall('PubmedArticle'):

        pmid = ''
        title_spa = ''
        abstract_spa = ''

        for medline_citation in pubmed_article.findall('MedlineCitation'):
            pmid = medline_citation.find('PMID').text

            for article in medline_citation.findall('Article'):
                vernacular_title = article.find('VernacularTitle')
                if vernacular_title is None:
                    continue

                # itertext() also yields the text nested in inline elements.
                title_spa = ''.join(vernacular_title.itertext())

                if len(title_spa) > 1 and title_spa.isupper():
                    for plain, accented in accent_fixes:
                        title_spa = title_spa.replace(plain, accented)

            for other_abstract in medline_citation.findall('OtherAbstract'):
                if other_abstract.get('Language') != 'spa':
                    continue

                for abstract_text in other_abstract.findall('AbstractText'):
                    section = ''.join(abstract_text.itertext())
                    if section:
                        abstract_spa = abstract_spa + section + ' '

        # Only articles with a usable Spanish title are kept.
        if len(title_spa) > 1:
            list_pmid.append(pmid)
            list_title.append(title_spa)
            list_abstract.append(abstract_spa)

    print(len(list_pmid), len(list_title), len(list_abstract))

    return list_pmid, list_title, list_abstract

In [5]:
def read_xml_file(file_xml):
    """Parse the XML file at ``file_xml`` and return its root element."""
    return ET.parse(file_xml).getroot()

In [6]:
def create_dataframe_text_abstract(specialty_name, specialty_xml):
    """Build and save the (id, title, abstract) CSV for one specialty.

    Parses ``specialty_xml``, extracts the parallel PMID/title/abstract lists
    and writes them as ``<specialty_name>.csv`` inside
    ``path_df_specialty_title_abstract``.

    Args:
        specialty_name: Base name (without extension) of the output CSV.
        specialty_xml: Path of the XML file to read.
    """
    pmids, titles, abstracts = get_lists_pmid_title_abstract(
        read_xml_file(specialty_xml)
    )

    # The destination is computed once and reused for saving and reporting.
    out_csv = os.path.join(path_df_specialty_title_abstract, specialty_name) + '.csv'

    df = pd.DataFrame({'id': pmids, 'title': titles, 'abstract': abstracts})
    df.to_csv(out_csv)

    print("Guardado ", out_csv, ' Long: ', len(df))


In [7]:
def read_path_specialties_xml():
    """Create one title/abstract CSV for every XML file under the input tree.

    Walks ``path_pubmed_xml_specialties`` and, for each file with an ``.xml``
    extension, prints the specialty name (the file name without extension)
    and calls ``create_dataframe_text_abstract`` on it. Files with any other
    extension are skipped so stray entries are not handed to the XML parser.
    """
    for root, _dirs, list_files in os.walk(path_pubmed_xml_specialties):
        for specialty in list_files:
            specialty_name, extension = os.path.splitext(specialty)
            if extension.lower() != '.xml':
                continue

            print(specialty_name)

            # os.path.join stays correct for nested directories, where the
            # directory path yielded by os.walk has no trailing separator.
            create_dataframe_text_abstract(specialty_name, os.path.join(root, specialty))
            

In [10]:
# Run the XML-to-CSV extraction for every specialty file.
read_path_specialties_xml()

H02.403.810.494_orthopedics
515 515 515
Guardado  ./dataframes/df_specialty_title_abstract_case_report/H02.403.810.494_orthopedics.csv  Long:  515
H02.403.350_genetics_medical
4416 4416 4416
Guardado  ./dataframes/df_specialty_title_abstract_case_report/H02.403.350_genetics_medical.csv  Long:  4416
H02.403.645_palliative_medicine
104 104 104
Guardado  ./dataframes/df_specialty_title_abstract_case_report/H02.403.645_palliative_medicine.csv  Long:  104
H02.403.879_tropical_medicine
88 88 88
Guardado  ./dataframes/df_specialty_title_abstract_case_report/H02.403.879_tropical_medicine.csv  Long:  88
H02.403.429.675_pulmonary_medicine
530 530 530
Guardado  ./dataframes/df_specialty_title_abstract_case_report/H02.403.429.675_pulmonary_medicine.csv  Long:  530
H02.403.429.480_infectious_disease_medicine
257 257 257
Guardado  ./dataframes/df_specialty_title_abstract_case_report/H02.403.429.480_infectious_disease_medicine.csv  Long:  257
H02.403.720.750_preventive_medicine
7195 7195 7195
Guardad

# 3. Crear dataframe de n-gramas (formato largo: una fila por n-grama)
### Directorio: df_specialty_ngram
### Columnas dataframe: (id, ngram); para cada documento, ngram recorre en este orden: título-unigramas, título-bigramas, título-trigramas, abstract-unigramas, abstract-bigramas, abstract-trigramas

## Procesamiento de texto:
    1. Lowercase: pasar a minúscula si el término no está completamente en mayúsculas
    2. Unigramas: 
        2.1 borrar stopwords
        2.2 borrar puntuación
        2.3 borrar dígitos

In [8]:
import os
import pandas as pd
import os
import spacy
import ast
# spaCy pipeline loaded from the 'es_core_news_sm' model; used for tokenizing.
nlp = spacy.load('es_core_news_sm')
from nltk import everygrams
from nltk.corpus import stopwords
# NLTK Spanish stop-word list, held as a set for membership tests.
stop_words = set(stopwords.words('spanish')) 
import string

In [9]:
# Input directory: per-specialty CSVs read for their id, title and abstract columns.
path_df_specialty_title_abstract = './pubmed_files/dataframes/df_specialty_title_abstract'
# Output directory: per-specialty CSVs holding one n-gram per row.
path_df_specialty_xgram_title_abstract = './pubmed_files/dataframes/df_specialty_ngram'

In [10]:
def change_to_lowercase(term):
    """Lower-case ``term`` unless it is written entirely in upper case.

    Fully upper-case terms are returned untouched; anything else (including
    strings without cased characters) goes through ``str.lower``.
    """
    return term if term.isupper() else term.lower()

In [11]:
def to_lowercase(list_terms):
    """Apply ``change_to_lowercase`` to a list of unigrams or n-gram tuples.

    Strings are converted directly; tuples are rebuilt with every member
    converted. Items whose exact type is neither ``str`` nor ``tuple`` are
    left out of the result.

    Args:
        list_terms: Iterable of ``str`` and/or ``tuple`` of ``str``.

    Returns:
        A new list with the converted items, in the original order.
    """
    lowered = []

    for item in list_terms:
        kind = type(item)

        if kind == str:
            lowered.append(change_to_lowercase(item))
        elif kind == tuple:
            lowered.append(tuple(change_to_lowercase(part) for part in item))

    return lowered

In [12]:
def remove_stopword_punt_digit(list_tems):
    """Drop stop words, single punctuation characters and digit-only terms.

    A term is removed when it is a member of ``stop_words``, when it equals
    one of the characters in ``string.punctuation``, or when
    ``str.isdigit`` is true for it.

    Args:
        list_tems: List of string terms (unigrams).

    Returns:
        A new list with the surviving terms in their original order.
    """
    # Built once per call instead of once per term.
    punctuation = set(string.punctuation)

    return [
        term
        for term in list_tems
        if term not in stop_words
        and term not in punctuation
        and not term.isdigit()
    ]

In [13]:
def tokenize(text):
    """Split ``text`` into token strings with the module-level ``nlp`` pipeline.

    Missing values are treated as empty text: ``None`` and any float
    (including float subclasses) are replaced by ``''`` before tokenizing.

    Args:
        text: The text to tokenize, or a missing-value marker.

    Returns:
        List with the ``.text`` of every token, in document order.
    """
    # isinstance also covers float subclasses, which an exact type
    # comparison would let through to the pipeline.
    if text is None or isinstance(text, float):
        text = ''

    return [token.text for token in nlp(text)]

In [14]:
def obtain_grams(text, gram):
    """Return the n-grams of exactly ``gram`` tokens found in ``text``.

    Args:
        text: Text to tokenize with ``tokenize``.
        gram: Number of tokens per n-gram.

    Returns:
        List of tuples, one per n-gram.
    """
    tokens = tokenize(text)
    # Equal min_len and max_len restrict everygrams to a single size.
    return list(everygrams(tokens, min_len=gram, max_len=gram))

In [15]:
def create_new_df_xgrams(df):
    """Expand each document into one row per n-gram of its title and abstract.

    For every row of ``df`` the title and then the abstract are processed.
    Each text contributes, in this order: its unigrams (lower-cased, then
    filtered by ``remove_stopword_punt_digit``), its bigrams and its trigrams
    (lower-cased only).

    Each text is tokenized a single time and the bigrams and trigrams are
    derived from those same tokens, instead of re-running the tokenizer for
    every n-gram size.

    Args:
        df: DataFrame with ``'id'``, ``'title'`` and ``'abstract'`` columns.

    Returns:
        DataFrame with columns ``'id'`` and ``'ngram'``; the id is repeated
        once for each n-gram produced by that document.
    """
    list_id = []
    list_ngram = []

    for _, row in df.iterrows():
        file_id = row['id']
        row_ngrams = []

        # Title first, then abstract, to keep the established row order.
        for text in (row['title'], row['abstract']):
            tokens = tokenize(text)

            row_ngrams.extend(remove_stopword_punt_digit(to_lowercase(tokens)))

            for size in (2, 3):
                grams = list(everygrams(tokens, min_len=size, max_len=size))
                row_ngrams.extend(to_lowercase(grams))

        list_id.extend([file_id] * len(row_ngrams))
        list_ngram.extend(row_ngrams)

    return pd.DataFrame({'id': list_id, 'ngram': list_ngram})
    

In [16]:
def read_df_specialties_title_abstract():
    """Convert every title/abstract CSV into its n-gram CSV.

    Reads each ``.csv`` entry of ``path_df_specialty_title_abstract``,
    expands it with ``create_new_df_xgrams`` and writes the result under the
    same file name in ``path_df_specialty_xgram_title_abstract``. Entries
    without a ``.csv`` suffix are skipped, and the output directory is
    created if it does not exist yet.
    """
    # Ensure the destination exists before any expensive processing starts.
    os.makedirs(path_df_specialty_xgram_title_abstract, exist_ok=True)

    for specialty_name_csv in os.listdir(path_df_specialty_title_abstract):
        if not specialty_name_csv.endswith('.csv'):
            continue

        # The directory entry is already the bare file name, so there is no
        # need to split the joined path on a hard-coded separator.
        specialty_df = os.path.join(path_df_specialty_title_abstract, specialty_name_csv)

        df = pd.read_csv(specialty_df)
        print("Specialty: ", specialty_name_csv , ' --- Longitud del DF:', len(df))

        new_df = create_new_df_xgrams(df)

        file_out = os.path.join(path_df_specialty_xgram_title_abstract, specialty_name_csv)
        new_df.to_csv(file_out)
        print("Guardado ", file_out, ' Long: ', len(new_df))

   


In [17]:
# Generate the n-gram CSV for every specialty.
read_df_specialties_title_abstract()

Specialty:  physical-and-rehabilitation-medicine.csv  --- Longitud del DF: 313
Guardado  ./pubmed_files/dataframes/df_specialty_ngram/physical-and-rehabilitation-medicine.csv  Long:  18424
Specialty:  reproductive-medicine.csv  --- Longitud del DF: 1116
Guardado  ./pubmed_files/dataframes/df_specialty_ngram/reproductive-medicine.csv  Long:  41619
Specialty:  geriatrics.csv  --- Longitud del DF: 37772
Guardado  ./pubmed_files/dataframes/df_specialty_ngram/geriatrics.csv  Long:  1807467
Specialty:  behavioral-medicine.csv  --- Longitud del DF: 16
Guardado  ./pubmed_files/dataframes/df_specialty_ngram/behavioral-medicine.csv  Long:  535
Specialty:  integrative-medicine.csv  --- Longitud del DF: 186
Guardado  ./pubmed_files/dataframes/df_specialty_ngram/integrative-medicine.csv  Long:  13516
Specialty:  psychiatry.csv  --- Longitud del DF: 3009
Guardado  ./pubmed_files/dataframes/df_specialty_ngram/psychiatry.csv  Long:  139026
Specialty:  aerospace-medicine.csv  --- Longitud del DF: 409
G

# 4. Crear otro tipo de estructura: diccionario por especialidad (términos → documentos) serializado con pickle

In [21]:
import os
import pandas as pd
import pickle

In [22]:
# Directory with the per-specialty n-gram CSVs (columns 'id' and 'ngram' are read).
path_df_specialty_xgram_title_abstract = './pubmed_files/dataframes/df_specialty_ngram'

In [23]:
def read_df_specialties_grams():
    """Build and pickle a term-to-documents index for every specialty.

    For each file in ``path_df_specialty_xgram_title_abstract`` the ``'id'``
    and ``'ngram'`` columns are read and turned into::

        {'terms': {ngram: [id, id, ...]}, 'docs': [distinct ids]}

    The id list of a term keeps one entry per occurrence (duplicates
    included). The per-specialty dictionaries are stored under the CSV file
    name and the whole mapping is written to ``dic_specialties.pkl`` in the
    current working directory.

    The CSVs are read with ``keep_default_na=False`` so that n-grams whose
    text matches one of pandas' default missing-value markers (for example
    ``NA`` or ``null``) stay as strings instead of being turned into NaN.
    """
    dic_final = {}

    for specialty_name_csv in os.listdir(path_df_specialty_xgram_title_abstract):

        df = pd.read_csv(
            os.path.join(path_df_specialty_xgram_title_abstract, specialty_name_csv),
            keep_default_na=False,
        )

        print("Specialty: ", specialty_name_csv , ' --- Longitud del DF:', len(df))

        # Plain Python lists are far cheaper to iterate than DataFrame rows.
        list_docs = df['id'].tolist()
        list_ngrams = df['ngram'].tolist()

        dic_terms = {}
        for file_id, ngram in zip(list_docs, list_ngrams):
            dic_terms.setdefault(ngram, []).append(file_id)

        dic_specialty = {
            'terms': dic_terms,
            'docs': list(set(list_docs)),
        }

        print("Nº de términos en la especialidad:", len(dic_specialty['terms']))
        print("Nº de doc en la especialidad:", len(dic_specialty['docs']))
        dic_final[specialty_name_csv] = dic_specialty

    with open('dic_specialties.pkl', 'wb') as f:
        pickle.dump(dic_final, f)

In [24]:
# Build the per-specialty dictionaries and write them to the pickle file.
read_df_specialties_grams()

Specialty:  physical-and-rehabilitation-medicine.csv  --- Longitud del DF: 18424
Nº de términos en la especialidad: 13183
Nº de doc en la especialidad: 276
Specialty:  reproductive-medicine.csv  --- Longitud del DF: 41619
Nº de términos en la especialidad: 25751
Nº de doc en la especialidad: 1070
Specialty:  geriatrics.csv  --- Longitud del DF: 1807467
Nº de términos en la especialidad: 634311
Nº de doc en la especialidad: 35324
Specialty:  behavioral-medicine.csv  --- Longitud del DF: 535
Nº de términos en la especialidad: 482
Nº de doc en la especialidad: 15
Specialty:  integrative-medicine.csv  --- Longitud del DF: 13516
Nº de términos en la especialidad: 10486
Nº de doc en la especialidad: 165
Specialty:  psychiatry.csv  --- Longitud del DF: 139026
Nº de términos en la especialidad: 75584
Nº de doc en la especialidad: 2834
Specialty:  aerospace-medicine.csv  --- Longitud del DF: 18838
Nº de términos en la especialidad: 13046
Nº de doc en la especialidad: 381
Specialty:  neurology.c

### Prueba varios términos en especialidad:

In [25]:
def deserialize_object(path):
    """Load and return the object pickled in the file at ``path``.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling raises. After a successful load, a message with the last
    ``/``-separated component of ``path`` is printed.

    Args:
        path: Path of the pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as pickle_in:
        obj = pickle.load(pickle_in)

    print("Cargado el objeto", path.split("/")[- 1])
    return obj

In [27]:
# Load the specialties dictionary from its pickle file.
path_dic = 'dic_specialties.pkl'
dic = deserialize_object(path_dic)

Cargado el objeto dic_specialties.pkl
[26089279, 24008534, 25690141, 25619797, 25619797, 29428270, 29428270, 26032996, 25707329, 25707329, 24119683, 22424972, 22424972, 15839820, 29616681, 29616681, 29616681, 29616681, 29616681]


In [None]:
# Bigram keys are looked up by the string form of the tuple, e.g. "('de', 'la')".
print(dic['vaccinology.csv']['terms']["('de', 'la')"]) # 19 occurrences -> correct
print(dic['vaccinology.csv']['terms']["vacuna"]) # 13 occurrences -> correct



In [28]:
# Show the distinct document ids stored for this specialty.
print(dic['vaccinology.csv']['docs'])

[16756864, 16173697, 24119683, 22424972, 19100177, 23684502, 31148440, 25690141, 23540383, 30031656, 29616681, 11188908, 29428270, 26341041, 25172540, 26089279, 25707329, 12106566, 24294728, 15384265, 26613832, 22948426, 15839820, 30262223, 25619797, 24008534, 23416026, 26032996, 24582127, 24399343, 23807343]
