In [38]:
import json
import regex
import yaml
import logging
import numpy as np 
import pandas as pd
import processing
from unidecode import unidecode
from time import sleep
from pathlib import Path

import spacy
spacy.require_gpu()
# spacy didn't work without this import (bug)
from torch.utils import dlpack

In [2]:
import importlib
import sys

spec_src = importlib.util.spec_from_file_location(
    'src', 
    '../../__init__.py')
m = importlib.util.module_from_spec(spec_src)
sys.modules[spec_src.name] = m
spec_src.loader.exec_module(m)

from src import utils

In [8]:
logger = utils.get_logger('processing-corpes-test')

In [7]:
"""
Reload module
"""
importlib.reload(processing)

<module 'processing' from '/home/rimov/Documents/Code/NLP/lin-que-dropping/src/analysis/processing/processing.py'>

In [4]:
gen_conf = utils.get_config()
conf = utils.get_config('p')

In [5]:
es_conj_path = utils.get_project_root()/gen_conf['file_paths']['verb_conjug']
cleaned_folder = '12062021'
cleaned_path = utils.get_save_path('c', 'corpes', lang='es', is_test=True)/cleaned_folder
processed_folder = cleaned_folder
processed_path = utils.get_save_path('p', 'corpes', lang='es', is_test=True)/processed_folder

In [6]:
es_conjugs = pd.read_excel(es_conj_path)
display(es_conjugs.head(2))

es_verbs = set(es_conjugs['verb'].to_numpy())

Unnamed: 0,verb_type,verb,indicativo,imperativo,subjuntivo,gerundio,gerundio_compuesto,infinitivo,infinitivo_compuesto,participio_pasado
0,Stative,ver,veía visto verías vi vimos verían ves v...,vean ve vea veamos ved,veáis visto vieras vieren viesen veas vi...,viendo,visto,ver,visto,visto
1,Stative,jurar,jurarán juramos jurarías jurabas juraría ...,jurad jura juren jure juremos,jurare jurareis jurase jurara juraren jur...,jurando,jurado,jurar,jurado,jurado


### Running through spaCy pipeline

In [10]:
# Disable the Named Entity Recognizer
nlp_es = spacy.load(conf['spacy']['es'], disable=['ner'])

In [49]:
def get_normd(tokenized):
    normd = ''
    
    for t in tokenized:
        token = unidecode(t.text).lower()
        
        if t.pos_=='PUNCT':
            normd+=f'{t.text}'
            continue
        
        if token=='que' or token=='q':
            normd+=f' {t.text.upper()}'
            continue
        
        if (t.pos_=='VERB') and (t.lemma_ in es_verbs) and (t.dep_=='ccomp'):
            normd+=f' <<{t.text.upper()}>>'
            continue
        
        normd+=f' {t.text.lower()}'
    
    return normd

In [55]:
def has_ccomp(tokenized):
    has = any([t.dep_=='ccomp' for t in tokenized])
    return 'TRUE' if has else 'FALSE'

In [11]:
def get_verbs(tokenized):
    verbs = ', '.join(set(t.lemma_ for t in tokenized if (t.pos_=='VERB') and (t.lemma_ in es_verbs)))
    return verbs if len(verbs)>0 else None

In [40]:
def get_dep(tokenized):
    deps = ''
    
    for t in tokenized:
        if t.pos_=='PUNCT':
            deps+=f' {t.text}'
            continue
        
        deps+=f' {t.text.lower()}[{t.dep_}]'
    
    return deps

In [13]:
def get_pos(tokenized):
    return ' '.join([f'{t.text}({t.pos_.upper()})' for t in tokenized if t.pos_!='PUNCT'])

In [41]:
def get_details(tokenized):
    return ' '.join([f'<{t.text}>({t.lemma_.lower()},{t.is_stop})' for t in tokenized if t.pos_!='PUNCT'])

In [15]:
def have_verbs(df):
    have = df['CONCORDANCIA'].apply(get_verbs).notna()
    return df.loc[have, :].reset_index(drop=True)

In [25]:
def save_batch(tokenized: list, file_path, file_name):
    batch = have_verbs(pd.concat(tokenized, ignore_index=False))
    
    verbs = batch['CONCORDANCIA'].apply(get_verbs).rename('verbs')
    normd = batch['CONCORDANCIA'].apply(get_normd).rename('normalized')
    ccomp = batch['CONCORDANCIA'].apply(has_ccomp).rename('has_ccomp')
    dep = batch['CONCORDANCIA'].apply(get_dep).rename('dependencies')
    pos = batch['CONCORDANCIA'].apply(get_pos).rename('pos')
    details = batch['CONCORDANCIA'].apply(get_details).rename('details')
    
    batch = pd.concat([verbs, batch['CONCORDANCIA'], normd, ccomp, dep, pos, details], axis=1)
    
    utils.save_csv(file_path, batch, file_name+'.csv')

In [41]:
utils.make_dir(processed_path)

PosixPath('/home/rimov/Documents/Code/NLP/lin-que-dropping/processing/../processing/saved/corpes/es/12062021')

In [54]:
finished = set()

In [72]:
finished

{'rogar', 'solicitar', 'suplicar'}

In [75]:
for verb in cleaned_path.iterdir():
    if verb.stem in finished:
        continue 
        
    save_path = processed_path/verb.stem
    utils.make_dir(save_path)
    
    for file in verb.iterdir():
        file_name = file.stem.lower()+'-processed'
        print(file_name)
        
        data = utils.get_csv(data_from='corpes', path=file, sep='\t')
        
        procd = [pd.concat([data.iloc[:, :-1], 
                            data.loc[:, 'CONCORDANCIA'].apply(nlp_es)], 
                           axis=1)]
        save_batch(procd, file_path=save_path, file_name=file_name)
    
    finished.add(verb.stem)
    print(f'Processed: {verb.stem}')

esperar-2051-corpes-2-7-21-sk-processed


b'Skipping line 5: expected 14 fields, saw 27\n'


esperar-2501-corpes-2-7-21-sk-processed


b'Skipping line 301: expected 14 fields, saw 27\n'


esperar-2601-corpes-2-7-21-sk-processed
esperar-1-corpes-2-7-21-sk-processed
esperar-1301-corpes-2-7-21-sk-processed
esperar-1451-corpes-2-7-21-sk-processed


b'Skipping line 148: expected 14 fields, saw 27\n'


esperar-2151-corpes-2-7-21-sk-processed


b'Skipping line 869: expected 14 fields, saw 27\n'


esperar-3651-corpes-2-7-21-sk-processed


b'Skipping line 889: expected 14 fields, saw 27\nSkipping line 962: expected 14 fields, saw 27\n'


esperar-4751-corpes-2-7-21-sk-processed
esperar-4701-corpes-2-7-21-sk-processed


b'Skipping line 30: expected 14 fields, saw 27\n'


esperar-3951-corpes-2-7-21-sk-processed


b'Skipping line 223: expected 14 fields, saw 27\n'


esperar-4451-corpes-2-7-21-sk-processed
esperar-3801-corpes-2-7-21-sk-processed
esperar-3551-corpes-2-7-21-sk-processed
esperar-4351-corpes-2-7-21-sk-processed


b'Skipping line 624: expected 14 fields, saw 27\n'


esperar-1751-corpes-2-7-21-sk-processed


b'Skipping line 833: expected 14 fields, saw 27\n'


esperar-4651-corpes-2-7-21-sk-processed


b'Skipping line 591: expected 14 fields, saw 27\n'


esperar-3601-corpes-2-7-21-sk-processed
esperar-951-corpes-2-7-21-sk-processed
esperar-1501-corpes-2-7-21-sk-processed
esperar-851-corpes-2-7-21-sk-processed
esperar-101-corpes-2-7-21-sk-processed
esperar-1351-corpes-2-7-21-sk-processed
esperar-3101-corpes-2-7-21-sk-processed


b'Skipping line 74: expected 14 fields, saw 27\nSkipping line 668: expected 14 fields, saw 40\n'


esperar-1201-corpes-2-7-21-sk-processed
esperar-2551-corpes-2-7-21-sk-processed
esperar-4951-corpes-2-7-21-sk-processed
esperar-501-corpes-2-7-21-sk-processed
esperar-1401-corpes-2-7-21-sk-processed


b'Skipping line 571: expected 14 fields, saw 27\nSkipping line 985: expected 14 fields, saw 27\n'


esperar-3351-corpes-2-7-21-sk-processed
esperar-651-corpes-2-7-21-sk-processed
esperar-351-corpes-2-7-21-sk-processed
esperar-1801-corpes-2-7-21-sk-processed
esperar-3201-corpes-2-7-21-sk-processed
esperar-2451-corpes-2-7-21-sk-processed


b'Skipping line 717: expected 14 fields, saw 27\n'


esperar-1151-corpes-2-7-21-sk-processed
esperar-1901-corpes-2-7-21-sk-processed
esperar-1701-corpes-2-7-21-sk-processed


b'Skipping line 299: expected 14 fields, saw 27\n'


esperar-4551-corpes-2-7-21-sk-processed
esperar-3251-corpes-2-7-21-sk-processed
esperar-2801-corpes-2-7-21-sk-processed


b'Skipping line 289: expected 14 fields, saw 27\nSkipping line 398: expected 14 fields, saw 27\n'


esperar-401-corpes-2-7-21-sk-processed
esperar-901-corpes-2-7-21-sk-processed


ValueError: Integer column has NA values in column 3

In [76]:
sample = utils.get_csv('corpes', processed_path/verb.stem/'esperar-1-corpes-2-7-21-sk-processed.csv')
sample.head(2)

Unnamed: 0,verbs,CONCORDANCIA,dependencies,lemma_pos_stopword
0,"esperar, contar",Nancy Ramos es una niña que necesita un transp...,Nancy[nsubj] Ramos[flat] es[cop] una[det] niña...,"<<Nancy>>(Nancy,PROPN,False) <<Ramos>>(Ramos,P..."
1,"esperar, querer","No. Yo quiero, amo y espero lo mismo.","No[ROOT] .[] Yo[nsubj] quiero[ROOT] ,[] amo[ad...","<<No>>(no,ADV,True) <<Yo>>(yo,PRON,True) <<qui..."


In [79]:
has_ccomp = sample['dependencies'].str.contains('ccomp')
sample = pd.concat([sample, has_ccomp], axis=1)
sample.head()

Unnamed: 0,verbs,CONCORDANCIA,dependencies,lemma_pos_stopword,dependencies.1
0,"esperar, contar",Nancy Ramos es una niña que necesita un transp...,Nancy[nsubj] Ramos[flat] es[cop] una[det] niña...,"<<Nancy>>(Nancy,PROPN,False) <<Ramos>>(Ramos,P...",False
1,"esperar, querer","No. Yo quiero, amo y espero lo mismo.","No[ROOT] .[] Yo[nsubj] quiero[ROOT] ,[] amo[ad...","<<No>>(no,ADV,True) <<Yo>>(yo,PRON,True) <<qui...",False
2,esperar,"Mañana mismo voy a Valencia, a la gala de Miss...",Mañana[advmod] mismo[advmod] voy[ROOT] a[case]...,"<<Mañana>>(mañana,ADV,False) <<mismo>>(mismo,A...",False
3,esperar,La preparo este año y el próximo año daré unos...,La[obj] preparo[ROOT] este[det] año[obl] y[cc]...,"<<La>>(él,PRON,True) <<preparo>>(preparar,VERB...",False
4,esperar,16. ¿Qué espera de esta nueva novela?,16[ROOT] .[] ¿[] Qué[obj] espera[ROOT] de[case...,"<<16>>(16,NUM,False) <<Qué>>(qué,PRON,True) <<...",False


In [78]:
utils.save_excel(test_path, sample, 'esperar-1-corpes-2-7-21-sk-processed.csv')

In [80]:
spacy.explain('PROPN')

'proper noun'

### TEMP

In [28]:
test_path = utils.get_save_path('c', 'corpes', lang='es', is_test=False)/'12062021'/'esperar'
test_file = 'ESPERAR-1-CORPES-2-7-21-SK.txt'

In [58]:
data = utils.get_csv('corpes', test_path/test_file, sep='\t')
data = data.reset_index()
data = data.rename({'index': 'id'}, axis=1)

data.head()

Unnamed: 0,id,BIBLIOGRAFÍA,AUTOR,TÍTULO,FECHA,CRITERIO,BLOQUE,MEDIO,SOPORTE,TEMA,PAÍS,ZONA,TIPOLOGÍA,NOTAS,CONCORDANCIA
0,0,Elmundo.es. Encuentro digital con Chayanne. ww...,,Elmundo.es. Encuentro digital con Chayanne,2001,Fecha de escritura,No ficción,Escrito,Web,"Artes, cultura y espectáculos",Puerto Rico,Antillas,Entrevista digital,,Nancy Ramos es una niña que necesita un transp...
1,1,Elmundo.es. Encuentro digital con Chayanne. ww...,,Elmundo.es. Encuentro digital con Chayanne,2001,Fecha de escritura,No ficción,Escrito,Web,"Artes, cultura y espectáculos",Puerto Rico,Antillas,Entrevista digital,,"No. Yo quiero, amo y espero lo mismo."
2,2,Elmundo.es. Encuentro digital con Chayanne. ww...,,Elmundo.es. Encuentro digital con Chayanne,2001,Fecha de escritura,No ficción,Escrito,Web,"Artes, cultura y espectáculos",Puerto Rico,Antillas,Entrevista digital,,"Mañana mismo voy a Valencia, a la gala de Miss..."
3,3,Elmundo.es. Encuentro digital con Chayanne. ww...,,Elmundo.es. Encuentro digital con Chayanne,2001,Fecha de escritura,No ficción,Escrito,Web,"Artes, cultura y espectáculos",Puerto Rico,Antillas,Entrevista digital,,La preparo este año y el próximo año daré unos...
4,4,Elmundo.es. Encuentro digital con Laura Esquiv...,,Elmundo.es. Encuentro digital con Laura Esquivel,2001,Fecha de escritura,No ficción,Escrito,Web,"Artes, cultura y espectáculos",México,México y Centroamérica,Entrevista digital,,16. ¿Qué espera de esta nueva novela?


In [59]:
def save_batch(tokenized: list, file_path, file_name):
    batch = have_verbs(pd.concat(tokenized, ignore_index=False))
    
    verbs = batch['CONCORDANCIA'].apply(get_verbs).rename('verbs')
    normd = batch['CONCORDANCIA'].apply(get_normd).rename('normalized')
    ccomp = batch['CONCORDANCIA'].apply(has_ccomp).rename('has_ccomp')
    dep = batch['CONCORDANCIA'].apply(get_dep).rename('dependencies')
    pos = batch['CONCORDANCIA'].apply(get_pos).rename('pos')
    details = batch['CONCORDANCIA'].apply(get_details).rename('details')
    
    batch = pd.concat([batch['id'], verbs, batch['CONCORDANCIA'], normd, ccomp, dep, pos, details], axis=1)
    
    utils.save_excel(file_path, batch, file_name+'.xlsx')

In [31]:
save_path = utils.get_save_path('p', 'corpes', is_test=True)
file_name = (save_path/test_file).stem.lower()

In [60]:
procd = [pd.concat([data.iloc[:, :-1], 
                    data.loc[:, 'CONCORDANCIA'].apply(nlp_es)], 
                    axis=1)]
save_batch(procd, file_path=save_path, file_name=file_name)