In [48]:
import re
from collections import Counter

import pandas as pd
import regex
import spacy
spacy.require_gpu()
from torch.utils import dlpack

In [2]:
# `importlib.util` is a submodule and is not guaranteed to be loaded by a bare
# `import importlib`; import it explicitly (this also binds `importlib`, which
# is used below and for `importlib.reload`).
import importlib.util
import sys

# Load the project's top-level package from its __init__.py and register it
# under the name 'src' so that `from src import ...` works from this notebook.
# NOTE: the path is relative to the notebook's working directory.
spec_src = importlib.util.spec_from_file_location(
    'src', 
    '../../__init__.py')
m = importlib.util.module_from_spec(spec_src)
# Register before executing so imports performed inside the package resolve 'src'
sys.modules[spec_src.name] = m
spec_src.loader.exec_module(m)

from src import utils

In [3]:
# Fetch the config object from src.utils; 'p' presumably selects a config profile -- TODO confirm
conf = utils.get_config('p')

In [8]:
"""
Reload module
"""
importlib.reload(utils)

<module 'src.utils' from '/home/rimov/Documents/Code/NLP/lin-que-dropping/src/analysis/processing/../../utils.py'>

In [4]:
# Spreadsheet of POS-tagged tweets for the verb "acordar", located under the
# project's save directory (one path component per line for readability)
data_path = (
    utils.get_save_path('p', 'twitter', lang='es')
    / '2022-03-08 15:15:06'
    / 'combined-2021-07-26-and-11-07'
    / 'epistemic'
    / 'twitter-es-acordar-0-477.xlsx'
)
data = pd.read_excel(data_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477 entries, 0 to 476
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   verbs         477 non-null    object 
 1   tweet_id      477 non-null    float64
 2   text_orig     477 non-null    object 
 3   normalized    477 non-null    object 
 4   has_ccomp     477 non-null    bool   
 5   dependencies  477 non-null    object 
 6   pos           477 non-null    object 
 7   details       477 non-null    object 
dtypes: bool(1), float64(1), object(6)
memory usage: 26.7+ KB


In [5]:
# Peek at the POS-tagged text: each token is followed by its tag in parentheses, e.g. No(ADV)
data['pos'].head()

0    @Gerardoquinte(PROPN) @Noti90Minutos(PROPN) Ah...
1    @SnailArg(PROPN) No(ADV) estaría(AUX) tan(ADV)...
2    @mywordsworlds1(PROPN) No(ADV) sabes(VERB) la(...
3    @BrissioMauro(PROPN) La(DET) verdad(NOUN) me(P...
4    este(DET) año(NOUN) no(ADV) pienso(VERB) habla...
Name: pos, dtype: object

In [32]:
# Matches one POS-tagged word of the form token(TAG):
#   obj - everything before the last parenthesised tag (greedy, so a token that
#         itself contains parentheses keeps them)
#   pos - the tag: a run of word characters between the parentheses
pos_pat = r'(?P<obj>.+)\((?P<pos>\b\w+\b)\)'

In [45]:
def separate_by_pos(text, pattern=None):
    """
    Split POS-tagged text into a dictionary of token:POS pairs
    
    Each whitespace-separated word is expected to look like `token(POS)`,
    e.g. `sabes(VERB)`.
    
    @text: POS-tagged string, a pandas Series whose first value is such a
           string, or an already-split sequence of `token(POS)` words
    @pattern: regular expression with the named groups `obj` and `pos`;
              defaults to the notebook-level `pos_pat`
    
    Returns a dict mapping each token to its POS tag (None when the word does
    not match the pattern).
    
    !!! The result is keyed by token, so a token that occurs several times in
    the text appears only once (at the position of its first occurrence, with
    the tag of its last occurrence). Do not use it for position-sensitive
    analysis !!!
    """
    
    if pattern is None:
        # Resolved at call time so re-running the `pos_pat` cell takes effect
        pattern = pos_pat
    
    if isinstance(text, pd.Series):
        # Only the first value of the Series is processed
        text = text.values[0].split()
    elif isinstance(text, str):
        text = text.split()
        
    separated = dict()
    
    for word in text:
        match = re.search(pattern, word)
        
        if match is not None:
            separated[match['obj']] = match['pos']
        else:
            # regex couldn't find a match, add original word as key and None as value
            separated[word] = None
    
    return separated

### TODO: improve by checking the word's lemma to ensure any 'AUX' is a verb

In [61]:
def _tagged_tokens(text, pattern=None):
    """
    Return the (token, POS) pairs of a POS-tagged text in their original order
    
    Unlike a token-keyed dictionary, the list keeps repeated tokens, so the
    adjacency of words is preserved. Words that do not match the pattern are
    returned as (word, None).
    """
    
    if pattern is None:
        # Resolved at call time so re-running the `pos_pat` cell takes effect
        pattern = pos_pat
    
    if isinstance(text, pd.Series):
        # Only the first value of the Series is processed
        words = text.values[0].split()
    elif isinstance(text, str):
        words = text.split()
    else:
        words = text
    
    pairs = []
    
    for word in words:
        match = re.search(pattern, word)
        
        if match is not None:
            pairs.append((match['obj'], match['pos']))
        else:
            pairs.append((word, None))
    
    return pairs


def get_consecutive_verbs(text, pattern=None):
    """
    Find runs of consecutive verbs in POS-tagged text
    
    A run is a sequence of adjacent words tagged 'VERB' or 'AUX'. Words tagged
    'ADP' that follow a verb are kept inside the run (so a verb + preposition +
    verb sequence stays together) but do not count as verbs. Any other tag ends
    the run. A run is reported only if it contains at least two verbs.
    
    @text: POS-tagged string (`token(POS)` words separated by whitespace), a
           pandas Series whose first value is such a string, or an
           already-split sequence of such words
    @pattern: regular expression with the named groups `obj` and `pos`;
              defaults to the notebook-level `pos_pat`
    
    Returns a tuple (number of runs, list of runs), where each run is the list
    of its tokens in order.
    """
    
    # Holder for group of consecutive words of interest
    group = []
    verb_count = 0
    
    # All consecutive verbs
    all_consec = []
    
    # Walk the words as an ordered list of pairs: a token-keyed dict would drop
    # repeated tokens and make non-adjacent verbs look adjacent
    for token, pos in _tagged_tokens(text, pattern):
        if pos == 'VERB' or pos == 'AUX':
            group.append(token)
            verb_count += 1
            
        elif pos == 'ADP':
            # Prepositions only extend a run that has already started
            if verb_count > 0:
                group.append(token)
                
        else:
            # Any other tag closes the current run
            if verb_count >= 2:
                all_consec.append(group)
            
            group = []
            verb_count = 0
    
    # Flush a run that extends to the end of the text
    if verb_count >= 2:
        all_consec.append(group)
    
    return len(all_consec), all_consec

### TODO: give the `idx` parameter of `test_func` a default value (e.g. `-1` for single-output functions)

In [62]:
def test_func(func, idx, correct, **params):
    """
    Test a function for correct output
    
    @func: function to test
    @idx: index of function output to use; !!! use (-1) for single output !!!
    """
    out = func(**params)
    
    print(f'Function {func.__name__}(), returned: \n\t{out}')
    print(f'Desired output: \n\t{correct}')
    
    if idx != -1:
        out = func(**params)[idx]
    
    return out == correct

In [55]:
# Fixed-seed sample of 5 rows used as test cases; keep only the columns needed to inspect them
test_input = data.sample(5, random_state=1).loc[:, ['tweet_id', 'text_orig', 'pos', 'details']]
test_input.head()

Unnamed: 0,tweet_id,text_orig,pos,details
440,1.455554e+18,Se acuerdan del 6% que seguía defendiendo a Pi...,Se(PRON) acuerdan(VERB) del(ADP) 6%(SYM) que(P...,"<Se>(él,True) <acuerdan>(acordar,False) <del>(..."
404,1.455583e+18,@ClaraGrima @matematicasUS qué bonito! me acor...,@ClaraGrima(PROPN) @matematicasUS(PROPN) qué(D...,"<@ClaraGrima>(@claragrima,False) <@matematicas..."
34,1.453414e+18,Ya son pocos los que se acuerdan ..,Ya(ADV) son(AUX) pocos(PRON) los(DET) que(PRON...,"<Ya>(ya,True) <son>(ser,True) <pocos>(poco,Tru..."
331,1.455883e+18,@Jules20241 Buen punto. Entiendo q la izq esta...,@Jules20241(PROPN) Buen(ADJ) punto(NOUN) .(PUN...,"<@Jules20241>(@jules20241,False) <Buen>(buen,T..."
428,1.455561e+18,@pepitosincuenta @ArtolaMaru Tenemos que ser r...,@pepitosincuenta(ADJ) @ArtolaMaru(PROPN) Tenem...,"<@pepitosincuenta>(@pepitosincuenta,False) <@A..."


In [56]:
# Temporarily lift the column-width limit so the sampled text and POS strings are shown untruncated
with pd.option_context('display.max_colwidth', None):
    display(test_input['text_orig'])
    display(test_input['pos'])

440                                                                                                                                                                                Se acuerdan del 6% que seguía defendiendo a Piñera cuando saqueaban el país entero y el comía pizza? El mismo 6% que apoya al Cheezels
404                                                                                                                                                                   @ClaraGrima @matematicasUS qué bonito! me acordaste de cuando mi papá me regaló mi primer libro de estadística (era más largo que la biblia jaja) 😊
34                                                                                                                                                                                                                                                                                    Ya son pocos los que se acuerdan ..
331                                                   @Jul

440                                                                                                                                                                                                                                                                                                                                                                                      Se(PRON) acuerdan(VERB) del(ADP) 6%(SYM) que(PRON) seguía(VERB) defendiendo(VERB) a(ADP) Piñera(PROPN) cuando(SCONJ) saqueaban(VERB) el(DET) país(NOUN) entero(ADJ) y(CCONJ) el(DET) comía(VERB) pizza(NOUN) ?(PUNCT) El(DET) mismo(DET) 6%(SYM) que(PRON) apoya(VERB) al(ADP) Cheezels(PROPN)
404                                                                                                                                                                                                                                                                                                                                             

In [57]:
# Expected number of consecutive-verb groups for each sampled row, in the row order of test_input
correct_out = [1, 0, 0, 1, 2]

In [63]:
# Compare the group count returned by get_consecutive_verbs (output index 0)
# with the expected count for every sampled row
for i, (_, entry) in enumerate(test_input.iterrows()):
    print(f'!!!Entry {entry["tweet_id"]}!!!')
    
    passed = test_func(get_consecutive_verbs, 0, correct=correct_out[i], text=entry['pos'])
    print(passed)
    print('\n')

!!!Entry 1.455554113719046e+18!!!
Function get_consecutive_verbs(), returned: 
	(1, [['seguía', 'defendiendo', 'a']])
Desired output: 
	1
True


!!!Entry 1.455582584415601e+18!!!
Function get_consecutive_verbs(), returned: 
	(0, [])
Desired output: 
	0
True


!!!Entry 1.453413849474814e+18!!!
Function get_consecutive_verbs(), returned: 
	(0, [])
Desired output: 
	0
True


!!!Entry 1.455882760078377e+18!!!
Function get_consecutive_verbs(), returned: 
	(1, [['puede', 'querer', 'modificar']])
Desired output: 
	1
True


!!!Entry 1.455561244421751e+18!!!
Function get_consecutive_verbs(), returned: 
	(2, [['habrás', 'acordado', 'de'], ['siguen', 'pensando', 'en']])
Desired output: 
	2
True


