### References
Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O’Reilly Media Inc.

https://www.kaggle.com/alxmamaev/how-to-easy-preprocess-russian-text

https://python-school.ru/nlp-text-preprocessing/

https://pymorphy2.readthedocs.io/en/latest/user/guide.html

https://stackoverflow.com/a/49242754/13557629 (finding emojis)

https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#wordnetlemmatizer

In [1]:
import json
import regex
import yaml
from unidecode import unidecode
import logging
from logging import config
import numpy as np 
import pandas as pd
from datetime import datetime as dt
from time import sleep
from pathlib import Path
import processing

import spacy
spacy.require_gpu()
from torch.utils import dlpack

In [2]:
import importlib
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded; import it explicitly so the calls below cannot fail
# with AttributeError on a fresh kernel.
import importlib.util
import sys

# Load the package whose __init__.py sits two directories above this notebook
# and expose it under the name 'src'.
spec_src = importlib.util.spec_from_file_location(
    'src',
    '../../__init__.py')
m = importlib.util.module_from_spec(spec_src)
# Register the module before executing it so that imports of 'src' triggered
# while it runs resolve to this same module object.
sys.modules[spec_src.name] = m
spec_src.loader.exec_module(m)

from src import utils

In [3]:
# Free-text description of this run, handed to the project's logger helper.
desc = 'Processed format agreed on; process all the tweets'
# Logger used by the later processing cells (logger.info / logger.debug).
logger = utils.get_logger('process-all-tweets', desc=desc)

INFO:root:Opened config file at: /home/rimov/Documents/Code/NLP/lin-que-dropping/config/general_config.yml


In [4]:
"""
Reload module
"""
# Re-import utils so that edits made to the module on disk take effect in
# this kernel without a restart.
importlib.reload(utils)

<module 'src.utils' from '/home/rimov/Documents/Code/NLP/lin-que-dropping/src/analysis/processing/../../utils.py'>

In [5]:
# Default call: the log output shows general_config.yml being opened.
gen_conf = utils.get_config()
# 'p' selects the processing configuration (processing_config.yml in the log
# output); it supplies conf['spacy'] and conf['col_order'] used further down.
conf = utils.get_config('p')

INFO:root:Opened config file at: /home/rimov/Documents/Code/NLP/lin-que-dropping/config/general_config.yml
INFO:root:Opened config file at: /home/rimov/Documents/Code/NLP/lin-que-dropping/config/processing_config.yml


In [6]:
# Location of the Spanish verb-conjugation spreadsheet, taken from the
# general config's file_paths.verb_conjug entry (relative to the project root).
es_conj_path = utils.get_project_root()/gen_conf['file_paths']['verb_conjug']
es_conjugs = pd.read_excel(es_conj_path)
display(es_conjugs.head(2))

# Set of the values in the 'verb' column; the functions below test token
# lemmas for membership in this set.
es_verbs = set(es_conjugs['verb'].to_numpy())

Unnamed: 0,verb_type,verb,indicativo,imperativo,subjuntivo,gerundio,gerundio_compuesto,infinitivo,infinitivo_compuesto,participio_pasado
0,Stative,ver,veía visto verías vi vimos verían ves v...,vean ve vea veamos ved,veáis visto vieras vieren viesen veas vi...,viendo,visto,ver,visto,visto
1,Stative,jurar,jurarán juramos jurarías jurabas juraría ...,jurad jura juren jure juremos,jurare jurareis jurase jurara juraren jur...,jurando,jurado,jurar,jurado,jurado


In [11]:
# Base folders for Spanish Twitter data (non-test).
# NOTE(review): 'p' resolves to data/processed/... in the log output further
# down; 'c' is presumably the combined/cleaned input stage — confirm in utils.
std_path_c = utils.get_save_path('c', 'twitter', lang='es', is_test=False)
std_path_p = utils.get_save_path('p', 'twitter', lang='es', is_test=False)
# Input folders to read; each is expected to contain a tweets.csv.
data_folders = ['07112021-at-2210-combined', '20210726-combined']
data_paths = [std_path_c/path for path in data_folders]

In [17]:
# Output folders are partitioned by the date/time at which processing was done.
# The timestamp is taken once here, so every file of this run shares it.
save_date = utils.get_str_datetime_now()
# Name of the folder in which this run's processed data is saved.
save_folder = 'combined-2021-07-26-and-11-07'

INFO:root:Opened config file at: /home/rimov/Documents/Code/NLP/lin-que-dropping/config/general_config.yml


In [18]:
# Destination for the processed batches, built from the processed-data base
# folder, the save folder name and the run timestamp.
# NOTE(review): presumably make_dir also creates the directory — confirm in utils.
save_path = utils.make_dir(std_path_p, save_folder, save_date)

In [14]:
# Read each folder's tweets.csv ('~'-separated, '\n' line terminator) into
# its own dataframe.
tweets = [utils.get_csv('twitter', data_path/'tweets.csv', sep='~', lineterminator='\n') for data_path in data_paths]
print(f'Dataframes opened: {len(tweets)}')

for t in tweets:
    display(t.head(2))
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() only adds a stray "None" to the cell output.
    t.info()

INFO:root:Opened config file at: /home/rimov/Documents/Code/NLP/lin-que-dropping/config/general_config.yml
INFO:root:Opened config file at: /home/rimov/Documents/Code/NLP/lin-que-dropping/config/general_config.yml


Dataframes opened: 2


Unnamed: 0,created_at,text_orig,author_id,lang,tweet_id,tweet_place_id,referenced_tweets,mentions,text_norm,retweet_reply_like_quote
0,2021-11-08 03:15:45+00:00,Esta derrota de Quindio confirma que el Superd...,141323312.0,es,1.457547e+18,0116b409205a5237,,,Esta derrota de Quindio confirma que el Superd...,"(0, 0, 4, 0)"
1,2021-11-08 03:15:11+00:00,Muajaja ese broder confirmó lo q les dije... L...,49454158.0,es,1.457547e+18,011455904ec2ab81,,,Muajaja ese broder confirmo lo q les dije... L...,"(0, 0, 0, 0)"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 408362 entries, 0 to 408361
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   created_at                408362 non-null  object 
 1   text_orig                 408362 non-null  string 
 2   author_id                 408362 non-null  float64
 3   lang                      408362 non-null  object 
 4   tweet_id                  408362 non-null  float64
 5   tweet_place_id            408358 non-null  string 
 6   referenced_tweets         253215 non-null  string 
 7   mentions                  259972 non-null  string 
 8   text_norm                 408362 non-null  string 
 9   retweet_reply_like_quote  408362 non-null  string 
dtypes: float64(2), object(2), string(6)
memory usage: 31.2+ MB


None

Unnamed: 0,referenced_tweets,lang,text_orig,author_id,tweet_id,created_at,tweet_place_id,mentions,text_norm,retweet_reply_like_quote
0,['1455906589513293832'],es,@elguisodebagre Pero es en todo.... sinó mira ...,166276984,1.455907e+18,2021-11-03 14:38:30+00:00,0a738ff13a08a7dd,"[{'start': 0, 'end': 15, 'username': 'elguisod...","Pero es en todo.... sino mira un ex club, que ...","(0, 0, 0, 0)"
1,,es,La #CC no ha escrito aún 1 carilla dl proyecto...,1442589489872834569,1.455907e+18,2021-11-03 14:38:11+00:00,014f394f11cda9e4,,La #CC no ha escrito aun 1 carilla dl proyecto...,"(0, 0, 0, 0)"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116025 entries, 0 to 116024
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   referenced_tweets         77510 non-null   string 
 1   lang                      116025 non-null  object 
 2   text_orig                 116025 non-null  string 
 3   author_id                 116025 non-null  int64  
 4   tweet_id                  116025 non-null  float64
 5   created_at                116025 non-null  object 
 6   tweet_place_id            116020 non-null  string 
 7   mentions                  79994 non-null   string 
 8   text_norm                 116025 non-null  string 
 9   retweet_reply_like_quote  116025 non-null  string 
dtypes: float64(1), int64(1), object(2), string(6)
memory usage: 8.9+ MB


None

In [20]:
# Small preview drawn from each input frame. This cell must run while `tweets`
# is still the list of per-folder frames (i.e. before they are concatenated).
# A fixed random_state keeps the preview reproducible and matches the seed
# used by the later tweets.sample(...) cells.
sample1 = tweets[0].sample(5, random_state=1)
sample2 = tweets[1].sample(5, random_state=1)

sample = pd.concat([sample1, sample2])
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 98417 to 77663
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   created_at                10 non-null     object 
 1   text_orig                 10 non-null     string 
 2   author_id                 10 non-null     float64
 3   lang                      10 non-null     object 
 4   tweet_id                  10 non-null     float64
 5   tweet_place_id            10 non-null     string 
 6   referenced_tweets         8 non-null      string 
 7   mentions                  8 non-null      string 
 8   text_norm                 10 non-null     string 
 9   retweet_reply_like_quote  10 non-null     string 
dtypes: float64(2), object(2), string(6)
memory usage: 880.0+ bytes


In [21]:
# Collapse the per-folder frames into a single dataframe.
# Test the type rather than len(): a one-element list must still be turned
# into a DataFrame for the cells below, and on a re-run `tweets` is already a
# DataFrame (where len() is the row count and pd.concat would raise).
if isinstance(tweets, list):
    tweets = pd.concat(tweets)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 524387 entries, 0 to 116024
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   created_at                524387 non-null  object 
 1   text_orig                 524387 non-null  string 
 2   author_id                 524387 non-null  float64
 3   lang                      524387 non-null  object 
 4   tweet_id                  524387 non-null  float64
 5   tweet_place_id            524378 non-null  string 
 6   referenced_tweets         330725 non-null  string 
 7   mentions                  339966 non-null  string 
 8   text_norm                 524387 non-null  string 
 9   retweet_reply_like_quote  524387 non-null  string 
dtypes: float64(2), object(2), string(6)
memory usage: 44.0+ MB


In [25]:
# Drop the columns not carried forward; the text is re-processed from
# 'text_orig' by the spaCy cells below.
# NOTE(review): re-running this cell raises KeyError because the columns are
# already gone.
tweets = tweets.drop(columns=['text_norm', 'lang'])

In [26]:
# Keep the first row for each tweet_id and renumber the index from 0.
# NOTE(review): the info() output above reports tweet_id as float64, which
# cannot represent every integer of this magnitude (~1.4e18) exactly, so
# distinct IDs may compare equal here — confirm how the column is parsed.
tweets = tweets.drop_duplicates(subset='tweet_id', ignore_index=True)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510186 entries, 0 to 510185
Data columns (total 8 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   created_at                510186 non-null  object 
 1   text_orig                 510186 non-null  string 
 2   author_id                 510186 non-null  float64
 3   tweet_id                  510186 non-null  float64
 4   tweet_place_id            510177 non-null  string 
 5   referenced_tweets         321333 non-null  string 
 6   mentions                  330252 non-null  string 
 7   retweet_reply_like_quote  510186 non-null  string 
dtypes: float64(2), object(1), string(5)
memory usage: 31.1+ MB


### Running through spaCy pipeline

In [23]:
# Load the Spanish pipeline named in the processing config with the 'ner'
# (Named Entity Recognizer) component disabled; the functions below only read
# text, pos_, dep_, lemma_ and is_stop from the tokens.
nlp_es = spacy.load(conf['spacy']['es'], disable=['ner'])

In [38]:
# Fixed-seed sample of 100 tweets for trying the functions below.
# NOTE(review): `sample` is rebound by a later cell with 1000 rows.
sample = tweets.sample(100, random_state=1)
sample.head()

Unnamed: 0,created_at,text_orig,author_id,tweet_id,tweet_place_id,referenced_tweets,mentions,retweet_reply_like_quote
411680,2021-11-05 05:11:21+00:00,@carlesenric @Macarena_Olona @ElsaGarciad no c...,255668300.0,1.456489e+18,cbdb0e7018443220,['1456402105124675586'],"[{'start': 0, 'end': 12, 'username': 'carlesen...","(0, 0, 7, 0)"
29860,2021-11-05 15:55:58+00:00,Coño vi Dune y me pareció un Star Wars aburrid...,69178590.0,1.456652e+18,01a9a39529b27f36,,,"(0, 0, 2, 0)"
235543,2021-10-30 17:57:49+00:00,"Mi cuerpo me odia , onda me siento como el ogt...",1367898000.0,1.454508e+18,0108c69f708ae783,,,"(0, 0, 0, 0)"
212760,2021-10-28 19:47:00+00:00,Me acuerdo de ese día y la cachetada que lleva...,8.200432e+17,1.453811e+18,0016b0ca4701a899,,,"(0, 0, 0, 0)"
498199,2021-11-05 00:20:11+00:00,@leuryma16 Dato las estrellas en hollywood tu ...,52854080.0,1.456416e+18,01fcc4a23f17e1ed,['1456413208034652163'],"[{'start': 0, 'end': 10, 'username': 'leuryma1...","(0, 1, 1, 0)"


In [28]:
def get_normd(tokenized):
    """Render a parsed tweet as a normalised, annotated string.

    ``tokenized`` is iterated token by token; each token must expose
    ``text``, ``pos_``, ``lemma_`` and ``dep_``. The pieces emitted are:

    * punctuation (``pos_ == 'PUNCT'``): the raw text, with no leading space;
    * ``que`` / ``q`` (compared after ``unidecode`` and lower-casing): the
      original text upper-cased;
    * verbs whose lemma is in the module-level ``es_verbs`` set: the text
      upper-cased, wrapped in ``<<...>>`` when ``dep_ == 'ccomp'``;
    * anything else: the text lower-cased.

    Every non-punctuation piece is preceded by one space, so the result
    starts with a space unless the first token is punctuation.
    """
    pieces = []

    for tok in tokenized:
        if tok.pos_ == 'PUNCT':
            pieces.append(tok.text)
        elif unidecode(tok.text).lower() in ('que', 'q'):
            pieces.append(' ' + tok.text.upper())
        elif tok.pos_ == 'VERB' and tok.lemma_ in es_verbs:
            marked = tok.text.upper()
            if tok.dep_ == 'ccomp':
                # Flag the verb heading a clausal complement.
                marked = f'<<{marked}>>'
            pieces.append(' ' + marked)
        else:
            pieces.append(' ' + tok.text.lower())

    return ''.join(pieces)

In [29]:
def has_ccomp(tokenized):
    """Report whether any token carries the ``ccomp`` dependency label.

    Returns the string ``'TRUE'`` or ``'FALSE'`` (not a bool).
    """
    for tok in tokenized:
        if tok.dep_ == 'ccomp':
            return 'TRUE'
    return 'FALSE'

In [31]:
def get_dep(tokenized):
    """Render each token with its dependency label.

    Punctuation is emitted as its raw text; every other token as
    ``<lower-cased text>[<dep_>]``. Each piece is preceded by one space, so a
    non-empty result starts with a space.
    """
    def _render(tok):
        if tok.pos_ == 'PUNCT':
            return f' {tok.text}'
        return f' {tok.text.lower()}[{tok.dep_}]'

    return ''.join(_render(tok) for tok in tokenized)

In [32]:
def get_pos(tokenized):
    """Render every token as ``<text>(<POS>)``, space-separated.

    The part-of-speech tag is upper-cased; the token text is left as is.
    """
    tagged = []
    for tok in tokenized:
        tagged.append(f'{tok.text}({tok.pos_.upper()})')
    return ' '.join(tagged)

In [33]:
def get_details(tokenized):
    """Render non-punctuation tokens as ``<text>(lemma,is_stop)``.

    The lemma is lower-cased; ``is_stop`` is formatted as it prints
    (``True`` / ``False``). Pieces are joined with single spaces.
    """
    described = []
    for tok in tokenized:
        if tok.pos_ == 'PUNCT':
            continue
        described.append(f'<{tok.text}>({tok.lemma_.lower()},{tok.is_stop})')
    return ' '.join(described)

In [81]:
def get_verbs(tokenized):
    """List the tracked verbs that occur in a parsed tweet.

    Collects the distinct lemmas of tokens tagged ``VERB`` whose lemma is in
    the module-level ``es_verbs`` set and returns them joined with ``', '``.
    The lemmas are sorted so that the same tweet always yields the same
    string; iterating the set directly gives an arbitrary order that can
    change from one kernel session to the next.

    Returns:
        The joined lemma string, or ``None`` when no tracked verb is present.
    """
    found = {t.lemma_ for t in tokenized if (t.pos_=='VERB') and (t.lemma_ in es_verbs)}
    verbs = ', '.join(sorted(found))
    return verbs if len(verbs)>0 else None

In [34]:
def have_verbs(df):
    """Keep only the rows whose ``text_orig`` contains a tracked verb.

    ``get_verbs`` is applied to every value of ``df['text_orig']``; rows for
    which it returns a missing value are dropped. The result has a fresh
    0..n-1 index.
    """
    verb_strings = df['text_orig'].apply(get_verbs)
    keep = verb_strings.notna()
    return df[keep].reset_index(drop=True)

In [83]:
def save_batch(tokenized: list, file_path, file_name):
    """Annotate a batch of parsed tweets and write it to ``<file_name>.csv``.

    Args:
        tokenized: list of dataframes, each with a ``tweet_id`` column and a
            ``text_orig`` column holding the parsed documents.
        file_path: destination folder, passed through to ``utils.save_csv``.
        file_name: output file name without the ``.csv`` extension.

    The frames are concatenated, rows without a tracked verb are dropped
    (``have_verbs``), and one derived column is computed per annotator. The
    saved column order is: verbs, tweet_id, text_orig, normalized, has_ccomp,
    dependencies, pos, details.
    """
    combined = pd.concat(tokenized, ignore_index=False)
    batch = have_verbs(combined)
    docs = batch['text_orig']

    # Output column name -> function applied to every parsed document.
    annotators = {
        'verbs': get_verbs,
        'normalized': get_normd,
        'has_ccomp': has_ccomp,
        'dependencies': get_dep,
        'pos': get_pos,
        'details': get_details,
    }
    derived = {label: docs.apply(fn).rename(label) for label, fn in annotators.items()}

    identity = batch.loc[:, ['tweet_id', 'text_orig']]
    trailing = [derived[label] for label in ('normalized', 'has_ccomp', 'dependencies', 'pos', 'details')]
    batch = pd.concat([derived['verbs'], identity, *trailing], axis=1)

    utils.save_csv(file_path, batch, file_name+'.csv')
#     utils.save_excel(file_path, batch, file_name+'.xlsx')

In [60]:
# Fixed-seed sample of 1000 tweets; rebinds the `sample` name used by the
# earlier 100-row cell.
sample = tweets.sample(1000, random_state=1)
sample.head()

Unnamed: 0,created_at,text_orig,author_id,tweet_id,tweet_place_id,referenced_tweets,mentions,retweet_reply_like_quote
411680,2021-11-05 05:11:21+00:00,@carlesenric @Macarena_Olona @ElsaGarciad no c...,255668300.0,1.456489e+18,cbdb0e7018443220,['1456402105124675586'],"[{'start': 0, 'end': 12, 'username': 'carlesen...","(0, 0, 7, 0)"
29860,2021-11-05 15:55:58+00:00,Coño vi Dune y me pareció un Star Wars aburrid...,69178590.0,1.456652e+18,01a9a39529b27f36,,,"(0, 0, 2, 0)"
235543,2021-10-30 17:57:49+00:00,"Mi cuerpo me odia , onda me siento como el ogt...",1367898000.0,1.454508e+18,0108c69f708ae783,,,"(0, 0, 0, 0)"
212760,2021-10-28 19:47:00+00:00,Me acuerdo de ese día y la cachetada que lleva...,8.200432e+17,1.453811e+18,0016b0ca4701a899,,,"(0, 0, 0, 0)"
498199,2021-11-05 00:20:11+00:00,@leuryma16 Dato las estrellas en hollywood tu ...,52854080.0,1.456416e+18,01fcc4a23f17e1ed,['1456413208034652163'],"[{'start': 0, 'end': 10, 'username': 'leuryma1...","(0, 1, 1, 0)"


In [None]:
# Metadata columns carried alongside each parsed document.
keep_cols = ['tweet_id', 'created_at', 'author_id', 'tweet_place_id']

# Tweets per output file; np.array_split produces `batches` near-equal chunks.
batch_size = 500
batches = int(np.ceil(tweets.shape[0]/batch_size))

for i, data in enumerate(np.array_split(tweets.loc[:, keep_cols + ['text_orig']], batches)):
    file_name = f'tweets-processed-{i}'

    # nlp.pipe streams the whole chunk through the pipeline in batches and
    # yields the Docs in input order, which avoids one pipeline call per row.
    docs = pd.Series(list(nlp_es.pipe(data['text_orig'])),
                     index=data.index, name='text_orig')

    # save_batch expects a list of frames whose 'text_orig' column holds the
    # parsed documents.
    procd = [pd.concat([data.loc[:, keep_cols], docs], axis=1)]
    save_batch(procd, file_path=save_path, file_name=file_name)

INFO:root:Saved dataframe (tweets-processed-0.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-1.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-2.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-3.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-4.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-5

INFO:root:Saved dataframe (tweets-processed-43.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-44.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-45.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-46.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-47.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-proces

INFO:root:Saved dataframe (tweets-processed-86.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-87.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-88.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-89.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-90.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-proces

INFO:root:Saved dataframe (tweets-processed-129.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-130.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-131.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-132.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-133.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-p

INFO:root:Saved dataframe (tweets-processed-172.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-173.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-174.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-175.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-176.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-p

INFO:root:Saved dataframe (tweets-processed-215.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-216.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-217.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-218.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-processed-219.csv) CSV into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-03-08 15:15:06/combined-2021-07-26-and-11-07
INFO:root:Saved dataframe (tweets-p

### Merging Processed Batches

In [18]:
# Re-import utils so that on-disk edits take effect.
# NOTE(review): the stored output below shows a different module path
# (twitter_connection.util.utils) than the utils loaded at the top of this
# notebook — this section looks like it was last run against an older layout.
importlib.reload(utils)

<module 'twitter_connection.util.utils' from '/home/rimov/Documents/Code/NLP/lin-que-dropping/processing/../twitter-connection/util/utils.py'>

In [19]:
# Every processed batch CSV found recursively under the save folder.
# Path.rglob() returns a one-shot generator: once a cell has iterated it,
# re-running that cell sees nothing (pd.concat then fails with "No objects to
# concatenate"). Materialise it as a sorted list so it can be reused and the
# read order is stable.
# NOTE(review): es_save_path is not defined in any visible cell — confirm it
# should not be save_path.
processed_tweets_path = sorted(Path(es_save_path).rglob('*processed*.csv'))

In [25]:
# Read every matched batch file and stack them into one frame with a fresh
# 0..n-1 index.
# NOTE(review): pd.concat raises "No objects to concatenate" when the path
# iterable yields nothing (no matching files, or an already-consumed generator).
# NOTE(review): other cells call utils.get_csv('twitter', path, ...) with a
# leading source argument — confirm this single-argument call matches the
# current utils.
processed_tweets = pd.concat([utils.get_csv(p) for p in processed_tweets_path]).reset_index(drop=True)

ValueError: No objects to concatenate

In [38]:
# State shared with the resumable processing loop:
# parsed frames waiting to be written to disk,
spacy_processed = []
# running count of tweets already pushed through the pipeline,
processed = 0
# and the number of batch files written so far (used in the file names).
saved = 0

In [None]:
# Manually write whatever is still buffered in spacy_processed, then empty
# the buffer.
# NOTE(review): with an empty buffer, pd.concat inside save_batch raises
# "No objects to concatenate".
save_batch(spacy_processed, save_path, f'tweets-processed-{saved}')
saved+=1
            
spacy_processed.clear()

In [26]:
# Column/dtype summary of the re-loaded processed batches.
processed_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298492 entries, 0 to 298491
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   verbs               298492 non-null  string 
 1   tweet_id            298492 non-null  float64
 2   dependencies        298492 non-null  string 
 3   lemma_pos_stopword  298492 non-null  string 
dtypes: float64(1), string(3)
memory usage: 9.1 MB


In [None]:
# Resumable variant of the processing loop: relies on the `spacy_processed`,
# `processed` and `saved` state initialised in an earlier cell.
err = 0
batch_size = 500
batches = int(np.ceil(tweets.shape[0]/batch_size))

logger.info(f'Running {tweets.shape[0]-processed} tweets through spaCy pipeline')
logger.debug(f'Batch size: {batch_size}, batches: {batches}')

for i, d in enumerate(np.array_split(tweets.loc[:, ['tweet_id', 'text_orig']], batches)):
    # Tweets already processed
    if i*batch_size < processed:
        continue

    try:
        # Parse 'text_orig': it is the text column selected above and the
        # column save_batch reads the documents from ('text_norm' is not part
        # of this slice, so indexing it raised KeyError for every chunk).
        spacy_processed.append(
            pd.concat([d['tweet_id'], d['text_orig'].apply(nlp_es)],
                      axis=1))

        processed+=batch_size
        logger.debug(f'Processed: {processed}')

        # True on the first chunk after `processed` crosses a multiple of 10000.
        if (processed%10000)<batch_size:
            logger.debug(f'Saving batch of {sum(p.shape[0] for p in spacy_processed)}')
            # Save progress and free up memory. save_batch takes
            # (tokenized, file_path, file_name); the previous name= keyword
            # did not exist and raised TypeError.
            save_batch(spacy_processed, save_path, f'tweets-processed-{saved}')
            saved+=1

            spacy_processed.clear()

    except Exception as e:
        # Best effort: report the failing chunk and carry on ...
        err+=1
        print(f'{i} is broken: {e.args}')

        # ... but give up after the third failure.
        if err>2:
            break

In [42]:
# Attach the original tweet columns to each processed row; a left join keeps
# every processed row.
# NOTE(review): the join key tweet_id is float64 in both frames (see the
# info() outputs), which cannot hold IDs of this size exactly — rows could be
# matched to the wrong tweet; confirm the ID dtype.
merged = pd.merge(processed_tweets, tweets, how='left', on='tweet_id')
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 298492 entries, 0 to 298491
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   verbs                     298492 non-null  string 
 1   tweet_id                  298492 non-null  float64
 2   dependencies              298492 non-null  string 
 3   lemma_pos_stopword        298492 non-null  string 
 4   created_at                298492 non-null  object 
 5   text_orig                 298492 non-null  object 
 6   author_id                 298492 non-null  float64
 7   lang                      298492 non-null  object 
 8   tweet_place_id            298491 non-null  object 
 9   referenced_tweets         180174 non-null  object 
 10  mentions                  185057 non-null  object 
 11  text_norm                 298492 non-null  object 
 12  retweet_reply_like_quote  298492 non-null  object 
dtypes: float64(2), object(8), string(3)
memory u

In [43]:
# Rename any misnamed columns
merged.rename(columns={'author_id': 'user_id'}, inplace=True)

In [44]:
# Select and order the columns as listed under 'col_order' in the processing
# config; columns not listed there are dropped.
merged = merged.loc[:, conf['col_order']]
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 298492 entries, 0 to 298491
Data columns (total 12 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tweet_id                  298492 non-null  float64
 1   verbs                     298492 non-null  string 
 2   text_orig                 298492 non-null  object 
 3   text_norm                 298492 non-null  object 
 4   dependencies              298492 non-null  string 
 5   lemma_pos_stopword        298492 non-null  string 
 6   retweet_reply_like_quote  298492 non-null  object 
 7   created_at                298492 non-null  object 
 8   user_id                   298492 non-null  float64
 9   tweet_place_id            298491 non-null  object 
 10  mentions                  185057 non-null  object 
 11  referenced_tweets         180174 non-null  object 
dtypes: float64(2), object(7), string(3)
memory usage: 29.6+ MB


In [45]:
# Write the merged frame next to the processed batches.
# NOTE(review): es_save_path is not defined in any visible cell, and
# save_batch passes a file name ending in '.csv' to utils.save_csv while this
# call does not — confirm both against the current utils.
utils.save_csv(es_save_path, merged, 'tweets-processed-combined')

### Breaking Up by Verb

In [16]:
# Re-import the local processing module so on-disk edits take effect.
importlib.reload(processing)

<module 'processing' from '/home/rimov/Documents/Code/NLP/lin-que-dropping/src/analysis/processing/processing.py'>

In [26]:
# Previously processed sample file from an earlier run (folder names are
# hard-coded to that run's timestamp and dataset).
data_path = utils.get_save_path('p', 'twitter', lang='es')/'2022-02-08 16:34:36'/'20210726'/'acordar'/'sample-acordar-tweets-processed.csv'
# NOTE: rebinds save_path — from here on it points at the sample's folder,
# not at the batch output folder created near the top of the notebook.
save_path = data_path.parent
data = utils.get_csv('twitter', data_path)

data.head(2)

INFO:root:Opened config file at: /home/rimov/Documents/Code/NLP/lin-que-dropping/config/general_config.yml


Unnamed: 0,verbs,tweet_id,text_orig,normalized,has_ccomp,dependencies,pos,details
0,"ver, querer",1.419695e+18,"@MicaaFerreiraa Si, si se puede! Quiero ver ju...","@micaaferreiraa si, si se puede! QUIERO VER j...",True,"@micaaferreiraa[ROOT] si[ROOT] , si[mark] se[...",@MicaaFerreiraa(PROPN) Si(INTJ) si(SCONJ) se(P...,"<@MicaaFerreiraa>(@micaaferreiraa,False) <Si>(..."
1,ver,1.419695e+18,Nunca he escuchado ninguna otra nacionalidad d...,nunca he escuchado ninguna otra nacionalidad ...,False,nunca[advmod] he[aux] escuchado[ROOT] ninguna...,Nunca(ADV) he(AUX) escuchado(VERB) ninguna(DET...,"<Nunca>(nunca,True) <he>(haber,True) <escuchad..."


In [27]:
# Split the processed rows by verb and write them as Excel files under
# save_path; the log output shows one file per verb, placed in a sub-folder
# named after the verb's type.
processing.save_by_verb(data, 'twitter', es_conj_path, save_path, 'excel')

INFO:root:Starting save of 20945 entries into /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-02-08 16:34:36/20210726/acordar
DEBUG:root:Separating by verbs: 
{'confesar', 'gritar', 'considerar', 'lamentar', 'saber', 'comprobar', 'conseguir', 'recordar', 'ordenar', 'entender', 'asegurar', 'acordar', 'pensar', 'ver', 'admitir', 'sentir', 'mostrar', 'recomendar', 'desear', 'afirmar', 'rogar', 'solicitar', 'confirmar', 'jurar', 'suspirar', 'creer', 'predecir', 'imaginar', 'querer', 'parecer', 'demostrar', 'pedir', 'negar', 'mencionar', 'ojala', 'mandar', 'prometer', 'suplicar', 'dudar', 'contar', 'responder', 'prever', 'temer', 'suponer', 'apostar', 'reclamar', 'decir', 'lograr', 'esperar', 'adivinar'}
DEBUG:root:Saving 31 entries of (confesar)
INFO:root:Saved dataframe (twitter-es-confesar-2022-02-09 01:46:50) xlsx into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-02-08 16:34:36/20210726/acordar/stative
DEBUG:r

INFO:root:Saved dataframe (twitter-es-demostrar-2022-02-09 01:46:50) xlsx into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-02-08 16:34:36/20210726/acordar/stative
DEBUG:root:Saving 799 entries of (pedir)
INFO:root:Saved dataframe (twitter-es-pedir-2022-02-09 01:46:50) xlsx into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-02-08 16:34:36/20210726/acordar/volitional
DEBUG:root:Saving 118 entries of (negar)
INFO:root:Saved dataframe (twitter-es-negar-2022-02-09 01:46:50) xlsx into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-02-08 16:34:36/20210726/acordar/epistemic
DEBUG:root:Saving 84 entries of (mencionar)
INFO:root:Saved dataframe (twitter-es-mencionar-2022-02-09 01:46:50) xlsx into: /home/rimov/Documents/Code/NLP/lin-que-dropping/data/processed/saved/twitter/es/2022-02-08 16:34:36/20210726/acordar/stative
DEBUG:root:Saving 0 entries of (ojala)
INFO:root:Saved da

In [47]:
# Turn the ', '-joined verb string of each row into a list of lemmas.
# Only values that are still strings are split, so re-running the cell leaves
# already-split lists untouched (applying .str.split to them again would not
# give the lists back).
merged['verbs'] = merged['verbs'].apply(lambda v: v.split(', ') if isinstance(v, str) else v)
merged.head(3)

Unnamed: 0,tweet_id,verbs,text_orig,text_norm,dependencies,lemma_pos_stopword,retweet_reply_like_quote,created_at,user_id,tweet_place_id,mentions,referenced_tweets
0,1.451193e+18,"[sentir, pedir]",".@CitroenEspana Cactus con 5,5 años. Me empiez...",".Cactus con 5,5 anos. Me empieza a salir oxido...",".Cactus(ROOT) con(case) 5,5(nummod) anos(nmod)...",.Cactus(.Cactus|PROPN|False) con(con|ADP|True)...,"(1, 1, 1, 0)",2021-10-21 14:26:05+00:00,397960900.0,731c9d11275a5436,"[{'start': 1, 'end': 15, 'username': 'CitroenE...",
1,1.451193e+18,[sentir],"Me toy bebiendo un té, y siento como que toy s...","Me toy bebiendo un te, y siento como que toy s...",Me(iobj) toy(ROOT) bebiendo(xcomp) un(det) te(...,Me(yo|PRON|True) toy(tar|VERB|False) bebiendo(...,"(0, 1, 0, 0)",2021-10-21 14:24:35+00:00,1.238228e+18,01fcc4a23f17e1ed,,
2,1.451192e+18,[sentir],El problema más grave que tiene hoy el Maestro...,El problema mas grave que tiene hoy el Maestro...,El(det) problema(nsubj) mas(advmod) grave(amod...,El(el|DET|True) problema(problema|NOUN|False) ...,"(0, 0, 0, 0)",2021-10-21 14:22:15+00:00,234931200.0,01d487de3c4e0807,,


In [52]:
# Write one Excel file per tracked verb, grouped into a folder per verb type.
# NOTE(review): processed_folder is not defined in any visible cell.
for verb in es_verbs:
    # Verb type (lower-cased) from the first conjugation-table row of this verb.
    type_rows = es_conjugs.loc[es_conjugs['verb']==verb, 'verb_type']
    vtype = type_rows.iloc[0].lower()

    # Rows whose verb list contains the current verb.
    has_verb = merged['verbs'].apply(lambda verbs: verb in set(verbs))
    df = merged[has_verb].copy()

    base_dir = utils.get_save_path('p', lang='es')
    path = base_dir/processed_folder/vtype

    utils.make_dir(path)
    utils.save_excel(path, df, f'twitter-es-{verb}-26-07-2021')