### References

<ul>
    <li>Trending twitter<br></li>
    https://twitter-trends.iamrohit.in/
    <li>Removing accented characters<br></li>
    https://stackoverflow.com/a/2633310/13557629
    <li>Importing package from filepath</li>
    https://stackoverflow.com/a/50395128/13557629
</ul>

In [1]:
import json
import re
import numpy as np 
import pandas as pd

In [2]:
from time import sleep
from unidecode import unidecode
import es_dep_news_trf
import pt_core_news_lg

In [3]:
import importlib.util as imp
import sys

spec = imp.spec_from_file_location(
    'twitter_connection', 
    '../twitter-connection/__init__.py')
twit = imp.module_from_spec(spec)
sys.modules[spec.name] = twit
spec.loader.exec_module(twit)

from twitter_connection import connection as tc
from twitter_connection import response as tr

In [4]:
'''
File path to the bearer token. Requires a prefix to identify the
  token, which is just 'PERSONAL $BEARER_TOKEN$' by default -- 
  can be specified upon initialization of TwitterConnection
'''
cred_path = r'../twitter-connection/credentials.txt'

In [5]:
# All the Portuguese-only verbs
pt_verbs = {'dizer', 'supor', 'duvidar', 'acreditar', 'achar', 'lembrar', 'recear', 'predizer', 'adivinhar', 'conjeturar', 'chutar', 'dar(se) conta', 'desejar', 'oxalá', 'tomara'}

In [6]:
with open('../extraction/verb-stem-clean.txt') as f:
    verb_stem = json.load(f)
    
verbs_volit = {vs[0]: vs[1] for vs in list(verb_stem.items())[len(verb_stem)-17:]}
es_verbs_volit = {v: verbs_volit[v] for v in (verbs_volit.keys() - pt_verbs)}

15

### Extraction

In [8]:
es_query_cond = 'lang:es has:geo -is:retweet -has:links '
pt_query_cond = 'lang:pt has:geo -is:retweet -has:links '
fields_tweet = 'tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets'
fields_expan = 'expansions=author_id,geo.place_id,entities.mentions.username'
fields_user = 'user.fields=created_at,location,public_metrics'
fields_place = 'place.fields=country'

In [9]:
es_conn = tc.TwitterConnection(
    is_archive=True,
    cred_prefix='PROF')

pt_conn = tc.TwitterConnection(
    is_archive=True,
    cred_prefix='PROF') 

In [10]:
es_conn.set_query(conditions=es_query_cond)
es_conn.set_fields(tweet=fields_tweet, 
                      expansions=fields_expan, 
                      user=fields_user,
                      place=fields_place)

pt_conn.set_query(conditions=pt_query_cond)
pt_conn.set_fields(tweet=fields_tweet, 
                      expansions=fields_expan, 
                      user=fields_user,
                      place=fields_place)

In [11]:
nlp_es = es_dep_news_trf.load()
nlp_pt = pt_core_news_lg.load()

In [12]:
def get_pos_tags(tokenized):
    return ' '.join([f'{t.text}-({t.pos_})' for t in tokenized if ((t.pos_!='PUNCT') and (t.pos_!='SPACE'))])

In [13]:
def lemmatize(tokenized):
    return ' '.join([t.lemma_ for t in tokenized])

In [14]:
def analyze(text, lang):
    # Tokenized
    text_nlp = text.loc[:, 'text'].apply(nlp_es if lang=='es' else nlp_pt)
    
    pos = text_nlp.apply(get_pos_tags).rename('pos')
    lemma = text_nlp.apply(lemmatize).rename('lemma')
    
    return pd.concat([text.loc[:, 'id'], pos, lemma], axis=1)

In [15]:
es_verbs = es_verb_stem

In [18]:
delinqs = ['ver', 'dar(se) cuenta']

for v in delinqs:
    es_verbs[v] = 0

es_verbs.values()

dict_values([0, 'jur', 'dig OR dec OR dij OR dir OR dic', 'confies OR confes', 'promet', 'muestr OR mostr', 'demuestr OR demostr', 'afirm', 'confirm', 'consig OR conseg', 'logr', 'respond', 'admit', 'consider', 'aesgur', 'mencion', 'grit', 'suspir', 'reclam', 'cont OR cuent', 'supon', 'sé OR sab OR sup OR sep', 'piens OR pens', 'imagin', 'dud', 'cre OR kre', 'recuerd OR record', 'acuerdo OR acord', 'tem', 'recomiend OR recomend', 'parec OR parezc OR pareç', 'entiend OR entend', 'neg OR nieg', 'apuest OR apost', 'predig OR predic OR predec OR predij', 'preve', 'sient OR sint OR sent', 'comprueb OR comprob OR comprov', 'adivin', 0, 'lament', 'rueg OR rog', 'quer OR quier OR quis', 'esper', 'dese', 'pid OR ped OR peç', 'ojalá OR ohalá', 'suplic OR supliq', 'solicit', 'mand', 'orden', 'insist', 'sugier OR suger OR sugir', 'preocup', 'alegr', 0])

In [19]:
for vs in es_verbs.items():
    if vs[1]==0:
        continue
    
    verb = vs[0]
    stems = '(' + vs[1] + ')'
    print(f'Retrieving tweets: {verb}\nTopics: {stems}')
    
    response = tr.Response()
    
    while es_conn.connect(stems, is_next=True, time_interval=1):
        if len(response.schema)==0:
            print(es_conn.url)
        
        new = tr.Response(es_conn.response)
        
        # Remove '@...' mentions
        new.schema['data'].loc[:, 'text'] = new.schema['data'].loc[:, 'text']\
            .str.replace(r'(@[\w]+ )', '', regex=True)\
            .apply(unidecode)
        
        text_analyzed = analyze(new.schema['data'].loc[:, ['id', 'text']], 'es')
        
        # Entries without desired verb
        no_verb = ~(text_analyzed.loc[:, 'lemma'].str.contains(verb))
        print(f'Found {no_verb.sum()} without "{verb}"')
        
        new.join(to='data', data=text_analyzed, on='id')
        
        new.schema['data'].drop(new.schema['data'].loc[no_verb, :].index, inplace=True)
        response.reset_index()       
            
        response.append(new)
        
        if response.schema['data'].shape[0] >= 200:
            break
    
    response.to_csv('es', verb)
    es_verbs[verb] = 0

Retrieving tweets: jurar
Topics: (jur)
No next token! next_token
Failed to rename some columns -- not found?
Exception during merge for CSV! data
Retrieving tweets: decir
Topics: (dig OR dec OR dij OR dir OR dic)
https://api.twitter.com/2/tweets/search/all?query=(dig OR dec OR dij OR dir OR dic) lang:es has:geo -is:retweet -has:links &max_results=100&tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets&expansions=author_id,geo.place_id,entities.mentions.username&user.fields=created_at,location,public_metrics&place.fields=country
Found 93 without "decir"
Found 95 without "decir"
Before append: 7
After append: 12
Found 98 without "decir"
Before append: 12
After append: 14
Found 94 without "decir"
Before append: 14
After append: 19
Found 93 without "decir"
Before append: 19
After append: 26
Found 95 without "decir"
Before append: 26
After append: 31
No next token! next_token
Retrieving tweets: confesar
Topics: (confies OR confes)
https://api.twitter.com/2/tweets/search/all?qu

KeyboardInterrupt: 

### Extracting Conversations

In [13]:
conv_ids = response.loc[:, 'conversation_id'].values

print(f'Extracted {len(conv_ids)} convos\n')

Extracted 22 convos



In [14]:
query_conv = 'conversation_id:'
query_conv_cond = ' -has:links '
conv_fields = 'tweet.fields=conversation_id,lang,in_reply_to_user_id,text,attachments,public_metrics'\
  '&expansions=author_id&user.fields=username,public_metrics'

In [19]:
# Dataframe for conversations
convs = pd.DataFrame()
# Dataframes for users
users = pd.DataFrame()

In [20]:
# If the rate limit is hit while querying, save progress
convo_idx = 0

In [21]:
for c_id in conv_ids[convo_idx:]:
    d = pd.DataFrame()
    u = pd.DataFrame()
    
    print(c)
    
    while connection.connect(query_conv + c_id + query_conv_cond, conv_fields, True):
        d = d.append(
            pd.json_normalize(connection.response, record_path='data'))
        u = u.append(
            pd.json_normalize(connection.response['includes'], record_path='users'))
        
        d.loc[:, 'text'] = d.loc[:, 'text'].str.replace(r'^(@[\w]+ )+', '', regex=True)
        
        short_text = d.loc[:, 'text'].apply(len) < 20
        lang = d.loc[:, 'lang']!='es'
        
        d.drop(
            d.loc[lang|short_text].index, 
            inplace=True)
        
        if d.shape[0]>14:
            break
        
    convs = convs.append(d, ignore_index=True)
    users = users.append(u, ignore_index=True)
    
    convo_idx+=1 

1402642305522282500
No next token!
1402637774805667841
No next token!
1402634503173165058
No next token!
1402624093946527745
1402634186909962240
No next token!
1402634112557518854
No next token!
1402633756888944642
No next token!
1402631188284645382
1402633207640641538
No next token!
1402626496989179915
1402629532994125827
No next token!
1402615477172686850
No next token!
1402625048180047879
No next token!
1402624924129366018
No next token!
1402617093514727426
1402131777641193477
No next token!
1402620991369535495
No next token!
1402619563200729092
No next token!
1402623588176478209
No next token!
1402623338535604229
No next token!
1402623254767050756
No next token!
1402454421439717378
No next token!


In [22]:
print(f'Extracted {convs.shape[0]} texts\nSample:\n')
convs.head()

Extracted 109 texts
Sample:



Unnamed: 0,in_reply_to_user_id,conversation_id,author_id,lang,text,id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
0,69416519,1402624093946527745,1537353780,es,"Muchas fuerza Florencia,lo que te desea el mal...",1402658383950454791,0,0,0,0
1,69416519,1402624093946527745,468895557,es,El mal que hace la madre lo pagan los hijos.\n...,1402656218150285315,0,0,0,0
2,69416519,1402624093946527745,1471308854,es,Nunca un hospital público estos garcas. Ni ver...,1402655563792764935,0,0,0,0
3,69416519,1402624093946527745,398461357,es,Porque no muestra la cara ?,1402653558219194372,0,0,0,0
4,69416519,1402624093946527745,169167606,es,Esta mujer esta anoréxica,1402653194174468103,0,0,1,0


In [23]:
with open('es_convs.txt', 'w') as d:
    d.writelines(convs.to_json(orient='table', force_ascii=False))

#### Examining extracted users

## TODO: users were improperly counted; count is WRONG

In [25]:
dups = users.loc[:, 'id'].duplicated()

print(f'Users: {users.shape[0]}, duplicated: {dups.sum()}')

Users: 256, duplicated: 27


In [27]:
users.drop(users.loc[dups].index, inplace=True)
print(users.shape[0])

229


In [28]:
with open('es_users.txt', 'w') as d:
    d.writelines(users.to_json(orient='table', force_ascii=False))