### References

<ul>
    <li>Trending twitter<br></li>
    https://twitter-trends.iamrohit.in/
    <li>Removing accented characters<br></li>
    https://stackoverflow.com/a/2633310/13557629
    <li>Importing package from filepath</li>
    https://stackoverflow.com/a/50395128/13557629
</ul>

In [1]:
import json
import re
import numpy as np 
import pandas as pd

In [5]:
import importlib.util as imp
import sys

spec = imp.spec_from_file_location(
    'twitter_connection', 
    '../twitter-connection/__init__.py')
twit = imp.module_from_spec(spec)
sys.modules[spec.name] = twit
spec.loader.exec_module(twit)

from twitter_connection import connection as tc
from twitter_connection import response as tr

In [3]:
'''
File path to the bearer token. Requires a prefix to identify the
  token, which is just 'PERSONAL $BEARER_TOKEN$' by default -- 
  can be specified upon initialization of TwitterConnection
'''
cred_path = r'../twitter-connection/credentials.txt'

In [4]:
with open('../extraction/verb-stem-clean.txt') as f:
    verb_stem = json.load(f)

### Extraction

In [6]:
connection = tc.TwitterConnection(
    is_archive=True,
    cred_prefix='PROF')

In [7]:
query_cond = ' (lang:es OR lang:pt) max_results=100 has:geo -is:retweet -has:links '
fields_tweet = 'tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets'
fields_expan = 'expansions=author_id,geo.place_id,entities.mentions.username'
fields_user = 'user.fields=created_at,location,public_metrics'
fields_place = 'place.fields=country'

In [7]:
connection.set_query(conditions=query_cond)
connection.set_fields(tweet=fields_tweet, 
                      expansions=fields_expan, 
                      user=fields_user,
                      place=fields_place)

In [15]:
# Dataframes for initial parsing
response_data = pd.DataFrame()

In [None]:
def get_bad_entries(data, stems):
    # Duplicated text
    dup = data.loc[:, 'text'].duplicated()
    # Instances where verb stem is anywhere but beginning of a word
    topic = ~(text.str.contains(
        r'\bve|\bvi', flags=re.IGNORECASE, regex=True))

In [16]:
idx = 0

for vs in verb_stem.items():
    if idx==1:
        break
    
    verb = vs[0]
    stems = '(' + vs[1] + ')'
    print(f'Retrieving tweets: {verb}\nTopics: {stems}')
    
    data = pd.DataFrame()
    
    while connection.connect(stems, is_next=True, time_interval=1):
        data = data.append(
            pd.json_normalize(
                connection.response, record_path='data'))
        
        # Remove '@...' mentions
        data.loc[:, 'text'] = data.loc[:, 'text'].str.replace(r'(@[\w]+ )', '', regex=True)
        
        # Normalize characters -- remove accent marks
        text = data.loc[:, 'text'].apply(unidecode)
        
        print(f'\n{dup.sum()} duplicated, {topic.sum()} bad matches')
        display(
            pd.concat([data.loc[:, 'text'], 
                       dup.rename('is_dup'), 
                       ~(topic.rename('no_stem'))], 
                      axis=1))
        
        for t in text.loc[~topic]:
            n = t.lower().replace('ve', '|>|ve|<|').replace('vi', '|>|vi|<|')
            print(f'CASE:\n{n}\n')
                
                if data.shape[0] >= 10:
                    response_schema = pd.json_normalize(
                        connection.response)
                    break
    
    idx+=1
    response_data = response_data.append(data, ignore_index=True)

Retrieving tweets: ver
Topics: (ve OR vi OR vê)

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,tr,RT @AyazogluCeyhun: VUK359 ne olduğunu daha bi...,True,True
1,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
2,en,Yep I remember doing this and in science class...,True,False
3,en,"RT @DrenDZN: Hello, I'm Dren! I'm 15 years old...",True,True
4,en,I went into the voting booth and saw a list of...,True,False
5,en,Monsignor Demola of the Lalupon diocese......\...,True,False
6,en,"RT @ValaAfshar: “The older you get, the more q...",True,True
7,tr,"RT @FazilDuygun: Zillet başa geçerse, getirece...",True,True
8,es,RT @IAIM_VE: #BuenasNoches✈|| ¡El les desea m...,False,True
9,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False



After dropping: 2

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,tr,RT @haklisinaaleyna: Genelleme yapmayacam sade...,True,True
3,tr,RT @canokrasi: Ankara Vali Yardımıcısı ve Anka...,True,True
4,en,RT @elizamondegreen: Do you oppose bigotry and...,True,True
5,en,something in the rain (9/10) would’ve been a p...,True,False
6,tr,Gittim ve döndüm. Bu bir zafer seslenişidir. B...,True,False
7,en,That’s why I said if everyone is healthy. If w...,True,False
8,en,"RT @DidierPcgba: ""Van Gaal &amp; Mourinho fail...",True,True
9,pt,nem cheguei lá misericórdia já vi que era larr...,False,False



After dropping: 3

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,tr,RT @Gunay_Kodaz: Arkadaşlar bu hesabı tweter a...,True,True
4,en,A beautiful handmade #RemembrancePoppy in hono...,True,False
5,en,I think you mean One England day - the rest of...,True,False
6,en,Hey you're super strong and I'm so proud of yo...,True,False
7,en,RT @muftimenk: The pain you’ve endured has a p...,True,True
8,en,"Awww this makes me a lil bit emo, it's like an...",True,False
9,en,RT @tinybuddha: If I’ve learned anything from ...,True,True



After dropping: 3

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,sv,RT @arevisionen: Bra inlägg i debatten om bost...,True,True
4,en,I would’ve screamed too 😭💀😭 https://t.co/oUfBf...,True,False
5,en,RT @ProductHunt: THE WAIT IS OVER!\n\nWe've te...,True,True
6,en,Lol if I could erase every moment I have of yo...,True,False
7,en,RT @CTULocal1: Educators and workers were so e...,True,True
8,en,RT @SabbiBou: I adore the fact that they've gi...,True,True
9,en,RT @tinybuddha: If I’ve learned anything from ...,True,True



After dropping: 3

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,en,"RT @JacobiteJen: restaurateurs.... Please, ple...",True,True
4,en,RT @agayhomosexual: so i’ve finally set up a g...,True,True
5,es,No es una dictadura en cubierta... está más qu...,False,False
6,en,RT @leoniedelt: So. They are doing passports.\...,True,True
7,tr,"Bızler artık sayıca azız, ve bununla yasamak a...",True,False
8,es,"RT @GloriaCuartas: Se perdió el rumbo, el mand...",False,True
9,tr,RT @akantalyali: Aslında Yılmaz Özdil'in sorus...,True,True



After dropping: 4

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,es,No es una dictadura en cubierta... está más qu...,False,False
4,en,RT @adamtranter: I built a parklet in a car pa...,True,True
5,en,RT @WhiteHouse: New today: we have hit 70% of ...,True,True
6,en,RT @CallumMair: Do you think if we’ve gone fro...,True,True
7,en,RT @beyrima: At Trinity Bellwood’s encampments...,True,True
8,en,"You look naive and desperate, just stay and r...",True,False
9,tr,RT @hazal_kalaycio: Hakan Hatipoğlu: Ayşe ve P...,True,True



After dropping: 4

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,es,No es una dictadura en cubierta... está más qu...,False,False
4,en,Ha ha not at all washed up Paul! I look at tha...,True,False
5,en,RT @BazookaArts: Ross and Jennifer have been p...,True,True
6,en,I've just watched episode S08 | E01 of Grey's ...,True,False
7,en,RT @undynebot: * Pfft!\n* You liar!\n* I’ve RE...,True,True
8,en,RT @thirteenvilll: I’ve been following his sto...,True,True
9,tr,"RT @akbcekmece: İlçe Başkan Yardımcımız, Yönet...",True,True



After dropping: 4

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,es,No es una dictadura en cubierta... está más qu...,False,False
4,pt,eu tb ... acho q da pra ve ate pelo meu perfil...,False,False
5,tr,RT @akbcekmece: İlçe Başkan Yardımcılarımız il...,True,True
6,tr,ay burada çok çok sevdiğim birisi gardropsta s...,True,False
7,pt,vê este tweet daqui a um ano e vais-lhes dar a...,False,False
8,en,Him and Kendrick should have done a joint albu...,True,False
9,pt,Vi o sorvete domingo e preferia não ter visto...,False,False



After dropping: 7

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,es,No es una dictadura en cubierta... está más qu...,False,False
4,pt,eu tb ... acho q da pra ve ate pelo meu perfil...,False,False
5,pt,vê este tweet daqui a um ano e vais-lhes dar a...,False,False
6,pt,Vi o sorvete domingo e preferia não ter visto...,False,False
7,en,simm vi uns feedbacks positivos,True,False
8,es,"No supero el ""remix"" de coronaver-veRr-veRr-ve...",False,False
9,en,RT @adventintl: We love to partner with innova...,True,True



After dropping: 9

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,es,No es una dictadura en cubierta... está más qu...,False,False
4,pt,eu tb ... acho q da pra ve ate pelo meu perfil...,False,False
5,pt,vê este tweet daqui a um ano e vais-lhes dar a...,False,False
6,pt,Vi o sorvete domingo e preferia não ter visto...,False,False
7,es,"No supero el ""remix"" de coronaver-veRr-veRr-ve...",False,False
8,es,"Kunze ve preparando tu cara de imbécil, porq v...",False,False
9,es,"RT @culturabang: Si nadie me ve, no me importa...",False,True



After dropping: 9

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,es,No es una dictadura en cubierta... está más qu...,False,False
4,pt,eu tb ... acho q da pra ve ate pelo meu perfil...,False,False
5,pt,vê este tweet daqui a um ano e vais-lhes dar a...,False,False
6,pt,Vi o sorvete domingo e preferia não ter visto...,False,False
7,es,"No supero el ""remix"" de coronaver-veRr-veRr-ve...",False,False
8,es,"Kunze ve preparando tu cara de imbécil, porq v...",False,False
9,tr,Hiçbir şey yapmadıkları halde herkes batuhan v...,True,False



After dropping: 9

Undesirables:


Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,es,No escucho radio pero vi tuiter. Me molesta cu...,False,False
1,pt,Possivelmente spoiler de jojo mas lembro que q...,False,False
2,pt,nem cheguei lá misericórdia já vi que era larr...,False,False
3,es,No es una dictadura en cubierta... está más qu...,False,False
4,pt,eu tb ... acho q da pra ve ate pelo meu perfil...,False,False
5,pt,vê este tweet daqui a um ano e vais-lhes dar a...,False,False
6,pt,Vi o sorvete domingo e preferia não ter visto...,False,False
7,es,"No supero el ""remix"" de coronaver-veRr-veRr-ve...",False,False
8,es,"Kunze ve preparando tu cara de imbécil, porq v...",False,False
9,en,RT @duty2warn: It's much easier to fool people...,True,True



After dropping: 10


In [20]:
response_schema.head()

Unnamed: 0,data,includes.users,meta.newest_id,meta.oldest_id,meta.result_count,meta.next_token
0,"[{'public_metrics': {'retweet_count': 128, 're...","[{'created_at': '2020-09-30T20:34:25.000Z', 'l...",1407390756499644417,1407390755568558081,10,b26v89c19zqg8o3fpdg9v3guagisc8v1oz1p861y3ntrx


In [19]:
response_data.head()

Unnamed: 0,id,referenced_tweets,author_id,text,lang,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,entities.mentions
0,1407390766608130049,"[{'type': 'replied_to', 'id': '140738243078955...",1395367675501625344,No escucho radio pero vi tuiter. Me molesta cu...,es,0,0,0,0,"[{'start': 0, 'end': 15, 'username': 'carolina..."
1,1407390765802795008,,1154920062018609152,Possivelmente spoiler de jojo mas lembro que q...,pt,0,1,0,0,
2,1407390764372480000,"[{'type': 'replied_to', 'id': '140730775902920...",372950623,nem cheguei lá misericórdia já vi que era larr...,pt,0,0,0,0,"[{'start': 0, 'end': 15, 'username': 'gracesth..."
3,1407390761872728065,"[{'type': 'quoted', 'id': '1407371402777333763'}]",956767298139893760,No es una dictadura en cubierta... está más qu...,es,5,0,7,0,
4,1407390759268069379,"[{'type': 'replied_to', 'id': '140738976598557...",1404408219016089605,eu tb ... acho q da pra ve ate pelo meu perfil...,pt,0,1,0,0,"[{'start': 0, 'end': 11, 'username': 'mycrapIi..."


In [59]:
lang = response.loc[:, 'lang']!='es'

In [60]:
response.drop(response.loc[lang, :].index, inplace=True)
response

Unnamed: 0,lang,id,conversation_id,text,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,in_reply_to_user_id
4,es,1404865872515194884,1404865188109684747,"se ve tan real, además L y H se ven muy guapos",0,1,1,0,920381329669226497
32,es,1404865864143343620,1404631106876542976,Fraude es tu vida se sabia que mucha gente en ...,0,1,0,0,1252820905396301835
41,es,1404865862096535563,1404564648083525634,Sí ve a decirle a tu lider que le has defendid...,0,1,0,0,1362009157558292483
44,es,1404865862012702722,1404719991237251078,Y si su comportamiento es igual al este papelo...,0,0,1,0,1281968549330857986


In [61]:
response.loc[:, 'text'] = response.loc[:, 'text'].str.replace(r'(@\w+)\b', '', regex=True)
response

Unnamed: 0,lang,id,conversation_id,text,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,in_reply_to_user_id
4,es,1404865872515194884,1404865188109684747,"se ve tan real, además L y H se ven muy guapos",0,1,1,0,920381329669226497
32,es,1404865864143343620,1404631106876542976,Fraude es tu vida se sabia que mucha gente en ...,0,1,0,0,1252820905396301835
41,es,1404865862096535563,1404564648083525634,Sí ve a decirle a tu lider que le has defendid...,0,1,0,0,1362009157558292483
44,es,1404865862012702722,1404719991237251078,Y si su comportamiento es igual al este papelo...,0,0,1,0,1281968549330857986


In [63]:
response.loc[:, 'text'].apply(split)

NameError: name 'split' is not defined

In [62]:
for a in response.loc[:, 'text']:
    print(f'{a}\n')

se ve tan real, además L y H se ven muy guapos

Fraude es tu vida se sabia que mucha gente en el sur está votando por castillo qué pasa se ve que no has viajado por la ciudades

Sí ve a decirle a tu lider que le has defendido bien.

Y si su comportamiento es igual al este papelon. Obviamente que si. O pensas que se critica lo que se ve con hormonas ?? 🤔



In [11]:
with open('es_data.txt', 'w') as d:
    d.writelines(response.to_json(orient='table', force_ascii=False))

### Extracting Conversations

In [13]:
conv_ids = response.loc[:, 'conversation_id'].values

print(f'Extracted {len(conv_ids)} convos\n')

Extracted 22 convos



In [14]:
query_conv = 'conversation_id:'
query_conv_cond = ' -has:links '
conv_fields = 'tweet.fields=conversation_id,lang,in_reply_to_user_id,text,attachments,public_metrics'\
  '&expansions=author_id&user.fields=username,public_metrics'

In [19]:
# Dataframe for conversations
convs = pd.DataFrame()
# Dataframes for users
users = pd.DataFrame()

In [20]:
# If the rate limit is hit while querying, save progress
convo_idx = 0

In [21]:
for c_id in conv_ids[convo_idx:]:
    d = pd.DataFrame()
    u = pd.DataFrame()
    
    print(c)
    
    while connection.connect(query_conv + c_id + query_conv_cond, conv_fields, True):
        d = d.append(
            pd.json_normalize(connection.response, record_path='data'))
        u = u.append(
            pd.json_normalize(connection.response['includes'], record_path='users'))
        
        d.loc[:, 'text'] = d.loc[:, 'text'].str.replace(r'^(@[\w]+ )+', '', regex=True)
        
        short_text = d.loc[:, 'text'].apply(len) < 20
        lang = d.loc[:, 'lang']!='es'
        
        d.drop(
            d.loc[lang|short_text].index, 
            inplace=True)
        
        if d.shape[0]>14:
            break
        
    convs = convs.append(d, ignore_index=True)
    users = users.append(u, ignore_index=True)
    
    convo_idx+=1 

1402642305522282500
No next token!
1402637774805667841
No next token!
1402634503173165058
No next token!
1402624093946527745
1402634186909962240
No next token!
1402634112557518854
No next token!
1402633756888944642
No next token!
1402631188284645382
1402633207640641538
No next token!
1402626496989179915
1402629532994125827
No next token!
1402615477172686850
No next token!
1402625048180047879
No next token!
1402624924129366018
No next token!
1402617093514727426
1402131777641193477
No next token!
1402620991369535495
No next token!
1402619563200729092
No next token!
1402623588176478209
No next token!
1402623338535604229
No next token!
1402623254767050756
No next token!
1402454421439717378
No next token!


In [22]:
print(f'Extracted {convs.shape[0]} texts\nSample:\n')
convs.head()

Extracted 109 texts
Sample:



Unnamed: 0,in_reply_to_user_id,conversation_id,author_id,lang,text,id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
0,69416519,1402624093946527745,1537353780,es,"Muchas fuerza Florencia,lo que te desea el mal...",1402658383950454791,0,0,0,0
1,69416519,1402624093946527745,468895557,es,El mal que hace la madre lo pagan los hijos.\n...,1402656218150285315,0,0,0,0
2,69416519,1402624093946527745,1471308854,es,Nunca un hospital público estos garcas. Ni ver...,1402655563792764935,0,0,0,0
3,69416519,1402624093946527745,398461357,es,Porque no muestra la cara ?,1402653558219194372,0,0,0,0
4,69416519,1402624093946527745,169167606,es,Esta mujer esta anoréxica,1402653194174468103,0,0,1,0


In [23]:
with open('es_convs.txt', 'w') as d:
    d.writelines(convs.to_json(orient='table', force_ascii=False))

#### Examining extracted users

## TODO: users were improperly counted; count is WRONG

In [25]:
dups = users.loc[:, 'id'].duplicated()

print(f'Users: {users.shape[0]}, duplicated: {dups.sum()}')

Users: 256, duplicated: 27


In [27]:
users.drop(users.loc[dups].index, inplace=True)
print(users.shape[0])

229


In [28]:
with open('es_users.txt', 'w') as d:
    d.writelines(users.to_json(orient='table', force_ascii=False))