In [1]:
import json
import re
import numpy as np 
import pandas as pd

In [2]:
import importlib.util as imp

# The name 'twitter-connection.connection' seems inconsequential
#   perhaps just a spec name?
spec = imp.spec_from_file_location(
    'twitter-connection.connection', 
    '/home/rimov/Documents/Code/NLP/lin-que-dropping/twitter-connection/connection.py')
twit = imp.module_from_spec(spec)
spec.loader.exec_module(twit)

### References

https://twitter-trends.iamrohit.in/

In [28]:
cred_path = r'../twitter-connection/credentials.txt'

connection = twit.TwitterConnection("spanish", cred_path)

In [29]:
with open('../extraction/verb-stem-clean.txt') as f:
    verb_stem = json.load(f)

In [30]:
query_cond = ' (lang:es OR lang:pt) has:geo -is:retweet -has:links '
# fields_tweet = 'tweet.fields=lang,text,public_metrics,organic_metrics,geo,referenced_tweets'
fields_tweet = 'tweet.fields=lang,geo,public_metrics,referenced_tweets'
fields_expan = 'expansions=author_id,geo.place_id,entities.mentions.username'
fields_user = 'user.fields=created_at,location,public_metrics'
fields_place = 'place.fields=country'

In [38]:
connection.set_query(conditions=query_cond)
connection.set_fields(tweet=fields_tweet, 
                      expansions=fields_expan, 
                      user=fields_user,
                      place=fields_place)

In [39]:
# Dataframes for initial parsing
response = pd.DataFrame()
desc = pd.DataFrame()

In [42]:
idx = 0

for vs in verb_stem.items():
    if idx==1:
        break
    
    verb = vs[0]
    stems = '(' + vs[1] + ')'
    print(f'Retrieving tweets: {verb}\nTopics: {stems}')
    
    data = pd.DataFrame()
    
    while connection.connect(stems, is_next=True):
        desc = pd.json_normalize(
                connection.response)
        
        print('Data before append:\n')
        display(data.head(3))
        
        data = data.append(
            pd.json_normalize(
                connection.response, record_path='data'))
        
        print('Data after append:\n')
        display(data.head(3))
        
        # Duplicated text
#         dup = d.loc[:, 'text'].duplicated() 
        # Wrong language
        lang = (data.loc[:, 'lang']!='es') & (data.loc[:, 'lang']!='pt')
        # Is a retweet
        retweet = data.loc[:, 'text'].str.contains('RT @')
        
        # Remove '@...' mentions
        data.loc[:, 'text'] = data.loc[:, 'text'].str.replace(r'(@[\w]+ )', '', regex=True)
        
        print('Undesirables:\n')
        display(
            pd.concat([data.loc[:, ['lang', 'text']], 
                       lang.rename('is_wrong_lang'), 
                       retweet.rename('is_retweet')], 
                      axis=1))
        
        data.drop(
            data.loc[lang|retweet].index, 
            inplace=True)
        
        print('After dropping:\n')
        display(data.head(3))
        
        print(data.shape[0])
        
        if d.shape[0] >= 1:
            break
    
    idx+=1
    response = response.append(d, ignore_index=True)

Retrieving tweets: ver
Topics: (ve OR vi OR v√™)
Data before append:



Data after append:



Unnamed: 0,text,lang,referenced_tweets,author_id,id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,entities.mentions
0,RT @BTStranslation_: JUNGKOOK'S SELF ACHIEVEME...,en,"[{'type': 'retweeted', 'id': '1407275882486456...",1201545522915483654,1407329824436326405,4789,0,0,0,"[{'start': 3, 'end': 19, 'username': 'BTStrans..."
1,RT @Yenerhoca61: @nacibostanci ƒ∞ki yƒ±ldƒ±r fark...,tr,"[{'type': 'retweeted', 'id': '1407260143046234...",1197923514986172416,1407329824360775690,4,0,0,0,"[{'start': 3, 'end': 15, 'username': 'Yenerhoc..."
2,√önico jogo que n√£o vi do Goi√°s foi o jogo que ...,pt,,193294214,1407329823945601033,0,0,0,0,
3,"RT @Lalin793352: 22 July 2018\nP'Mew: P'Mean, ...",en,"[{'type': 'retweeted', 'id': '1407329465085071...",1346765350117859328,1407329823891070976,8,0,0,0,"[{'start': 3, 'end': 15, 'username': 'Lalin793..."
4,RT @fanii_aliii: This is the current situation...,en,"[{'type': 'retweeted', 'id': '1407212807897165...",1259069567961268225,1407329823824003073,193,0,0,0,"[{'start': 3, 'end': 15, 'username': 'fanii_al..."


Undesirables:



Unnamed: 0,lang,text,is_wrong_lang,is_retweet
0,en,RT @BTStranslation_: JUNGKOOK'S SELF ACHIEVEME...,True,True
1,tr,RT @Yenerhoca61: ƒ∞ki yƒ±ldƒ±r farklƒ± kanallardan...,True,True
2,pt,√önico jogo que n√£o vi do Goi√°s foi o jogo que ...,False,False
3,en,"RT @Lalin793352: 22 July 2018\nP'Mew: P'Mean, ...",True,True
4,en,RT @fanii_aliii: This is the current situation...,True,True
5,tr,RT @Pehlivan_Reis: 3Ô∏è‚É£Yahudilik ve Hƒ±ristiyanl...,True,True
6,en,You've got this champ. Take a some time off!üíôüëä...,True,False
7,es,RT @GiuseppeNoc: Lo cierto es que las personas...,False,True
8,en,If it‚Äôs one thing I can‚Äôt stand it‚Äôs a mf who ...,True,False
9,tr,RT @beehaber: √áocuk masalƒ± adƒ± altƒ±nda ensest ...,True,True


After dropping:



Unnamed: 0,text,lang,referenced_tweets,author_id,id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,entities.mentions
2,√önico jogo que n√£o vi do Goi√°s foi o jogo que ...,pt,,193294214,1407329823945601033,0,0,0,0,


1
Error establishing connection:

 429
{"title":"Too Many Requests","type":"about:blank","status":429,"detail":"Too Many Requests"}


AttributeError: 'NoneType' object has no attribute 'json'

In [43]:
display(desc.head())

Unnamed: 0,data,includes.users,meta.newest_id,meta.oldest_id,meta.result_count,meta.next_token
0,[{'text': 'RT @BTStranslation_: JUNGKOOK'S SEL...,"[{'username': 'Litle_Bearbaby', 'id': '1201545...",1407329824436326405,1407329823639408641,10,b26v89c19zqg8o3fpdg9v3er54pmioayri49yvvsfwhvh


In [59]:
lang = response.loc[:, 'lang']!='es'

In [60]:
response.drop(response.loc[lang, :].index, inplace=True)
response

Unnamed: 0,lang,id,conversation_id,text,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,in_reply_to_user_id
4,es,1404865872515194884,1404865188109684747,"se ve tan real, adem√°s L y H se ven muy guapos",0,1,1,0,920381329669226497
32,es,1404865864143343620,1404631106876542976,Fraude es tu vida se sabia que mucha gente en ...,0,1,0,0,1252820905396301835
41,es,1404865862096535563,1404564648083525634,S√≠ ve a decirle a tu lider que le has defendid...,0,1,0,0,1362009157558292483
44,es,1404865862012702722,1404719991237251078,Y si su comportamiento es igual al este papelo...,0,0,1,0,1281968549330857986


In [61]:
response.loc[:, 'text'] = response.loc[:, 'text'].str.replace(r'(@\w+)\b', '', regex=True)
response

Unnamed: 0,lang,id,conversation_id,text,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,in_reply_to_user_id
4,es,1404865872515194884,1404865188109684747,"se ve tan real, adem√°s L y H se ven muy guapos",0,1,1,0,920381329669226497
32,es,1404865864143343620,1404631106876542976,Fraude es tu vida se sabia que mucha gente en ...,0,1,0,0,1252820905396301835
41,es,1404865862096535563,1404564648083525634,S√≠ ve a decirle a tu lider que le has defendid...,0,1,0,0,1362009157558292483
44,es,1404865862012702722,1404719991237251078,Y si su comportamiento es igual al este papelo...,0,0,1,0,1281968549330857986


In [63]:
response.loc[:, 'text'].apply(split)

NameError: name 'split' is not defined

In [62]:
for a in response.loc[:, 'text']:
    print(f'{a}\n')

se ve tan real, adem√°s L y H se ven muy guapos

Fraude es tu vida se sabia que mucha gente en el sur est√° votando por castillo qu√© pasa se ve que no has viajado por la ciudades

S√≠ ve a decirle a tu lider que le has defendido bien.

Y si su comportamiento es igual al este papelon. Obviamente que si. O pensas que se critica lo que se ve con hormonas ?? ü§î



In [11]:
with open('es_data.txt', 'w') as d:
    d.writelines(response.to_json(orient='table', force_ascii=False))

### Extracting Conversations

In [13]:
conv_ids = response.loc[:, 'conversation_id'].values

print(f'Extracted {len(conv_ids)} convos\n')

Extracted 22 convos



In [14]:
query_conv = 'conversation_id:'
query_conv_cond = ' -has:links '
conv_fields = 'tweet.fields=conversation_id,lang,in_reply_to_user_id,text,attachments,public_metrics'\
  '&expansions=author_id&user.fields=username,public_metrics'

In [19]:
# Dataframe for conversations
convs = pd.DataFrame()
# Dataframes for users
users = pd.DataFrame()

In [20]:
# If the rate limit is hit while querying, save progress
convo_idx = 0

In [21]:
for c_id in conv_ids[convo_idx:]:
    d = pd.DataFrame()
    u = pd.DataFrame()
    
    print(c)
    
    while connection.connect(query_conv + c_id + query_conv_cond, conv_fields, True):
        d = d.append(
            pd.json_normalize(connection.response, record_path='data'))
        u = u.append(
            pd.json_normalize(connection.response['includes'], record_path='users'))
        
        d.loc[:, 'text'] = d.loc[:, 'text'].str.replace(r'^(@[\w]+ )+', '', regex=True)
        
        short_text = d.loc[:, 'text'].apply(len) < 20
        lang = d.loc[:, 'lang']!='es'
        
        d.drop(
            d.loc[lang|short_text].index, 
            inplace=True)
        
        if d.shape[0]>14:
            break
        
    convs = convs.append(d, ignore_index=True)
    users = users.append(u, ignore_index=True)
    
    convo_idx+=1 

1402642305522282500
No next token!
1402637774805667841
No next token!
1402634503173165058
No next token!
1402624093946527745
1402634186909962240
No next token!
1402634112557518854
No next token!
1402633756888944642
No next token!
1402631188284645382
1402633207640641538
No next token!
1402626496989179915
1402629532994125827
No next token!
1402615477172686850
No next token!
1402625048180047879
No next token!
1402624924129366018
No next token!
1402617093514727426
1402131777641193477
No next token!
1402620991369535495
No next token!
1402619563200729092
No next token!
1402623588176478209
No next token!
1402623338535604229
No next token!
1402623254767050756
No next token!
1402454421439717378
No next token!


In [22]:
print(f'Extracted {convs.shape[0]} texts\nSample:\n')
convs.head()

Extracted 109 texts
Sample:



Unnamed: 0,in_reply_to_user_id,conversation_id,author_id,lang,text,id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
0,69416519,1402624093946527745,1537353780,es,"Muchas fuerza Florencia,lo que te desea el mal...",1402658383950454791,0,0,0,0
1,69416519,1402624093946527745,468895557,es,El mal que hace la madre lo pagan los hijos.\n...,1402656218150285315,0,0,0,0
2,69416519,1402624093946527745,1471308854,es,Nunca un hospital p√∫blico estos garcas. Ni ver...,1402655563792764935,0,0,0,0
3,69416519,1402624093946527745,398461357,es,Porque no muestra la cara ?,1402653558219194372,0,0,0,0
4,69416519,1402624093946527745,169167606,es,Esta mujer esta anor√©xica,1402653194174468103,0,0,1,0


In [23]:
with open('es_convs.txt', 'w') as d:
    d.writelines(convs.to_json(orient='table', force_ascii=False))

#### Examining extracted users

## TODO: users were improperly counted; count is WRONG

In [25]:
dups = users.loc[:, 'id'].duplicated()

print(f'Users: {users.shape[0]}, duplicated: {dups.sum()}')

Users: 256, duplicated: 27


In [27]:
users.drop(users.loc[dups].index, inplace=True)
print(users.shape[0])

229


In [28]:
with open('es_users.txt', 'w') as d:
    d.writelines(users.to_json(orient='table', force_ascii=False))