In [1]:
import json
import re
import numpy as np 
import pandas as pd

In [2]:
import importlib.util as imp

# The name 'twitter-connection.connection' seems inconsequential
#   perhaps just a spec name?
spec = imp.spec_from_file_location(
    'twitter-connection.connection', 
    '/home/rimov/Documents/Code/NLP/lin-que-dropping/twitter-connection/connection.py')
twit = imp.module_from_spec(spec)
spec.loader.exec_module(twit)

### References

https://twitter-trends.iamrohit.in/

In [3]:
cred_path = r'../twitter-connection/credentials.txt'

connection = twit.TwitterConnection("spanish", cred_path)

In [4]:
query = ['Sanatorio Otamendi', 'Carnota']

In [5]:
query_cond = ' -is:retweet -has:links '
fields = 'tweet.fields=lang,conversation_id,in_reply_to_user_id,text,public_metrics'

In [6]:
# Dataframes for initial parsing
response = pd.DataFrame()

In [7]:
for q in query:
    d = pd.DataFrame()
    
    while connection.connect(q+query_cond, fields, is_next=True):
        d = d.append(
            pd.json_normalize(connection.response, record_path='data'))
        
        # Remove '@...' mentions
        d.loc[:, 'text'] = d.loc[:, 'text'].str.replace(r'^(@[\w]+ )+', '', regex=True)
        
        dup = d.loc[:, 'text'].duplicated() # Duplicated text
        dup_conv = d.loc[:, 'conversation_id'].duplicated() # Duplicate conv ids.
        lang = d.loc[:, 'lang']!='es' # Wrong language
        replies = (d.loc[:, 'public_metrics.reply_count']==0)\
                  &(d.loc[:, 'in_reply_to_user_id'].isna())
        
        d.drop(
            d.loc[dup|dup_conv|lang|replies].index, 
            inplace=True)
        
        print(d.shape[0])
        
        if d.shape[0] > 10:
            break
    
    response = response.append(d, ignore_index=True)

7
7
13
3
7
4
2
6
1
2
4
2
2
1
3
3
1
2
4
4
4
3
3
4
4
2
1
3
3
3
0
2
1
3
0
3
1
2
0
1
0
0


KeyboardInterrupt: 

In [13]:
response.head()

Unnamed: 0,text,in_reply_to_user_id,conversation_id,id,lang,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
0,Brancatelli en la Trinidad de Palermo\nVerbit...,,1403157168476626944,1403157168476626944,es,23,5,37,1
1,"Parte Médico del Sanatorio Otamendi, el LORO n...",,1402927241798033408,1402927241798033408,es,4,8,15,0
2,qué tiene que ver el raviol con el dulce de le...,2639394153.0,1402655349178605574,1402833184526569473,es,1,1,1,0
3,El Sanatorio Otamendi informó que la paciente ...,1.2491301564946637e+18,1402801920574636033,1402809575347195911,es,0,0,0,0
4,Sanatorio Otamendi... Re Nac&amp;Pop.\n\nEs do...,,1402790811369193473,1402790811369193473,es,2,1,18,0


In [11]:
with open('es_data.txt', 'w') as d:
    d.writelines(response.to_json(orient='table', force_ascii=False))

### Extracting Conversations

In [13]:
conv_ids = response.loc[:, 'conversation_id'].values

print(f'Extracted {len(conv_ids)} convos\n')

Extracted 22 convos



In [14]:
query_conv = 'conversation_id:'
query_conv_cond = ' -has:links '
conv_fields = 'tweet.fields=conversation_id,lang,in_reply_to_user_id,text,attachments,public_metrics'\
  '&expansions=author_id&user.fields=username,public_metrics'

In [19]:
# Dataframe for conversations
convs = pd.DataFrame()
# Dataframes for users
users = pd.DataFrame()

In [20]:
# If the rate limit is hit while querying, save progress
convo_idx = 0

In [21]:
for c_id in conv_ids[convo_idx:]:
    d = pd.DataFrame()
    u = pd.DataFrame()
    
    print(c)
    
    while connection.connect(query_conv + c_id + query_conv_cond, conv_fields, True):
        d = d.append(
            pd.json_normalize(connection.response, record_path='data'))
        u = u.append(
            pd.json_normalize(connection.response['includes'], record_path='users'))
        
        d.loc[:, 'text'] = d.loc[:, 'text'].str.replace(r'^(@[\w]+ )+', '', regex=True)
        
        short_text = d.loc[:, 'text'].apply(len) < 20
        lang = d.loc[:, 'lang']!='es'
        
        d.drop(
            d.loc[lang|short_text].index, 
            inplace=True)
        
        if d.shape[0]>14:
            break
        
    convs = convs.append(d, ignore_index=True)
    users = users.append(u, ignore_index=True)
    
    convo_idx+=1 

1402642305522282500
No next token!
1402637774805667841
No next token!
1402634503173165058
No next token!
1402624093946527745
1402634186909962240
No next token!
1402634112557518854
No next token!
1402633756888944642
No next token!
1402631188284645382
1402633207640641538
No next token!
1402626496989179915
1402629532994125827
No next token!
1402615477172686850
No next token!
1402625048180047879
No next token!
1402624924129366018
No next token!
1402617093514727426
1402131777641193477
No next token!
1402620991369535495
No next token!
1402619563200729092
No next token!
1402623588176478209
No next token!
1402623338535604229
No next token!
1402623254767050756
No next token!
1402454421439717378
No next token!


In [22]:
print(f'Extracted {convs.shape[0]} texts\nSample:\n')
convs.head()

Extracted 109 texts
Sample:



Unnamed: 0,in_reply_to_user_id,conversation_id,author_id,lang,text,id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
0,69416519,1402624093946527745,1537353780,es,"Muchas fuerza Florencia,lo que te desea el mal...",1402658383950454791,0,0,0,0
1,69416519,1402624093946527745,468895557,es,El mal que hace la madre lo pagan los hijos.\n...,1402656218150285315,0,0,0,0
2,69416519,1402624093946527745,1471308854,es,Nunca un hospital público estos garcas. Ni ver...,1402655563792764935,0,0,0,0
3,69416519,1402624093946527745,398461357,es,Porque no muestra la cara ?,1402653558219194372,0,0,0,0
4,69416519,1402624093946527745,169167606,es,Esta mujer esta anoréxica,1402653194174468103,0,0,1,0


In [23]:
with open('es_convs.txt', 'w') as d:
    d.writelines(convs.to_json(orient='table', force_ascii=False))

#### Examining extracted users

## TODO: users were improperly counted; count is WRONG

In [25]:
dups = users.loc[:, 'id'].duplicated()

print(f'Users: {users.shape[0]}, duplicated: {dups.sum()}')

Users: 256, duplicated: 27


In [27]:
users.drop(users.loc[dups].index, inplace=True)
print(users.shape[0])

229


In [28]:
with open('es_users.txt', 'w') as d:
    d.writelines(users.to_json(orient='table', force_ascii=False))