In [1]:
import json
import re
import numpy as np 
import pandas as pd
from datetime import datetime as dt
from time import sleep
import es_dep_news_trf
import pt_core_news_lg

In [2]:
import importlib.util as imp
import sys

spec = imp.spec_from_file_location(
    'twitter_connection', 
    '../twitter-connection/__init__.py')
twit = imp.module_from_spec(spec)
sys.modules[spec.name] = twit
spec.loader.exec_module(twit)

from twitter_connection import connection as tc
from twitter_connection import response as tr

In [3]:
'''
File path to the bearer token. Requires a prefix to identify the
  token, which is just 'PERSONAL $BEARER_TOKEN$' by default -- 
  can be specified upon initialization of TwitterConnection
'''
cred_path = r'../twitter-connection/credentials.txt'

In [4]:
# All the Portuguese-only verbs
pt_verbs = {'dizer', 'supor', 'duvidar', 'acreditar', 'achar', 'lembrar', 'recear', 'predizer', 'adivinhar', 'conjeturar', 'chutar', 'dar(se) conta', 'desejar', 'oxala', 'tomara'}

In [5]:
with open('../extraction/verb-stem-clean.txt') as f:
    verb_stem = json.load(f)
    
verbs_volit = [v for v in list(verb_stem.keys())[len(verb_stem)-17:]]
es_verbs_volit = [v for v in (set(verbs_volit) - pt_verbs)]

In [6]:
es_query_cond = 'lang:es has:geo -is:retweet -has:links '
pt_query_cond = 'lang:pt has:geo -is:retweet -has:links '
fields_tweet = 'tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets'
fields_expan = 'expansions=author_id,geo.place_id,entities.mentions.username'
fields_user = 'user.fields=created_at,location,public_metrics'
fields_place = 'place.fields=country'

In [7]:
es_conn = tc.TwitterConnection(
    is_archive=True,
    cred_prefix='PROF')

# pt_conn = tc.TwitterConnection(
#     is_archive=True,
#     cred_prefix='PROF') 

In [8]:
es_conn.set_query(conditions=es_query_cond)
es_conn.set_fields(tweet=fields_tweet, 
                      expansions=fields_expan, 
                      user=fields_user,
                      place=fields_place)

# pt_conn.set_query(conditions=pt_query_cond)
# pt_conn.set_fields(tweet=fields_tweet, 
#                       expansions=fields_expan, 
#                       user=fields_user,
#                       place=fields_place)

In [13]:
time = dt.now().strftime('%d%m%Y-at-%H%M')

In [14]:
topics = ['Jorge Ramos', 'Cárdenas Palomino', '"harry styles"', 'Richard Donner', 'Generales', 'olivia', 'Criminales', 'Sofia', 'Alba', '"La Posta"', 'Peru', 'Capital', 'Neymar', 'Superman', 'Gallese']

v = 'suplicar'
s = '"supl" OR "supli"'

test_conn = tc.TwitterConnection(
    is_archive=True,
    cred_prefix='PROF')

test_conn.set_query(conditions=es_query_cond)
test_conn.set_fields(tweet=fields_tweet, 
                      expansions=fields_expan, 
                      user=fields_user,
                      place=fields_place)

test_conn.connect('(' + s +')', is_next=True)
print(test_conn.url)

res = tr.Response(test_conn.response)
res.to_csv(
    lang='es', time=time, verb=v, is_test=True)

In [15]:
finished = []

In [16]:
for topic in topics:
    if topic in finished:
        continue
    
    print(f'Retrieving topic: {topic}')
    
    response = tr.Response(lang='es', topic=topic)
    
    while True:
        es_conn.connect(topic, is_next=True, time_interval=1)
        
        if len(response.schema)==0:
            print(es_conn.url)
        
        new = tr.Response(lang='es', 
                          topic=topic, 
                          response=es_conn.response)
        
#         text_analyzed = analyze(new.schema['data'].loc[:, ['id', 'text']], 'es')
        
#         # Entries without desired verb
#         no_verb = ~(text_analyzed.loc[:, 'lemma'].str.contains(verb))
#         print(f'Found {no_verb.sum()} without "{verb}"')
        
#         new.join(to='data', data=text_analyzed, on='id')
        
#         new.schema['data'].drop(new.schema['data'].loc[no_verb, :].index, inplace=True)
#         response.reset_index()       
            
        response.append(new)
        
        if (response.schema['data'].original.shape[0] >= 500) or (not es_conn.has_next):
            break
    
    response.save_csv(time=time, is_test=True)
    finished.append(topic)

Retrieving topic: Jorge Ramos
https://api.twitter.com/2/tweets/search/all?query=Jorge Ramos lang:es has:geo -is:retweet -has:links &max_results=100&tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets&expansions=author_id,geo.place_id,entities.mentions.username&user.fields=created_at,location,public_metrics&place.fields=country
No next token! ('next_token',)
Before append: 100
After append: 166
166
test/es/05072021-at-2103/es-JORGE RAMOS-original-tweets-166-0.csv
287
test/es/05072021-at-2103/es-JORGE RAMOS-original-users-287-0.csv
117
test/es/05072021-at-2103/es-JORGE RAMOS-original-places-117-0.csv
Retrieving topic: Cárdenas Palomino
No next token! ('next_token',)
https://api.twitter.com/2/tweets/search/all?query=Cárdenas Palomino lang:es has:geo -is:retweet -has:links &max_results=100&tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets&expansions=author_id,geo.place_id,entities.mentions.username&user.fields=created_at,location,public_metrics&place.fields=co

https://api.twitter.com/2/tweets/search/all?query=Neymar lang:es has:geo -is:retweet -has:links &max_results=100&next_token=b26v89c19zqg8o3fpdj6ou1mmm613b27lvmeddjf2w6t9&tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets&expansions=author_id,geo.place_id,entities.mentions.username&user.fields=created_at,location,public_metrics&place.fields=country
Before append: 100
After append: 200
Before append: 200
After append: 300
Before append: 300
After append: 400
Before append: 400
After append: 500
500
test/es/05072021-at-2103/es-NEYMAR-original-tweets-500-0.csv
709
test/es/05072021-at-2103/es-NEYMAR-original-users-709-0.csv
326
test/es/05072021-at-2103/es-NEYMAR-original-places-326-0.csv
Retrieving topic: Superman
https://api.twitter.com/2/tweets/search/all?query=Superman lang:es has:geo -is:retweet -has:links &max_results=100&next_token=b26v89c19zqg8o3fpdg9gamd0nluoyp5oaz86c85oogot&tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets&expansions=author_id,geo.pla

In [None]:
es = tr.Response(tr.retrieve(f'es-{s}-test-data.txt'))
# pt = tr.Response(tr.retrieve(f'pt-{s}-test-data.txt'))

In [None]:
display(es.schema['data'].head(3))
# display(pt.schema['data'].head(3))

In [None]:
from unidecode import unidecode

In [None]:
nlp_es = es_dep_news_trf.load()
# nlp_pt = pt_core_news_lg.load()

In [None]:
def get_pos_tags(tokenized):
    return ' '.join([f'{t.text}-({t.pos_})' for t in tokenized if ((t.pos_!='PUNCT') and (t.pos_!='SPACE'))])

In [None]:
def lemmatize(tokenized):
    return ' '.join([t.lemma_ for t in tokenized])

In [None]:
def analyze(text, lang):
    # Tokenized
    text_nlp = text.loc[:, 'text'].apply(nlp_es if lang=='es' else nlp_pt)
    
    pos = text_nlp.apply(get_pos_tags).rename('pos')
    lemma = text_nlp.apply(lemmatize).rename('lemma')
    
    return pd.concat([text.loc[:, 'id'], pos, lemma], axis=1)

In [None]:
verb = 'decir'
stems = '(dig OR dec OR dij OR dir OR dic)'

In [None]:
response = tr.Response()

In [None]:
es = tr.Response(tr.retrieve(f'es-{s}-test-data.txt'))

In [None]:
# Remove '@...' mentions
es.schema['data'].loc[:, 'text'] = es.schema['data'].loc[:, 'text']\
    .str.replace(r'(@[\w]+ )', '', regex=True)\
    .apply(unidecode)

text_analyzed = analyze(es.schema['data'].loc[:, ['id', 'text']], 'es')

display(text_analyzed.head())

# Entries without desired verb
no_verb = ~(text_analyzed.loc[:, 'lemma'].str.contains(verb))
print(f'Found {no_verb.sum()} without "{verb}"')

display(pd.concat([es.schema['data'].loc[:, 'text'], es.schema['data'].loc[no_verb, :]], axis=1))

es.join(to='data', data=text_analyzed, on='id')

es.schema['data'].drop(es.schema['data'].loc[no_verb, :].index, inplace=True)
response.reset_index()       
    
response.append(es)

response.to_csv('es', verb)

In [None]:
%%time
es_t = es_text.apply(unidecode)
pt_t = pt_text.apply(unidecode)

es_bad = is_bad_verb('es', es_t, 'vi OR ve OR ve')
pt_bad = is_bad_verb('pt', pt_t, 'vi OR ve OR ve')

In [None]:
es_out = pd.concat(
    [es.schema['data'].loc[:, 'text'], es_bad], axis=1)
pt_out = pd.concat(
    [pt.schema['data'].loc[:, 'text'], pt_bad], axis=1)

In [None]:
d.to_csv('es', 'ver')

In [None]:
es_bad.loc[:, 'is_duplicate'].sum()

In [None]:
display(es_out.head())
display(pt_out.head())

In [None]:
b = es_out.loc[~es_bad.loc[:, 'has_verb'], ['text', 'lemmad']]

for i in range(b.shape[0]):
    print(f'CASE:\n')
    print(f'ORIGINAL:\n')
    print(b.iloc[i, 0])
    print(f'LEMMAD:\n')
    print(b.iloc[i, 1])