In [1]:
import json
import re
import numpy as np 
import pandas as pd
from datetime import datetime as dt
from time import sleep

import spacy
spacy.require_gpu()
from torch.utils import dlpack

In [2]:
import importlib.util as imp
import sys

# Load the package that lives in ../twitter-connection/ under the importable
# name `twitter_connection` (the path is relative to the working directory).
# Presumably done this way because the directory name contains a hyphen and so
# cannot be named in a plain `import` statement.
spec = imp.spec_from_file_location(
    'twitter_connection', 
    '../twitter-connection/__init__.py')
twit = imp.module_from_spec(spec)
# Register the module under the spec's name before executing it, so that the
# `from twitter_connection import ...` lines that follow resolve to it.
sys.modules[spec.name] = twit
spec.loader.exec_module(twit)

from twitter_connection import connection as tc
from twitter_connection import response as tr

In [3]:
# Same loading recipe for the nested twitter_data package, registered as the
# top-level module name `twitter_data`.
spec_data = imp.spec_from_file_location(
    'twitter_data',
    '../twitter-connection/twitter_data/__init__.py')
# NOTE(review): this rebinds `twit`, which previously referred to the
# twitter_connection module object; that module stays reachable through
# sys.modules, but a distinct variable name would be clearer.
twit = imp.module_from_spec(spec_data)
sys.modules[spec_data.name] = twit
spec_data.loader.exec_module(twit)

from twitter_data import twitter_data, tweets, users, places

In [4]:
'''
File path to the bearer token. Requires a prefix to identify the
  token, which is just 'PERSONAL $BEARER_TOKEN$' by default -- 
  can be specified upon initialization of TwitterConnection
'''
# NOTE(review): cred_path is not referenced by any later cell visible in this
# notebook (TwitterConnection is only given cred_prefix) -- confirm whether it
# is meant to be passed to the connection or can be removed.
cred_path = r'../twitter-connection/credentials.txt'

In [5]:
# JSON file whose top-level keys are the verb stems used below.
# NOTE(review): absolute, machine-specific path -- must be adjusted before the
# notebook can run anywhere else.
verbstem_path = '/home/rimov/Documents/Code/NLP/lin-que-dropping/extraction/verb-stem-clean.txt'

In [6]:
# Load the verb stems (the keys of the JSON object, in file order).
try:
    with open(verbstem_path, 'r') as f:
        verb_stem = list(json.load(f).keys())
except FileNotFoundError as e:
    print(f'Couldn\'t find file at location specified: '\
          f'\n{e.args} '\
          f'\n!!!Make sure to adjust custom filepaths!!!')
    # Nothing below can be built without the verb list, so stop here with the
    # real error. Previously a `finally:` block ran the derivation anyway and
    # failed with an unrelated NameError on `verb_stem`.
    raise

# All the Portuguese-only verbs
pt_verbs = {'dizer', 'supor', 'duvidar', 'acreditar', 'achar', 'lembrar', 'recear', 'predizer', 'adivinhar', 'conjeturar', 'chutar', 'dar(se) conta', 'desejar', 'oxala', 'tomara'}

# The semantic classes are positional slices of the file: the first 22 entries
# are treated as stative, the last 17 as volitional, and everything in between
# as epistemic. These sizes are tied to the ordering of verb-stem-clean.txt.
verbs_stative = verb_stem[:22]
verbs_volit = verb_stem[len(verb_stem)-17:]
verbs_epistemic = [
    v for v in verb_stem
    if v not in verbs_stative and v not in verbs_volit
]

# Spanish subsets: each class with the Portuguese-only verbs removed.
es_verbs = [v for v in verb_stem if v not in pt_verbs]
es_verbs_stative = [v for v in verbs_stative if v not in pt_verbs]
es_verbs_volit = [v for v in verbs_volit if v not in pt_verbs]
es_verbs_epistemic = [v for v in verbs_epistemic if v not in pt_verbs]

In [7]:
# Search operators handed to set_query as `conditions`; in the request URL
# printed by the retrieval loop they follow the topic inside `query=`.
es_query_cond = 'lang:es has:geo -is:retweet -has:links '
# Portuguese counterpart; only referenced by the commented-out pt_conn code.
pt_query_cond = 'lang:pt has:geo -is:retweet -has:links '
# Field selections handed to set_fields; each is already a complete
# `name=value` URL parameter.
fields_tweet = 'tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets'
fields_expan = 'expansions=author_id,geo.place_id,entities.mentions.username'
fields_user = 'user.fields=created_at,location,public_metrics'
fields_place = 'place.fields=country'

In [8]:
es_verbs

['ver',
 'jurar',
 'decir',
 'confesar',
 'prometer',
 'mostrar',
 'demostrar',
 'afirmar',
 'confirmar',
 'conseguir',
 'lograr',
 'responder',
 'admitir',
 'considerar',
 'asegurar',
 'mencionar',
 'gritar',
 'suspirar',
 'reclamar',
 'contar',
 'suponer',
 'saber',
 'pensar',
 'imaginar',
 'dudar',
 'creer',
 'recordar',
 'acordar',
 'temer',
 'recomendar',
 'parecer',
 'entender',
 'negar',
 'apostar',
 'predecir',
 'prever',
 'sentir',
 'comprobar',
 'adivinar',
 'dar(se) cuenta',
 'lamentar',
 'rogar',
 'querer',
 'esperar',
 'desear',
 'pedir',
 'ojala',
 'suplicar',
 'solicitar',
 'mandar',
 'ordenar',
 'insistir',
 'sugerir',
 'preocupar',
 'alegrar']

In [15]:
# Connection object used for the Spanish pulls; credentials are selected by
# the 'PROF' prefix.
# NOTE(review): is_archive=True presumably selects the full-archive search
# endpoint (the URL printed by the retrieval loop is .../tweets/search/all) --
# confirm against TwitterConnection.
es_conn = tc.TwitterConnection(
    is_archive=True,
    cred_prefix='PROF')

# Portuguese connection, currently disabled.
# pt_conn = tc.TwitterConnection(
#     is_archive=True,
#     cred_prefix='PROF') 

In [16]:
# Attach the Spanish query conditions and the requested response fields to the
# connection before any request is made.
es_conn.set_query(conditions=es_query_cond)
es_conn.set_fields(tweet=fields_tweet, 
                      expansions=fields_expan, 
                      user=fields_user,
                      place=fields_place)

# Portuguese counterpart, currently disabled (pt_conn is not created above).
# pt_conn.set_query(conditions=pt_query_cond)
# pt_conn.set_fields(tweet=fields_tweet, 
#                       expansions=fields_expan, 
#                       user=fields_user,
#                       place=fields_place)

In [17]:
# Local-time stamp of this run, formatted DDMMYYYY-at-HHMM; passed to save_csv
# as `time=` by the retrieval loop (it shows up in the saved file paths).
time = dt.now().strftime('%d%m%Y-at-%H%M')

In [48]:
# Search topics to retrieve; each one is passed to es_conn.connect in turn.
topics = ['$AAPL']

In [49]:
# Topics already retrieved: the retrieval loop skips anything listed here and
# appends each topic once it has been saved. Re-running this cell resets it.
finished = []

In [50]:
# Retrieve every topic that is not yet marked as finished and save the results
# to CSV through Response.save_csv.
for topic in topics:
    if topic in finished:
        continue
    
    print(f'Retrieving topic: {topic}')
    
    # Accumulator that the individual pulls for this topic are appended to.
    response = tr.Response(lang='es', topic=topic)
    
    while True:
        es_conn.connect(topic, is_next=True, time_interval=1)
        
        # While the accumulator's schema is still empty, show the request URL.
        if len(response.schema)==0:
            print(es_conn.url)
        
        # Wrap the raw reply of this request and merge it into the accumulator.
        new = tr.Response(lang='es', 
                          topic=topic, 
                          response=es_conn.response)      
            
        response.append(new)
        
        # Once at least 200 rows are buffered and another page is available,
        # write the buffer out, bump the pull counter and start a fresh
        # accumulator.
        if (response.schema['data'].data.shape[0] >= 200) and es_conn.has_next:
            response.save_csv(time=time, pulls=es_conn.pulls, is_test=True)
            es_conn.pulls+=1
            
            response = tr.Response(lang='es', topic=topic)
            
        if not es_conn.has_next:
            break
        
        # NOTE(review): this unconditional break ends the loop after the first
        # request, so no further page is ever fetched and the has_next check
        # just above is redundant. Looks like a testing limit -- confirm and
        # remove it for a full retrieval.
        break
    
    # Save whatever is still buffered, then reset the counter for the next topic.
    response.save_csv(time=time, pulls=es_conn.pulls, is_test=True)
    es_conn.pulls = 0
    
    finished.append(topic)

Retrieving topic: $AAPL
No next token! ('next_token',)
https://api.twitter.com/2/tweets/search/all?query=$AAPL lang:es has:geo -is:retweet -has:links &max_results=100&tweet.fields=lang,geo,created_at,public_metrics,referenced_tweets&expansions=author_id,geo.place_id,entities.mentions.username&user.fields=created_at,location,public_metrics&place.fields=country
2
test/es/13072021-at-1304/es-$AAPL-original-tweets-2-0.csv
2
test/es/13072021-at-1304/es-$AAPL-original-users-2-0.csv
2
test/es/13072021-at-1304/es-$AAPL-original-places-2-0.csv


### Lemmatization and partitioning

In [24]:
%ls

extraction-test.ipynb  [0m[01;34mtest[0m/                     verb-stem-clean.txt
[01;34msaved[0m/                 twitter-extracting.ipynb  verb-stem-raw.txt


In [25]:
import os
# Directory whose CSV files are loaded below; it is concatenated directly with
# file names, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path pinned to one run's timestamp
# directory -- adjust before running elsewhere.
prefix = '/home/rimov/Documents/Code/NLP/lin-que-dropping/extraction/test/es/13072021-at-1304/'

In [35]:
# File names in `prefix`, grouped by the kind of CSV they hold.
# These are sorted lists rather than generators: a generator is exhausted
# after one pass, so re-running a later cell would silently iterate nothing,
# and os.listdir returns entries in arbitrary order.
_dir_listing = sorted(os.listdir(prefix))
path_tweets_orig = [f for f in _dir_listing if 'original-tweets' in f]
path_tweets_norm = [f for f in _dir_listing if 'normalized-tweets' in f]
path_users = [f for f in _dir_listing if 'users' in f]
path_places = [f for f in _dir_listing if 'places' in f]

In [36]:
# Load every "original tweets" CSV together with its normalized counterpart
# and accumulate them into a single Tweets object.
text = None

for path in path_tweets_orig:
    print(path)

    # The topic is whatever sits between the 'es-' language prefix and the
    # '-original-tweets' suffix. Anchoring on the suffix (instead of a fixed
    # character class) also handles topics such as '$AAPL', which the old
    # pattern could not match and therefore crashed on.
    match = re.search(r'es-(.+?)-original-tweets', path)
    if match is None:
        raise ValueError(f'Cannot extract a topic from file name {path!r}')
    topic = match.group(1)

    # Pick the normalized file whose name contains the topic (the last match
    # wins); an empty string is passed on when there is none.
    normalized = ''
    for n in path_tweets_norm:
        if topic in n:
            normalized = n

    loaded = tweets.Tweets.from_csv(prefix, '~', path, normalized)
    if text is None:
        # First file: becomes the accumulator.
        text = loaded
    else:
        text.append(loaded)

es-PLACE_COUNTRY:MX-original-tweets-100-0.csv


In [39]:
# Read every places CSV ('~'-separated) and stack them into one DataFrame.
# Previously each file overwrote the last, so only the final one survived, and
# an unused topic regex crashed on file names such as 'es-$AAPL-...'.
# NOTE(review): the name `places` also refers to the module imported from
# twitter_data earlier in the notebook; this assignment rebinds it to the
# DataFrame (or None when no file is found) for all later cells.
place_frames = []

for path in path_places:
    print(path)
    place_frames.append(pd.read_csv(prefix+path, sep='~'))

places = pd.concat(place_frames, ignore_index=True) if place_frames else None

es-PLACE_COUNTRY:MX-original-places-42-0.csv


In [40]:
display(text.data.head(3))
display(places.head(3))

Unnamed: 0,tweet_id,created_at,author_id,lang,text_orig,referenced_tweets,place_id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,mentions
0,1414896866186125315,2021-07-13T10:37:55.000Z,1027409451890880512,es,@kikemireles Buenos Días Kike 🙋🏽‍♀️ 🙌 ☕,"[{'type': 'replied_to', 'id': '141489595665557...",63b42cf9e0b05dad,0,0,1,0,"[{'start': 0, 'end': 12, 'username': 'kikemire..."
1,1414896717523193860,2021-07-13T10:37:19.000Z,1267332859595042816,es,"@bytelord_mex @Zoroastro_2021 Si, tienes toda ...","[{'type': 'replied_to', 'id': '141462842938379...",e2be9fc867f73d8a,0,0,0,0,"[{'start': 0, 'end': 13, 'username': 'bytelord..."
2,1414896672870633479,2021-07-13T10:37:08.000Z,69162894,es,@ChefOmarSaG B. Seguro dejaste la ropa afuera ...,"[{'type': 'replied_to', 'id': '141474902225494...",7bc29846eabda1f2,0,0,1,0,"[{'start': 0, 'end': 12, 'username': 'ChefOmar..."


Unnamed: 0,country,location,place_id
0,Mexico,"Boca del Río, Veracruz de Ignacio de la Llave",63b42cf9e0b05dad
1,Mexico,"Villa de Tamazulápam del Progreso, Oaxaca",e2be9fc867f73d8a
2,Mexico,"Metepec, México",7bc29846eabda1f2


In [41]:
places['country'].unique()

array(['Mexico'], dtype=object)

In [60]:
# Apply `nlp_es` to every normalized tweet text; the results are iterated as
# tokens (with .pos_, .lemma_ and .text) by the helper functions further down.
# NOTE(review): `nlp_es` is not defined in any cell visible here -- presumably
# a spaCy Spanish pipeline created with spacy.load(...). It must be defined
# above this cell for a fresh-kernel run to work.
processed = text.normalized.loc[:, 'text_normd'].apply(nlp_es)

In [61]:
processed.shape[0]

5160

In [83]:
# Rebind the verb lists as sets for fast membership tests (count_verbs checks
# membership in es_verbs). The original list order is lost from here on.
es_verbs = set(es_verbs)
# NOTE(review): es_verbs_volit is not read by any later cell visible here.
es_verbs_volit = set(es_verbs_volit)

{'acordar',
 'adivinar',
 'admitir',
 'afirmar',
 'alegrar',
 'apostar',
 'asegurar',
 'comprobar',
 'confesar',
 'confirmar',
 'conseguir',
 'considerar',
 'contar',
 'creer',
 'dar(se) cuenta',
 'decir',
 'demostrar',
 'desear',
 'dudar',
 'entender',
 'esperar',
 'gritar',
 'imaginar',
 'insistir',
 'jurar',
 'lamentar',
 'lograr',
 'mandar',
 'mencionar',
 'mostrar',
 'negar',
 'ojala',
 'ordenar',
 'parecer',
 'pedir',
 'pensar',
 'predecir',
 'preocupar',
 'prever',
 'prometer',
 'querer',
 'reclamar',
 'recomendar',
 'recordar',
 'responder',
 'rogar',
 'saber',
 'sentir',
 'solicitar',
 'sugerir',
 'suplicar',
 'suponer',
 'suspirar',
 'temer',
 'ver'}

In [54]:
def get_verbs(tokenized):
    """Collect the lemmas of all verb tokens, preserving their order.

    Parameters
    ----------
    tokenized : iterable
        Tokens exposing ``pos_`` and ``lemma_`` attributes.

    Returns
    -------
    list
        ``lemma_`` of each token whose ``pos_`` equals ``'VERB'``.
    """
    lemmas = []
    for token in tokenized:
        if token.pos_ == 'VERB':
            lemmas.append(token.lemma_)
    return lemmas

In [55]:
def get_pos_tags(tokenized):
    """Render the tokens as one space-separated ``text-(POS)`` string.

    Tokens tagged ``'PUNCT'`` or ``'SPACE'`` are left out.

    Parameters
    ----------
    tokenized : iterable
        Tokens exposing ``text`` and ``pos_`` attributes.

    Returns
    -------
    str
        For example ``'Hola-(INTJ) mundo-(NOUN)'``; empty when nothing is kept.
    """
    tagged = []
    for tok in tokenized:
        if tok.pos_ == 'PUNCT' or tok.pos_ == 'SPACE':
            continue
        tagged.append(f'{tok.text}-({tok.pos_})')
    return ' '.join(tagged)

In [84]:
def count_verbs(verbs, vocab=None):
    """Count how many entries of ``verbs`` belong to a verb vocabulary.

    Parameters
    ----------
    verbs : iterable
        Verb lemmas as strings (what ``get_verbs`` returns). Non-string
        items are treated as sequences with the lemma at index 1.
    vocab : container, optional
        Lemmas to look for. Defaults to the notebook-level ``es_verbs``.

    Returns
    -------
    int
        Number of entries whose lemma is in ``vocab`` (repeats count).
    """
    if vocab is None:
        vocab = es_verbs

    count = 0
    for v in verbs:
        # Compare the whole lemma. Indexing a plain string with [1] would
        # test only its second character, and fail on one-letter lemmas.
        lemma = v if isinstance(v, str) else v[1]
        if lemma in vocab:
            count += 1

    return count

In [85]:
# Per-tweet list of verb lemmas.
verbs = processed.apply(get_verbs).rename('verbs') 
# Per-tweet string of "text-(POS)" pairs.
pos = processed.apply(get_pos_tags).rename('pos')
# Per-tweet count returned by count_verbs.
# NOTE(review): the column is named 'volitional', but count_verbs tests
# membership in es_verbs (all Spanish verbs), not es_verbs_volit -- confirm
# which vocabulary is intended.
verbs_volit = verbs.apply(count_verbs).rename('volitional') 

In [86]:
# Put the derived columns next to the normalized tweets; concat with axis=1
# aligns the pieces on their index.
out = pd.concat([text.normalized, pos, verbs, verbs_volit], axis=1)

In [89]:
(out.loc[:, 'volitional']>0).sum()

1502