## HEAD

#### HEAD01 - toggle user settings

In [1]:
## user-tunable switches for the time consuming steps of this notebook
##   collect_data: True  -> run the twitter api pulls in PULL01-03
##                 False -> skip the pulls
settings = dict(collect_data = False)

#### HEAD02 - load libraries

In [2]:
##########==========##########==========##########==========##########==========
import tweepy
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from time import sleep

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

#### HEAD03 - load data files

In [3]:
## read in roster of handles
user_data = pd.read_excel("A_Input/twitter_handles.xlsx")

## read in twitter credentials: one row per credential, looked up by 'item'
credential_strings = pd.read_csv('../api_keys/twitter.csv').set_index('item')
credential_strings = credential_strings['string']

## build the OAuth handler from the four credentials, then create the api client
twitter_credentials = tweepy.OAuth1UserHandler(
    consumer_key = credential_strings.loc['API Key'],
    consumer_secret = credential_strings.loc['API Key Secret'],
    access_token = credential_strings.loc['Access Token'],
    access_token_secret = credential_strings.loc['Access Token Secret'],
    )
api = tweepy.API(twitter_credentials)

  warn(msg)


## HAND

#### HAND01 - extract handles from roster URLs

In [4]:
## strip the site prefix and stray whitespace from each roster url to get the
## handle, then make the handle the index of the roster
roster_handles = user_data['url'].str.replace(
    'https://twitter.com/', '', regex = False)
roster_handles = roster_handles.str.strip()
user_data = user_data.assign(handle = roster_handles).set_index('handle')

## PULL

#### PULL01 - query API for each roster handle's user_timeline data

In [5]:
## extract tweet data from api object
def refine_tweet_data(x):
    """
    Flatten a sequence of tweet objects into a table indexed by tweet id.

    Parameters
    ----------
    x : sequence
        Tweet objects exposing `id`, `created_at`, `lang`, `full_text` and an
        `author` with `screen_name` and `verified` attributes.

    Returns
    -------
    pandas.DataFrame
        One row per tweet, indexed by 'tweet_id', with columns 'created_at',
        'lang', 'full_text', 'screen_name' and 'verified'. An empty `x` yields
        an empty table with the same index name and columns.
    """
    ## naming the columns up front keeps the schema (and the 'tweet_id' column
    ## that set_index needs) in place even when there are no tweets at all
    columns = [
        'tweet_id', 'created_at', 'lang', 'full_text', 'screen_name', 'verified']
    tweet_data = [
        {
            'tweet_id': tweet.id, 'created_at': tweet.created_at,
            'lang': tweet.lang, 'full_text': tweet.full_text,
            'screen_name': tweet.author.screen_name,
            'verified': tweet.author.verified
        }
        for tweet in x
    ]
    return pd.DataFrame(tweet_data, columns = columns).set_index('tweet_id')

## define function to pull user tweet data and apply function to extract tweet data
def pull_tweet_data(x = user_data.index.values, a = api):
    """
    Download timeline tweets for every handle in `x` and stack them in one table.

    Parameters
    ----------
    x : iterable of str
        Screen names to query. The default is taken from the notebook-level
        `user_data` index at the moment this cell is run.
    a : api client
        Object exposing `user_timeline`; defaults to the notebook-level `api`.

    Returns
    -------
    pandas.DataFrame
        The per-handle tables produced by `refine_tweet_data`, concatenated.

    Raises
    ------
    ValueError
        If no handle produced a table (including when `x` is empty).

    Notes
    -----
    A handle whose request or conversion fails is skipped rather than aborting
    the whole pull; skipped handles are reported once at the end.
    """
    tweet_data = list()
    failures = list()
    for handle in x:
        try:
            user_tweets = a.user_timeline(
                screen_name = handle, count = 200, tweet_mode = 'extended',
                exclude_replies = True, include_rts = False)
            tweet_data.append(refine_tweet_data(user_tweets))
            ## pause between requests; only reached after a successful pull
            sleep(0.5)
        except Exception as err:
            ## best effort: remember the failure and move on to the next handle
            ## (Exception, not a bare except, so Ctrl-C still stops the loop)
            failures.append('{} ({})'.format(handle, type(err).__name__))

    if failures:
        print('pull_tweet_data: skipped {} handle(s): {}'.format(
            len(failures), ', '.join(failures)))
    if not tweet_data:
        raise ValueError('pull_tweet_data: no tweets retrieved for any handle')
    return pd.concat(tweet_data)

## execute code: only query the twitter api when collection is switched on;
## otherwise tweet_data is left for PULL03 to load from disk
if settings['collect_data']:
    tweet_data = pull_tweet_data()

#### PULL02 - tabulate tweet statistics

In [6]:
## move verified account data from tweet dataset to user dataset
def summarize_verified(td, ud = user_data, td_full = None):
    """
    Fold per-account tweet statistics into the user table.

    Parameters
    ----------
    td : pandas.DataFrame
        One row per tweet with 'screen_name' and 'verified' columns.
    ud : pandas.DataFrame
        User roster indexed by handle, with a 'url' column. The default is
        bound to the notebook-level `user_data` when this cell is run.
    td_full : pandas.DataFrame, optional
        Tweet table to strip the 'verified' column from. When omitted, the
        notebook-level `tweet_data` is used, as this function always did.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The roster with 'tweets' (tweet count) and 'verified' (mean of the
        verified flag) added, 'url' removed and the index named 'screen_name';
        and the tweet table without its 'verified' column. Roster handles with
        no tweets keep their row and get missing statistics (left merge).
    """
    if td_full is None:
        td_full = tweet_data

    ## calculate tweet summary statistics; named aggregation fixes both the
    ## column names and their order
    stats = td.groupby('screen_name')['verified'].agg(
        tweets = 'size', verified = 'mean')

    ## merge statistics into the user_data object
    ud = pd.merge(ud, stats, left_index = True, right_index = True, how = 'left')

    ## rename_axis labels the index whatever name it carried out of the merge
    ud = ud.drop(['url'], axis = 1).rename_axis('screen_name')

    ## the per-tweet flag now lives on the user table
    td_full = td_full.drop(['verified'], axis = 1)
    return ud, td_full

## execute code: rebinds both notebook tables. the returned tweet_data no longer
## has a 'verified' column, so running this cell a second time raises a KeyError
if settings['collect_data']:
    user_data, tweet_data = summarize_verified(tweet_data[['screen_name', 'verified']])

#### PULL03 - save datasets to disk

In [7]:
## save freshly collected user/tweet datasets to disk as csvs, then always reload
## both from disk. reloading in either case means downstream cells see the same
## csv-derived layout (default integer index, ids/handles as ordinary columns)
## whether or not the api was queried in this session
if settings['collect_data']:
    user_data.to_csv('B_Process/user_data.csv')
    tweet_data.to_csv('B_Process/tweet_data.csv')
user_data = pd.read_csv('B_Process/user_data.csv')
tweet_data = pd.read_csv('B_Process/tweet_data.csv')

## MUNG

#### MUNG01 - Tokenize and Filter Uncommon Tokens

In [8]:
## tokenize
def nlp_tokenize_tweet(x):
    """Lower-case the text `x` and split it into tokens with `word_tokenize`."""
    return word_tokenize(x.lower())


## lemmatize a single word, keeping the raw word if the lemmatizer fails
def _lemmatize_word(lemmatizer, word, pos):
    """
    Return the lemma of `word`, or `word` itself when lemmatization fails.

    `pos` is the lower-cased first letter of the tag given by `pos_tag`.
    Adjective tags begin with 'J' while WordNet codes adjectives as 'a', so 'j'
    is translated before the lookup; every other letter is passed through.
    """
    wordnet_pos = 'a' if pos == 'j' else pos
    try:
        return lemmatizer.lemmatize(word, pos = wordnet_pos)
    except Exception:
        ## best effort: if the lemmatizer rejects the letter, keep the raw word
        return word


## find valid word tokens
def make_word_data(td, valid_quantile = 0.2):
    """
    Build the vocabulary table for the tokenized tweets in `td`.

    Parameters
    ----------
    td : pandas.DataFrame
        Tweet table with a 'words' column holding one list of tokens per tweet.
    valid_quantile : float, default 0.2
        A word is eligible only if its count is strictly above this quantile of
        all word counts.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word, most frequent first, with columns:
        'count' - number of occurrences;
        'valid' - True for eligible words whose tag starts with a letter;
        'word'  - the word itself (same as the index);
        'pos'   - lower-cased first character of the tag, '.' if not tagged;
        'token' - lemma for valid words, None otherwise.
    """
    ## flatten token lists and count occurrences
    all_words = [word for words in td.words for word in words]
    counts = pd.Series(all_words).value_counts()
    counts = counts.sort_values(ascending = False)
    word_data = pd.DataFrame({'count': counts})

    ## determine which tokens occur often enough to warrant inclusion
    threshold = word_data['count'].quantile(valid_quantile)
    word_data['valid'] = word_data['count'] > threshold
    word_data['word'] = word_data.index

    ## determine part of speech for eligible tokens
    eligible = word_data['valid'].copy()
    tagged = pos_tag(word_data.loc[eligible, 'word'].tolist())
    word_data['pos'] = '.'
    word_data.loc[eligible, 'pos'] = [tag[0].lower() for _, tag in tagged]

    ## lemmatize: one pass over the rows, written back in two assignments.
    ## eligible rows whose tag starts with a non-letter (punctuation tags) are
    ## demoted to invalid and get no token
    lemmatizer = WordNetLemmatizer()
    letters = set('abcdefghijklmnopqrstuvwxyz')
    tokens = list()
    keep = list()
    rows = zip(word_data['word'], word_data['pos'], word_data['valid'])
    for word, pos, valid in rows:
        is_word = bool(valid) and pos in letters
        keep.append(is_word)
        if is_word:
            tokens.append(_lemmatize_word(lemmatizer, word, pos))
        else:
            tokens.append(None)
    word_data['token'] = tokens
    word_data['valid'] = keep

    return word_data


## convert tweet words to tokens
## (placeholder: the conversion function has not been written yet)
print('TODO: MAKE THIS FUNCTION')

## execute code: add the token lists to tweet_data in place as a 'words' column,
## then build the vocabulary table from that column
tweet_data['words'] = tweet_data.full_text.apply(nlp_tokenize_tweet)
word_data = make_word_data(td = tweet_data)

TODO: MAKE THIS FUNCTION


In [9]:
## display the vocabulary table built in MUNG01
word_data

Unnamed: 0,count,valid,word,pos,token
.,164533,False,.,.,
the,151741,True,the,d,the
to,132099,True,to,t,to
",",102693,False,",",",",
:,98680,False,:,:,
...,...,...,...,...,...
fast-moving,1,False,fast-moving,.,
//t.co/iimypclyn0,1,False,//t.co/iimypclyn0,.,
effected,1,False,effected,.,
//t.co/mpc6xl1e3h,1,False,//t.co/mpc6xl1e3h,.,


In [11]:
## display the tweet table, which now carries the tokenized 'words' column
tweet_data

Unnamed: 0,tweet_id,created_at,lang,full_text,screen_name,words
0,1550834863804219394,2022-07-23 13:26:59+00:00,en,Don’t forget: Early Voting is is open until 3P...,RepAdams,"[don, ’, t, forget, :, early, voting, is, is, ..."
1,1550577253561647104,2022-07-22 20:23:20+00:00,en,Enjoying the Brotherhood of the Omega Psi Phi ...,RepAdams,"[enjoying, the, brotherhood, of, the, omega, p..."
2,1550286139185061889,2022-07-22 01:06:33+00:00,en,"On #January6th, Secret Service Agents assigned...",RepAdams,"[on, #, january6th, ,, secret, service, agents..."
3,1550218886309728256,2022-07-21 20:39:18+00:00,en,"I'm sick and tired of being sick and tired, an...",RepAdams,"[i, 'm, sick, and, tired, of, being, sick, and..."
4,1550199027710541824,2022-07-21 19:20:24+00:00,en,The Office of Congresswoman Alma Adams is hono...,RepAdams,"[the, office, of, congresswoman, alma, adams, ..."
...,...,...,...,...,...,...
98673,1507035242145763331,2022-03-24 16:42:55+00:00,en,"Another MUST SEE:\n\n ""He's (@JustinTrudeau) e...",bobzimmermp,"[another, must, see, :, ``, he, 's, (, @, just..."
98674,1507033606799171587,2022-03-24 16:36:25+00:00,en,"“Trudeau is, without a doubt, the most polariz...",bobzimmermp,"[“, trudeau, is, ,, without, a, doubt, ,, the,..."
98675,1507032252643000326,2022-03-24 16:31:02+00:00,en,A MUST WATCH https://t.co/aZ1vfUUlYz,bobzimmermp,"[a, must, watch, https, :, //t.co/az1vfuulyz]"
98676,1506387676655759365,2022-03-22 21:49:44+00:00,en,"As a fellow northerner, was honoured to have m...",bobzimmermp,"[as, a, fellow, northerner, ,, was, honoured, ..."
