# Text Mine Twitter - NFL Search Patterns - PLAYER


Ryan Timbrook (RTIMBROO)  
DATE:11/30/2019 <br>
Topic: Search Twitter for tweets on specific NFL Players, Coaches, and Teams


## 1. Objective
_____________________________________________________________________________________________
Capture popular opinion from people's tweets about selected NFL figures (players, coaches, and teams).
Create a corpus of tweets for sentiment analysis.


______________________________________________________________________________________________
### Coding Environment Setup
Import packages

In [None]:
# import packages for analysis and modeling
import pandas as pd #data frame operations
import numpy as np #arrays and math functions
import requests
import os
import io
import pickle
import re
import sys
import string
import json

from os import path
from datetime import date
from datetime import time
from datetime import datetime

# Twython packages for twitter
from twython import Twython

In [None]:
# custom python packages
import rtimbroo_utils as br             # custome python helper functions

In [None]:
# Notebook-wide identifiers; the notebook, report and app names all share one value.
notebook_file_name = report_file_name = app_name = 'search_twitter_nfl_player_premium'
log_level = 10  # 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL

# Working directory layout, relative to the current working directory.
dataDir, outputDir, configDir = './data', './output', './config'
logOutDir, imageDir, modelDir, corpusDir = './logs', './images', './models', './corpus'

# Create every base directory that is not already present.
for _base_dir in (outputDir, logOutDir, imageDir, modelDir, dataDir, configDir, corpusDir):
    if not os.path.exists(_base_dir):
        os.mkdir(_base_dir)

In [None]:
# Obtain the notebook's logger from the helper package; it is given the log directory
# (with a trailing slash), the application name and the configured log level.
logger = br.getFileLogger(f'{logOutDir}/', app_name, level=log_level)

In [None]:
# Current UTC timestamp in ISO format, and its leading YYYY-MM-DD part as the collection date.
now = datetime.utcnow().isoformat()
collection_date = re.match('^[0-9]{4}-[0-9]{2}-[0-9]{2}', now)[0]
collection_date

## 2. OBTAIN the data   
________________________________________________________________________________________________
Import external datasets for evaluation

#### Twitter Search API Limits:
[Sandbox Package](https://developer.twitter.com/en/pricing/search-fullarchive)

* Time frame:	Full history
* Tweets per request:	100
* Counts vs. data:	Data only
* Query length:	128 characters
* Operator availability:	Standard
* Rate limit per minute:	30 requests/min
* Enrichments:	n/a
* Dev environments:	1	
* Monthly Tweet cap:	5k	
* Rate limit per second: 10 requests/sec

### Instantiate Twitter API Object
Using Twython 3.6 Twitter API Wrapper
[Twython 3.6.0 reference documentation](https://twython.readthedocs.io/en/latest/api.html)

Twython, currently, has two main interfaces:

* Twitter’s Core API (updating statuses, getting timelines, direct messaging, etc)
* Twitter’s Streaming API<br>

**Core Interface**<br>
class twython.Twython(app_key=None, app_secret=None, oauth_token=None, oauth_token_secret=None, access_token=None, token_type='bearer', oauth_version=1, api_version='1.1', client_args=None, auth_endpoint='authenticate')

__init__(app_key=None, app_secret=None, oauth_token=None, oauth_token_secret=None, access_token=None, token_type='bearer', oauth_version=1, api_version='1.1', client_args=None, auth_endpoint='authenticate')

Parameters:	
* app_key – (optional) Your application's key
* app_secret – (optional) Your application's secret key
* oauth_token – (optional) When using OAuth 1
* client_args – (optional) Accepts some requests Session parameters
* auth_endpoint – (optional) Lets you select which authentication endpoint your application uses (default: 'authenticate')

In [None]:
# Read the Twitter credentials from the JSON file in the config directory.
with open(f'{configDir}/twitter_credentials.json', 'r') as cred_file:
    tw_cred = json.loads(cred_file.read())

# Client arguments passed along to the API wrapper: a custom User-Agent header and a timeout.
client_args = dict(
    headers={'User-Agent': 'AI_Public_Sentiment'},
    timeout=300,
)

# Search endpoint URLs.
base_url = "https://api.twitter.com/1.1/tweets/search/"

# Standard search (search_basic_standard_7day_free).
dev_7day_standard_url = "https://api.twitter.com/1.1/tweets/search/tweets.json"

# Premium API format: POST /search/:product/:label
# 30-day product (search_tweets_30_day_dev).
dev_30day_sandbox_url = "https://api.twitter.com/1.1/tweets/search/30day/sandbox.json"

# Full-archive product, dev label (search_tweets_fullarchive_dev).
dev_full_archive_url = "https://api.twitter.com/1.1/tweets/search/fullarchive/devfullarchive.json"

# Full-archive product, prod label (search_tweets_fullarchive_prod).
prod_full_archive_url = "https://api.twitter.com/1.1/tweets/search/fullarchive/tweets.json"

Twython Object Resources:
* [twython api.py](https://github.com/ryanmcgrath/twython/blob/master/twython/api.py)

In [None]:
# Instantiate the Twython client used for every premium search request in this notebook.
# It is configured with the bearer token read from the credentials file, OAuth version 2,
# the 'authorize' auth endpoint and the shared client_args (User-Agent header, timeout).
# The OAuth 1 key/secret/token arguments are intentionally left disabled below.
py_tweets_premium =  Twython(   
    #app_key=tw_cred['CONSUMER_KEY'],
    #app_secret=tw_cred['CONSUMER_SECRET'],
    #oauth_token=tw_cred['ACCESS_TOKEN'],
    #oauth_token_secret=tw_cred['ACCESS_SECRET'],
    access_token=tw_cred['BEARER_TOKEN'],
    token_type='bearer',
    auth_endpoint='authorize',                  
    oauth_version=2,
    client_args=client_args)


# Disabled credential check; note that `test_twython` is not defined anywhere in this notebook.
#logger.info(f'{test_twython.verify_credentials()}')

### Configure Search Terms

In [None]:
# What is being searched: the entity type and the entity key (both are used to build
# the output directory path later in the notebook).
nfl_type, search_on = 'player', 'deshaun_watson'

# Base Twitter search query text.
search_terms = "deshaun watson"

# Optional settings, currently disabled:
# - exclude retweets from the search criteria
#filtered_search_terms = search_terms + " -filter:retweets"

# - start date (limits to the last 7 days)
#search_start_date = '2019-11-23'

# - number of tweets to return (sandbox rate limit - 100 tweets per request)
#num_tweets = 100

### Method	Description
* POST /search/:product/:label	
    * Retrieve Tweets matching the specified query.
* POST /search/:product/:label/counts	
    * Retrieve the number of Tweets matching the specified query.

Where:
* :product indicates the search endpoint you are making requests to, either 30day or fullarchive.
* :label is the (case-sensitive) label associated with your search developer environment, as displayed at https://developer.twitter.com/en/account/environments.

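For example, if using the 30-day endpoint and your dev environment has a label of 'dev' (short for development), the search URLs would be `https://api.twitter.com/1.1/tweets/search/30day/dev.json` for data requests and `https://api.twitter.com/1.1/tweets/search/30day/dev/counts.json` for counts requests.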


In [None]:
# Weekly search windows as (from_date, to_date) pairs, Sunday to Sunday.
# Dates are 12-digit strings (presumably yyyyMMddHHmm - confirm against the API docs).
# Weeks 1-13 of the 17-week NFL schedule are active; weeks 14-17 are kept but disabled.
search_date_ranges = [
    ('201909010000', '201909080000'),   # week 1  --- dev_full_archive_url
    ('201909080000', '201909150000'),   # week 2  --- dev_full_archive_url
    ('201909150000', '201909220000'),   # week 3  --- dev_full_archive_url
    ('201909220000', '201909290000'),   # week 4  --- dev_full_archive_url
    ('201909290000', '201910060000'),   # week 5  --- dev_full_archive_url
    ('201910060000', '201910130000'),   # week 6  --- dev_full_archive_url
    ('201910130000', '201910200000'),   # week 7  --- dev_full_archive_url
    ('201910200000', '201910270000'),   # week 8  --- dev_full_archive_url
    ('201910270000', '201911030000'),   # week 9  --- dev_full_archive_url
    ('201911030000', '201911100000'),   # week 10 --- dev_30day_sandbox_url
    ('201911100000', '201911170000'),   # week 11 --- dev_30day_sandbox_url
    ('201911170000', '201911240000'),   # week 12 --- dev_30day_sandbox_url
    ('201911240000', '201912010000'),   # week 13 --- dev_30day_sandbox_url
    #('201912010000','201912080000'),   # week 14
    #('201912080000','201912150000'),   # week 15
    #('201912150000','201912220000'),   # week 16
    #('201912220000','201912290000'),   # week 17
]

# The four most recent windows are searched through the 30-day endpoint ...
search_30day = search_date_ranges[-4:]
logger.debug(f'{search_30day}')

# ... and every earlier window through the full-archive endpoint.
search_fullarchive = search_date_ranges[:-4]
logger.debug(f'{search_fullarchive}')

In [None]:
def convert_str_date(str_date):
    """Break a space-separated timestamp string into date and time components.

    The input is expected to look like 'Sat Nov 23 21:24:19 +0000 2019':
    weekday name, abbreviated month, day of month, time of day, offset, year.

    Returns a 6-tuple of strings:
    (date as 'Y-M-D' without zero padding, time of day, year, month number,
    day of month, weekday number where Monday is 0).
    """
    import time

    fields = str_date.split(' ')
    month, day_of_month, time_of_day = fields[1], fields[2], fields[3]
    year = fields[-1]

    # Only the calendar date is parsed; the time of day is passed through untouched.
    parsed = time.strptime(f'{month} {day_of_month}, {year}', '%b %d, %Y')
    year_str, month_str, day_str = str(parsed.tm_year), str(parsed.tm_mon), str(parsed.tm_mday)

    return (
        '-'.join((year_str, month_str, day_str)),
        time_of_day,
        year_str,
        month_str,
        day_str,
        str(parsed.tm_wday),
    )
    

In [None]:
#convert_str_date('Sat Nov 23 21:24:19 +0000 2019')

In [None]:
def config_query(search_term,since=None,until=None,count=100,lang='en',result_type='mixed'):
    """Build the parameter dictionary for a standard (basic) Twitter search.

    Parameters:
        search_term: query text, stored under 'q'.
        since: from-date, stored under 'since'.
        until: date in YYYY-MM-DD format; returns tweets created before the given date.
        count: results per page (max is 100, default is 15 per page).
        lang: language code, stored under 'lang'.
        result_type: one of mixed, recent, popular.

    Returns:
        dict with the keys q, since, until, lang, result_type and count.
    """
    # Other parameters that are not set here:
    #   since_id - returns results with an ID more recent than the specified ID
    #   max_id   - returns results with an ID older than or equal to the specified ID
    search = dict(
        q=search_term,
        since=since,
        until=until,
        lang=lang,
        result_type=result_type,
        count=count,
    )

    logger.debug(f'config_query: search:\n{search}')
    return search

In [None]:
# Utility helpers used to build the Twitter JSON API request parameters.
def insert_str(string, str_to_insert, index):
    """Return `string` with `str_to_insert` spliced in before position `index`."""
    head, tail = string[:index], string[index:]
    return head + str_to_insert + tail

def make_search_params_str(q,fromDate,toDate,maxResults,next_page=None):
    """Serialize premium-search request parameters as a JSON object string.

    Parameters:
        q: search query text, sent as "query".
        fromDate: start of the search window, sent as "fromDate".
        toDate: end of the search window, sent as "toDate".
        maxResults: number of results requested per page, sent as a JSON number.
        next_page: pagination token from a previous response; the "next" key is
            only included when this is not None.

    Returns:
        str: the JSON document, keys in the order query, fromDate, toDate,
        maxResults and (optionally) next.

    Raises:
        ValueError: if maxResults is a string that is not an integer.
    """
    # The serializer escapes quotes, backslashes and control characters in the
    # values; hand-formatting the string produced invalid JSON for such queries.
    # The default ASCII-only output also keeps the request body encoding-safe.
    payload = {
        'query': str(q),
        'fromDate': str(fromDate),
        'toDate': str(toDate),
        # maxResults was always emitted unquoted, so a numeric string is still sent as a number.
        'maxResults': int(maxResults) if isinstance(maxResults, str) else maxResults,
    }
    if next_page is not None:
        payload['next'] = str(next_page)

    params = json.dumps(payload)

    logger.debug(f'{params}')
    return params

In [None]:
# Token classifiers, compiled once and shared by every call.
_HASH_TAG_RE = re.compile(r'^#..+')    # '#' followed by at least two characters
_URL_RE = re.compile(r'^https?:')      # token starts with an http: or https: scheme
_AT_TAG_RE = re.compile(r'^@.+')       # '@' followed by at least one character
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_tweet_text_meta(text):
    """Clean tweet text of URLs and @tags and collect the removed metadata.

    The text is split on single spaces (newlines count as spaces) and every
    token is classified:
      * hashtag - recorded without punctuation; the token minus its leading
        '#' stays in the cleaned text because hashtags carry meaning;
      * URL     - recorded and dropped from the cleaned text;
      * @tag    - recorded and dropped from the cleaned text;
      * other   - kept unchanged for later cleaning steps.

    Parameters:
        text: tweet text; None or an empty string is treated as "no text".

    Returns:
        tuple: (cleaned_text, urls, hash_tags, at_tags) where cleaned_text is a
        str and the other three are lists of str.
    """
    cleaned_text = []
    urls = []
    hash_tags = []
    at_tags = []

    if not text:
        logger.warning(f'clean_tweet_text_meta: ***WARNING***: text is empty: {text}')
        return '', urls, hash_tags, at_tags

    tokens = text.replace('\n', ' ').split(' ')
    logger.info(f'tokens: {tokens}')

    for tok in tokens:
        if _HASH_TAG_RE.match(tok):
            hash_tags.append(tok.translate(_PUNCTUATION_TABLE))
            cleaned_text.append(tok[1:])
        elif _URL_RE.match(tok):
            # The previous pattern '^http*' meant "htt" plus any number of "p",
            # which also swallowed ordinary words starting with "htt".
            urls.append(tok)
        elif _AT_TAG_RE.match(tok):
            at_tags.append(tok)
        else:
            cleaned_text.append(tok)

    logger.debug(f'clean_tweet_text_meta: cleaned tokens: {cleaned_text}')

    # join cleaned text back together
    return ' '.join(cleaned_text), urls, hash_tags, at_tags
    

In [None]:
# test clean_tweet_text_meta
#test_tweet = "Attention, I LOVE WHAT I DO, Thats all Carry On!!! #bullsonparade. #Texans #1 in the Division Houston TEXANS @ Humb… https://t.co/qZOYnP424X"
#test_tweet2 = "@NYYfan2442 @_jliendro @OkcGhost @FieldYates Would be tough to beat KC but we could beat Houston. Not really impres… https://t.co/pK2avVllxc"
#tw = clean_tweet_text_meta(test_tweet2)

Twython Resource Docs:
* [Setup Twython To Post Tweet And Search Tweets](https://code.luasoftware.com/tutorials/python/setup-twython-to-post-tweet-and-search-tweets/)

In [None]:
def search_twitter(endpoint,params):
    """POST a search request through the premium Twython client.

    Parameters:
        endpoint: full URL of the search endpoint.
        params: request parameters handed to the client's post() call.

    Returns:
        The response object returned by the client; its keys are logged at
        debug level. (The return statement used to be commented out, so the
        function always returned None and its result could not be used.)
    """
    logger.debug(f'search_twitter: endpoint:{endpoint} | params:{params}')
    search_results = py_tweets_premium.post(endpoint,params)
    logger.debug(f'keys: {search_results.keys()}')
    return search_results

In [None]:
def _extract_tweet_text(result):
    """Return the fullest text available in one tweet payload."""
    # Preference order: full text of the retweeted tweet, the tweet's own
    # extended full text, then the (possibly truncated) 'text' field.
    for key_path in (('retweeted_status', 'extended_tweet', 'full_text'),
                     ('extended_tweet', 'full_text')):
        node = result
        try:
            for key in key_path:
                node = node[key]
        except (KeyError, TypeError):
            continue
        return node
    return result['text']


def _build_tweet_rows(result, text):
    """Build the (tweet_row, meta_row) dictionaries for one tweet payload."""
    tweet_id = result['id_str']
    screen_name = result['user']['screen_name']
    created_at = result['created_at']

    date_str, time_str, year, month, day_of_month, day_of_week = convert_str_date(created_at)
    cleaned_text, urls, hash_tags, at_tags = clean_tweet_text_meta(text)

    tweet_row = {
        'id': tweet_id,
        'created_at': created_at,
        'date': date_str,
        'time': time_str,
        'user': screen_name,
        'text': cleaned_text,
        # NOTE(review): this is the author's 'favourites_count', not a per-tweet
        # favorite count - confirm that this is the intended value.
        'favorite_count': result['user']['favourites_count'],
        # time-series attributes for granular reporting and visualizations
        'year': year,
        'month': month,
        'day_of_month': day_of_month,
        'day_of_week': day_of_week,
    }
    meta_row = {
        'id': tweet_id,
        'date': date_str,
        'user': screen_name,
        'urls': urls,
        'hash_tags': hash_tags,
        'at_tags': at_tags,
    }
    return tweet_row, meta_row


def page_search(endpoint,q,fromDate,toDate,maxResults,text_file,raw_file,next_page=None):
    """Fetch one page of premium search results and record it.

    For every tweet on the page the raw JSON is appended to `raw_file` (one
    document per line) and '<id> <text>' is appended to `text_file`; the key
    attributes are collected into two DataFrames.

    Parameters:
        endpoint: full URL of the search endpoint.
        q: search query text.
        fromDate, toDate: bounds of the search window.
        maxResults: number of results requested for this page.
        text_file: path of the tweet text file (opened in append mode, UTF-8).
        raw_file: path of the raw JSON file (opened in append mode, UTF-8).
        next_page: pagination token from the previous page, or None for the
            first page of a search.

    Returns:
        tuple: (tweets DataFrame, tweet text metadata DataFrame, next token).
        The next token is None when the response carries no 'next' key.
    """
    logger.debug(f'page_search: {endpoint} | {q} | {fromDate} | {toDate} | {maxResults} | {text_file} | {raw_file} | {next_page}')

    # Pass the caller's query and pagination token through. Previously the global
    # search terms and next_page=None were used, so every call re-fetched page one.
    params = make_search_params_str(q,fromDate,toDate,maxResults,next_page=next_page)

    _results = py_tweets_premium.post(endpoint,params)
    results = _results.get('results', [])
    logger.debug(f'page_search: result length: {len(results)}')

    next_page = _results.get('next')
    if next_page is None:
        logger.debug('page_search: no next pages...')

    tweets_dict = {'id':[],'created_at':[],'date':[],'time':[],'user':[],'text':[],'favorite_count':[], 'year':[], 'month':[], 'day_of_month':[], 'day_of_week':[]}
    tweets_text_metadata_dict = {'id':[],'date':[],'user':[],'urls':[],'hash_tags':[],'at_tags':[]}
    page_cnt = 0      # results seen on this page
    result_cnt = 0    # results successfully recorded from this page

    with io.open(f'{text_file}', 'a', encoding='utf8') as f, io.open(f'{raw_file}', 'a', encoding='utf8') as r:
        for i, result in enumerate(results):
            page_cnt += 1
            try:
                # dump raw tweet to file as json
                r.write(json.dumps(result))
                r.write('\n')

                # dump tweet text to file
                text = _extract_tweet_text(result)
                f.write(f'{result["id_str"]} {text}')
                f.write('\n')

                # Build the complete record first so a malformed tweet can never
                # leave the column lists with different lengths.
                tweet_row, meta_row = _build_tweet_rows(result, text)
            except (KeyError, IndexError, TypeError, ValueError) as err:
                logger.warning(f'**WARNING** page_search: skipping result [{i}]: {err!r}')
                continue

            logger.debug(f'{tweet_row["id"]} | {tweet_row["user"]} | {tweet_row["created_at"]} | {text} | {tweet_row["favorite_count"]}')

            for column, value in tweet_row.items():
                tweets_dict[column].append(value)
            for column, value in meta_row.items():
                tweets_text_metadata_dict[column].append(value)
            result_cnt += 1

    logger.info(f'page_search: processed page_cnt:[{page_cnt}] | total result_cnt: [{result_cnt}] | next_page: {next_page}')

    return pd.DataFrame.from_dict(tweets_dict), pd.DataFrame.from_dict(tweets_text_metadata_dict), next_page
    

## Execute Twitter Search

[Twython 3.6.0 reference documentation](https://twython.readthedocs.io/en/latest/api.html)

In [None]:
# Version counter used in the output directory name (.../v{search_iteration}/...), so that
# a new search run does not overwrite potentially useful data written by an earlier run.
# Run this cell once, then comment it out: re-running it resets the counter to 0.
search_iteration = 0

## Execute 30day Search

In [None]:
# Execute the Twitter search over the pre-configured date ranges.
# ------  30day Search ------

# static search terms
endpoint = dev_30day_sandbox_url
search_terms = "deshaun watson"
maxResults = 100
max_follow_up_pages = 4   # pages requested after the first one, per date range

search_iteration += 1 # each time this block of code is run, the search iteration will update and create a new output directory structure
search_range_results_df = pd.DataFrame()
search_tweets_text_meta_df = pd.DataFrame()

# execute search by date ranges
for dates in search_30day:
    search_range = f'{dates[0]}_{dates[1]}'
    logger.info(f'search_range: {search_range}')

    # output file names based on date range search
    outputPath = f'{dataDir}/{nfl_type}/{search_on}/v{search_iteration}/{search_range}'
    os.makedirs(outputPath, exist_ok=True)

    tweet_filename = f'{outputPath}/tweet_text.txt'
    raw_filename = f'{outputPath}/tweet_raw.txt'

    # make sure both output files exist even when the search returns nothing
    for _out_file in (tweet_filename, raw_filename):
        with open(_out_file, 'a'):
            pass

    # Fetch the first page, then follow the pagination token. Every page is merged
    # exactly once; previously the first page was dropped whenever a next page
    # existed, the last page was merged twice, and a missing token raised TypeError.
    next_token = None
    page_cnt = 0
    while True:
        #------- EXECUTES TWITTER SEARCH -------------------#
        result_df = page_search(endpoint,search_terms,dates[0],dates[1],maxResults,tweet_filename,raw_filename,next_token)
        logger.debug(f'search result df: \n{result_df}')

        # merge dataframes - complete table of search results collected and written out to csv file in code block below
        search_range_results_df = pd.concat([search_range_results_df, result_df[0]], ignore_index=True)
        search_tweets_text_meta_df = pd.concat([search_tweets_text_meta_df, result_df[1]], ignore_index=True)

        next_token = result_df[2]
        # a usable token is longer than two characters (same threshold as before)
        if next_token is None or len(next_token) <= 2:
            break
        if page_cnt == max_follow_up_pages:
            break
        page_cnt += 1
        logger.info(f'next page search: page_cnt: {page_cnt}')

## Execute full archive search

In [None]:
# Execute the Twitter search over the pre-configured date ranges.
# ------  Full Archive Search ------
# The results are added to the tables and the output version directory that the
# 30-day search cell created, so that cell has to run first.

# static search terms
endpoint = dev_full_archive_url
search_terms = "deshaun watson"
# NOTE(review): the sandbox limits listed earlier in this notebook allow 100 tweets per
# request - confirm that the environment behind this endpoint accepts 500.
maxResults = 500
max_follow_up_pages = 4   # pages requested after the first one, per date range

#search_iteration +=1 # each time this block of code is run, the search iteration will update and create a new output directory structure
#search_range_results_df = pd.DataFrame()
#search_tweets_text_meta_df = pd.DataFrame()

# execute search by date ranges
for dates in search_fullarchive:
    search_range = f'{dates[0]}_{dates[1]}'
    logger.info(f'search_range: {search_range}')

    # output file names based on date range search
    outputPath = f'{dataDir}/{nfl_type}/{search_on}/v{search_iteration}/{search_range}'
    os.makedirs(outputPath, exist_ok=True)

    tweet_filename = f'{outputPath}/tweet_text.txt'
    raw_filename = f'{outputPath}/tweet_raw.txt'

    # make sure both output files exist even when the search returns nothing
    for _out_file in (tweet_filename, raw_filename):
        with open(_out_file, 'a'):
            pass

    # Fetch the first page, then follow the pagination token. Every page is merged
    # exactly once; previously the first page was dropped whenever a next page
    # existed, the last page was merged twice, and a missing token raised TypeError.
    next_token = None
    page_cnt = 0
    while True:
        #------- EXECUTES TWITTER SEARCH -------------------#
        result_df = page_search(endpoint,search_terms,dates[0],dates[1],maxResults,tweet_filename,raw_filename,next_token)
        logger.debug(f'search result df: \n{result_df}')

        # merge dataframes - complete table of search results collected and written out to csv file in code block below
        search_range_results_df = pd.concat([search_range_results_df, result_df[0]], ignore_index=True)
        search_tweets_text_meta_df = pd.concat([search_tweets_text_meta_df, result_df[1]], ignore_index=True)

        next_token = result_df[2]
        # a usable token is longer than two characters (same threshold as before)
        if next_token is None or len(next_token) <= 2:
            break
        if page_cnt == max_follow_up_pages:
            break
        page_cnt += 1
        logger.info(f'next page search: page_cnt: {page_cnt}')

In [None]:
# Preview the first rows of the combined tweet results table.
search_range_results_df.head()

In [None]:
# Preview the first rows of the combined tweet text metadata table.
search_tweets_text_meta_df.head()

In [None]:
# Order the results by month, day of month and day of week.
# These columns hold numbers stored as strings, so they are converted to integers for
# ordering; a plain string sort places month '9' after months '10' and '11'.
sort_columns = ['month','day_of_month','day_of_week']
numeric_order = search_range_results_df[sort_columns].astype(int).sort_values(by=sort_columns, ascending=True).index
# Relies on the unique row labels produced by merging with ignore_index=True.
sorted_search_df = search_range_results_df.loc[numeric_order]
sorted_search_df.head(20)

In [None]:
# Log the (rows, columns) shape of the combined tweet results table.
logger.info(f'search_range_results_df shape: {search_range_results_df.shape}')

In [None]:
# Report the remaining rate-limit allowance from the most recent API call.
# `py_tweets` is not defined in this notebook; the client created above is `py_tweets_premium`.
logger.info(f'x-rate-limit-remaining: [{py_tweets_premium.get_lastfunction_header("x-rate-limit-remaining")}]')

# NOTE(review): the home timeline needs user-context authentication, while the client
# above is built with an app-only bearer token, so this request is expected to be
# rejected - confirm. A failure is logged instead of aborting the cell.
try:
    logger.info(f'home_timeline: {py_tweets_premium.get_home_timeline()}')
except Exception as err:
    logger.warning(f'home_timeline: request failed: {err}')

## Save Full DataFrame of search results to csv

In [None]:
# Write the combined search results and their text metadata as CSV files (without the
# index column) into the output directory of the current search iteration.
outputPath = f'{dataDir}/{nfl_type}/{search_on}/v{search_iteration}'
for _frame, _csv_name in (
    (search_range_results_df, 'search_result_tweet_text_data.csv'),
    (search_tweets_text_meta_df, 'search_result_tweet_text_meta.csv'),
):
    _frame.to_csv(f'{outputPath}/{_csv_name}', index=False)