# Text Mine Twitter - NFL Search Patterns - TEAM


Ryan Timbrook (RTIMBROO)  
DATE:11/30/2019 <br>
Topic: Search Twitter for tweets on specific NFL Players, Coaches, and Teams


## 1. Objective
_____________________________________________________________________________________________
Capture popular opinion from people's tweets about selected NFL players, coaches, and teams.
Create a corpus of tweets for sentiment analysis.


______________________________________________________________________________________________
### Coding Environment Setup
Import packages

In [1]:
# import packages for analysis and modeling
import pandas as pd #data frame operations
import numpy as np #arrays and math functions
import requests
import os
import io
import pickle
import re
import sys
import string
from os import path
from datetime import date
from datetime import time
from datetime import datetime

In [2]:
# packages for twitter
import tweepy as tw
from tweepy import OAuthHandler
import json
from tweepy import Stream
from tweepy.streaming import StreamListener

# Twython packages for twitter
from twython import Twython

# packages for NLTK
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer

In [3]:
# custom python packages
import rtimbroo_utils as br             # custome python helper functions

In [4]:
# ---- notebook-wide properties ----
notebook_file_name = 'search_twitter_nfl_team'
report_file_name = 'search_twitter_nfl_team'
app_name = 'search_twitter_nfl_team'
log_level = 10 # 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL

# ---- working directory layout (paths are relative to the current working directory) ----
dataDir = './data'
outputDir = './output'
configDir = './config'
logOutDir = './logs'
imageDir = './images'
modelDir = './models'
corpusDir = './corpus'

# Create each base directory that does not exist yet
for _base_dir in (outputDir, logOutDir, imageDir, modelDir, dataDir, configDir, corpusDir):
    if not os.path.exists(_base_dir):
        os.mkdir(_base_dir)

In [5]:
# File logger (written under the logs directory) used for troubleshooting and data exploration
logger = br.getFileLogger(f'{logOutDir}/', app_name, level=log_level)

In [6]:
# Collection date: the YYYY-MM-DD part of the current UTC timestamp
now = datetime.utcnow().isoformat()
collection_date = now.split('T')[0]   # ISO format is '<date>T<time>'
collection_date

'2019-12-02'

## 2. OBTAIN the data   
________________________________________________________________________________________________
Import external datasets for evaluation

#### Twitter Search API Limits:
[Sandbox Package](https://developer.twitter.com/en/pricing/search-fullarchive)

* Time frame:	Full history
* Tweets per request:	100
* Counts vs. data:	Data only
* Query length:	128 characters
* Operator availability:	Standard
* Rate limit per minute:	30 requests/min
* Enrichments:	n/a
* Dev environments:	1	
* Monthly Tweet cap:	5k	
* Rate limit per second: 10 requests/sec

### Instantiate Twitter API Object
Using Twython 3.6 Twitter API Wrapper
[Twython 3.6.0 reference documentation](https://twython.readthedocs.io/en/latest/api.html)

In [7]:
# Read the Twitter API credentials (JSON) from the config directory
with open(f'{configDir}/twitter_credentials.json', 'r') as cred_file:
    tw_cred = json.load(cred_file)

# Extra client arguments handed to Twython: a custom User-Agent header and a timeout
client_args = {
    'headers': {'User-Agent': 'AI_Public_Sentiment_16860838'},
    'timeout': 300,
}

# Build the Twython client from the four credential values
py_tweets = Twython(
    tw_cred['CONSUMER_KEY'],
    tw_cred['CONSUMER_SECRET'],
    tw_cred['ACCESS_TOKEN'],
    tw_cred['ACCESS_SECRET'],
    client_args=client_args,
)

# Record the credential check and the home timeline in the debug log
logger.debug(f'{py_tweets.verify_credentials()}')
logger.debug(f'{py_tweets.get_home_timeline()}')

### Configure Search Terms

In [8]:
# What this run searches for (both values are used to build the output directory path)
nfl_type = 'team'
search_on = 'houston_texans'

# Base Twitter search query
search_terms = "houston texans"

# Base query plus search filters (retweets are excluded)
filtered_search_terms = f"{search_terms} -filter:retweets"

#search_start_date = '2019-11-23' # limits to the last 7 days

# Number of tweets to request
num_tweets = 100 # sandbox rate limit - 100 tweets per request

In [9]:
# Weekly search windows as (from_date, to_date) string pairs, Sunday to Sunday.
# All 17 weeks of the schedule are listed; commented-out weeks are simply not searched.
# NOTE(review): the dates are not zero-padded (e.g. '2019-10-6') while the query builder's
# comment documents YYYY-MM-DD — confirm the API accepts this form.
# NOTE(review): in the run captured in this notebook only the 2019-11-17..2019-11-24 window
# is shown returning results; the older windows logged zero results — verify the
# search endpoint's history limit before relying on older weeks.
search_date_ranges = [
    #('2019-9-1','2019-9-8'),      # week 1
    #('2019-9-8','2019-9-15'),     # week 2
    #('2019-9-15','2019-9-22'),    # week 3
    #('2019-9-22','2019-9-29'),    # week 4
    #('2019-9-29','2019-10-6'),    # week 5
    ('2019-10-6','2019-10-13'),   # week 6
    ('2019-10-13','2019-10-20'),  # week 7
    ('2019-10-20','2019-10-27'),  # week 8
    ('2019-10-27','2019-11-3'),   # week 9
    ('2019-11-3','2019-11-10'),   # week 10
    ('2019-11-10','2019-11-17'),  # week 11
    ('2019-11-17','2019-11-24'),  # week 12
    ('2019-11-24','2019-12-1'),   # week 13
    #('2019-12-1','2019-12-8'),    # week 14
    #('2019-12-8','2019-12-15'),   # week 15
    #('2019-12-15','2019-12-22'),  # week 16
    #('2019-12-22','2019-12-29'),  # week 17
                ]

In [10]:
def convert_str_date(str_date):
    """Break a tweet ``created_at`` string into date and time components.

    The input is split on single spaces and read positionally, e.g.
    ``'Sat Nov 23 21:24:19 +0000 2019'``: month abbreviation at index 1,
    day of month at index 2, time of day at index 3 and year as the last
    field.

    Returns a 6-tuple of strings:
    ``(date, time_of_day, year, month, day_of_month, day_of_week)`` where
    ``date`` is ``'<year>-<month>-<day>'`` without zero padding and
    ``day_of_week`` is the ``tm_wday`` index (Monday is 0).

    Raises ``IndexError`` when the string has too few fields and
    ``ValueError`` when the date fields cannot be parsed.
    """
    # Local import of the time *module*: the file's import cell binds the
    # bare name `time` to datetime.time.
    import time

    fields = str_date.split(' ')
    month, day_of_month, time_of_day = fields[1], fields[2], fields[3]
    year = fields[-1]

    parsed = time.strptime(f'{month} {day_of_month}, {year}', '%b %d, %Y')
    year_s = str(parsed.tm_year)
    month_s = str(parsed.tm_mon)
    day_s = str(parsed.tm_mday)

    return f'{year_s}-{month_s}-{day_s}', time_of_day, year_s, month_s, day_s, str(parsed.tm_wday)
    

In [None]:
#convert_str_date('Sat Nov 23 21:24:19 +0000 2019')

In [11]:
def config_query(search_term,since=None,until=None,count=100,lang='en',result_type='mixed'):
    """Assemble the keyword arguments for a Twitter search request.

    Parameters map one-to-one onto the returned dictionary:
      search_term -> 'q'            the search query text
      since       -> 'since'        from_date
      until       -> 'until'        date format YYYY-MM-DD; returns tweets
                                    created before the given date
      lang        -> 'lang'
      result_type -> 'result_type'  mixed, recent or popular
      count       -> 'count'        max is 100 (the API default is 15 per page)

    Returns: the query dictionary (also written to the debug log).
    """
    # Other search parameters that are currently not set:
    #   since_id - returns results with an ID more recent than the specified ID; if the limit
    #              of Tweets has occurred since the since_id, it is forced to the oldest ID available
    #   max_id   - returns results with an ID older than or equal to the specified ID
    search = dict(
        q=search_term,
        since=since,
        until=until,
        lang=lang,
        result_type=result_type,
        count=count,
    )

    logger.debug(f'config_query: search:\n{search}')
    return search

In [12]:
def clean_tweet_text_meta(text):
    """Clean tweet text of URLs, hashtag markers and @tags.

    The text is split on single spaces and every token is classified:
      * ``#tag`` (at least two characters after the ``#``): the tag with all
        punctuation removed is collected, and the token minus its leading
        ``#`` stays in the text because hashtags carry meaning.
      * ``http://`` / ``https://`` URL: collected and dropped from the text.
      * ``@tag``: collected as-is and dropped from the text.
      * anything else: kept unchanged for later cleaning steps.

    Only spaces separate tokens, so a hashtag, URL or @tag that directly
    follows a newline is not recognised and stays in the text.

    Parameters:
      text: the raw tweet text; ``None`` or an empty string is logged as a
            warning and yields an empty result.

    Returns: tuple ``(cleaned_text, urls, hash_tags, at_tags)`` where
      ``cleaned_text`` is the kept tokens joined by single spaces and the
      other three are lists of strings.
    """
    cleaned_text = []
    urls = []
    hash_tags = []
    at_tags = []

    re_hash = re.compile(r'^#..+')
    # Require the scheme separator: the previous pattern '^http*' matched any
    # token that merely starts with "htt".
    re_url = re.compile(r'^https?://')
    re_at = re.compile(r'^@.+')
    punc_transtable = str.maketrans('', '', string.punctuation)

    # Truthiness check covers both None and the empty string
    if text:
        tokens = text.split(' ')
        logger.info(f'tokens: {tokens}')

        # classify each token, collecting meta info and cleaning the text
        for tok in tokens:
            if re_hash.match(tok):
                hash_tags.append(tok.translate(punc_transtable))
                # remove the # character, keep the rest of the token
                cleaned_text.append(tok[1:])
            elif re_url.match(tok):
                urls.append(tok)
            elif re_at.match(tok):
                at_tags.append(tok)
            else:
                # keep text in original format for later cleaning techniques
                cleaned_text.append(tok)
    else:
        logger.warning(f'clean_tweet_text_meta: ***WARNING***: text is empty: {text}')

    logger.info(f'clean_tweet_text_meta: cleaned tokens: {cleaned_text}')
    # join cleaned text back together
    new_text = ' '.join(cleaned_text)

    return new_text, urls, hash_tags, at_tags
    

In [None]:
# test clean_tweet_text_meta
#test_tweet = "Attention, I LOVE WHAT I DO, Thats all Carry On!!! #bullsonparade. #Texans #1 in the Division Houston TEXANS @ Humb… https://t.co/qZOYnP424X"
#test_tweet2 = "@NYYfan2442 @_jliendro @OkcGhost @FieldYates Would be tough to beat KC but we could beat Houston. Not really impres… https://t.co/pK2avVllxc"
#cleaned = clean_tweet_text_meta(test_tweet2)   # not named `tw`: that would shadow the tweepy import alias

In [13]:
def _tweet_rows(result):
    """Build the summary row and the text-metadata row for one tweet dictionary."""
    date_time = convert_str_date(result["created_at"])   # datetime components
    clean_text = clean_tweet_text_meta(result["text"])   # text plus extracted metadata
    tweet_id = result["id_str"]
    screen_name = result["user"]["screen_name"]

    tweet_row = {
        'id': tweet_id,
        'created_at': result["created_at"],
        'date': date_time[0],
        'time': date_time[1],
        'user': screen_name,
        'text': clean_text[0],
        # NOTE(review): this is read from the *user* object ("favourites_count"),
        # not from the tweet itself — confirm which count this column should hold.
        'favorite_count': result["user"]["favourites_count"],
        # timeseries attributes for granular reporting and visualizations
        'year': date_time[2],
        'month': date_time[3],
        'day_of_month': date_time[4],
        'day_of_week': date_time[5],
    }
    meta_row = {
        'id': tweet_id,
        'date': date_time[0],
        'user': screen_name,
        'urls': clean_text[1],
        'hash_tags': clean_text[2],
        'at_tags': clean_text[3],
    }
    return tweet_row, meta_row


def page_search(twitter,query,text_file,raw_file):
    """Page through a Twitter search, archive every tweet and tabulate key fields.

    For every result the raw tweet is appended to ``raw_file`` as one JSON
    line and ``"<id> <text>"`` is appended to ``text_file``; then the parsed
    fields are added to the result tables. A tweet that cannot be parsed is
    logged and skipped as a whole, so the table columns always stay the same
    length. Errors are logged as warnings and do not stop the search, but
    KeyboardInterrupt / SystemExit are no longer swallowed, so a running
    search can be aborted.

    Parameters:
      twitter:   client exposing ``cursor`` and ``search`` (the notebook passes
                 its Twython instance).
      query:     dict of keyword arguments for the search call.
      text_file: path of the tweet-text file (opened in append mode, utf8).
      raw_file:  path of the raw-JSON file (opened in append mode, utf8).

    Returns: tuple of two DataFrames — (tweet table, tweet text metadata table).
    """
    results = twitter.cursor(twitter.search, **query, return_pages=True)

    tweets_dict = {'id':[],'created_at':[],'date':[],'time':[],'user':[],'text':[],'favorite_count':[], 'year':[], 'month':[], 'day_of_month':[], 'day_of_week':[]}
    tweets_text_metadata_dict = {'id':[],'date':[],'user':[],'urls':[],'hash_tags':[],'at_tags':[]}
    page_cnt = 0
    result_cnt = 0

    with io.open(str(text_file), 'a', encoding='utf8') as f, io.open(str(raw_file), 'a', encoding='utf8') as r:
        try:
            # page is a list of twitter results
            for i, page in enumerate(results):
                page_cnt += 1
                logger.info(f'Page: [{i}]')
                try:
                    for j, result in enumerate(page):
                        result_cnt += 1
                        logger.info(f'Result: [{j}]')
                        try:
                            try:
                                logger.debug(f'{result["id_str"]} | {result["user"]["screen_name"]} | {result["created_at"]} | {result["text"]} | {result["user"]["favourites_count"]}')
                            except Exception as be:
                                logger.warning(f'page_search: ***WARNING***: Caught BaseException writing debug log file: {be}')

                            # dump raw tweet to file as json, one tweet per line
                            r.write(json.dumps(result))
                            r.write('\n')

                            # dump tweet id and text to file
                            f.write(f'{result["id_str"]} {result["text"]}')
                            f.write('\n')

                            # Build both rows completely before touching the tables:
                            # a failure here must not leave columns of unequal length.
                            tweet_row, meta_row = _tweet_rows(result)
                            for column, value in tweet_row.items():
                                tweets_dict[column].append(value)
                            for column, value in meta_row.items():
                                tweets_text_metadata_dict[column].append(value)

                        except Exception as be:
                            logger.warning(f'**WARNING** Caught BaseException: {be}')

                except Exception as be:
                    logger.warning(f'**WARNING** Caught BaseException: {be}')
        except Exception as be:
            # errors raised while fetching the next page end the paging loop
            logger.warning(f'**WARNING** Caught BaseException: {be}')

    logger.info(f'page_search: processed page_cnt:[{page_cnt}] | total result_cnt: [{result_cnt}]')

    return pd.DataFrame.from_dict(tweets_dict), pd.DataFrame.from_dict(tweets_text_metadata_dict)
    

## Execute Twitter Search

[Twython 3.6.0 reference documentation](https://twython.readthedocs.io/en/latest/api.html)

In [14]:
# Version counter used in the output directory name (v<N>), so a new run does not
# overwrite potentially useful data that was already collected.
# Run this cell once, then comment it out: the search cell increments the counter
# before using it, so re-running this cell would reset the numbering.
search_iteration = 1

In [15]:
# Execute the Twitter search once per pre-configured date range.
# Each time this cell runs, the search iteration is incremented so the results
# land in a new v<N> output directory structure.
search_iteration += 1

# Per-range result frames are collected and combined once at the end
# (DataFrame.append copied the whole table per call and is gone from pandas 2.x).
_range_result_frames = []
_range_meta_frames = []

try:
    for dates in search_date_ranges:
        search_range = f'{dates[0]}_{dates[1]}'
        logger.info(f'search_range: {search_range}')

        # output location for this date range
        outputPath = f'{dataDir}/{nfl_type}/{search_on}/v{search_iteration}/{search_range}'
        os.makedirs(outputPath, exist_ok=True)

        tweet_filename = f'{outputPath}/tweet_text.txt'
        raw_filename = f'{outputPath}/tweet_raw.txt'

        # make sure both output files exist (append mode never truncates)
        for _filename in (tweet_filename, raw_filename):
            with open(_filename, 'a'):
                pass

        # configure query by dates
        # NOTE(review): count is hard-coded to 2 while num_tweets (100) is defined
        # but unused — confirm whether this is a leftover from testing.
        query = config_query(filtered_search_terms, since=dates[0], until=dates[1], count=2)

        #------- EXECUTES TWITTER SEARCH -------------------#
        result_df = page_search(py_tweets, query, tweet_filename, raw_filename)

        logger.debug(f'search result df: \n{result_df}')

        # page_search returns (tweet table, tweet text metadata table)
        _range_result_frames.append(result_df[0])
        _range_meta_frames.append(result_df[1])
finally:
    # Combine whatever was collected, even if the loop was interrupted, so the
    # partial results stay available to the cells below.
    search_range_results_df = (
        pd.concat(_range_result_frames, ignore_index=True, sort=False)
        if _range_result_frames else pd.DataFrame()
    )
    search_tweets_text_meta_df = (
        pd.concat(_range_meta_frames, ignore_index=True, sort=False)
        if _range_meta_frames else pd.DataFrame()
    )

search_range: 2019-10-6_2019-10-13
Page: [0]
page_search: processed page_cnt:[1] | total result_cnt: [0]
search_range: 2019-10-13_2019-10-20
Page: [0]
page_search: processed page_cnt:[1] | total result_cnt: [0]
search_range: 2019-10-20_2019-10-27
Page: [0]
page_search: processed page_cnt:[1] | total result_cnt: [0]
search_range: 2019-10-27_2019-11-3
Page: [0]
page_search: processed page_cnt:[1] | total result_cnt: [0]
search_range: 2019-11-3_2019-11-10
Page: [0]
page_search: processed page_cnt:[1] | total result_cnt: [0]
search_range: 2019-11-10_2019-11-17
Page: [0]
page_search: processed page_cnt:[1] | total result_cnt: [0]
search_range: 2019-11-17_2019-11-24
Page: [0]
Result: [0]
tokens: ['The', '#Texans', 'have', 'signed', 'free', 'agent', 'DE', 'Joel', 'Heath', 'and', 'WR', 'Steven', 'Mitchell', 'Jr.', 'to', 'the', 'active', 'roster.', '\n\nHouston', 'placed…', 'https://t.co/lnCpdgpIjZ']
clean_tweet_text_meta: cleaned tokens: ['The', 'Texans', 'have', 'signed', 'free', 'agent', 'DE

--- Logging error ---
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\logging\__init__.py", line 1037, in emit
    stream.write(msg + self.terminator)
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 264-265: character maps to <undefined>
Call stack:
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\si

clean_tweet_text_meta: cleaned tokens: ['The', 'Texans', 'have', 'signed', 'NT', 'Eddie', 'Vanderdoes', 'to', 'the', 'active', 'roster', 'from', 'the', 'practice', 'squad.', 'He', 'is', 'No.', '95.', '\n\nHouston', 'p…']
Result: [1]
tokens: ['"It’s', 'an', 'excellent', 'football', 'team', 'and', 'that’s', 'why', 'they’re', '10-1,', 'and', 'it’s', 'a', 'big', 'challenge', 'for', 'us."\n\nWhat', 'the', 'Texans', 'ar…', 'https://t.co/9N1QubCyhi']
clean_tweet_text_meta: cleaned tokens: ['"It’s', 'an', 'excellent', 'football', 'team', 'and', 'that’s', 'why', 'they’re', '10-1,', 'and', 'it’s', 'a', 'big', 'challenge', 'for', 'us."\n\nWhat', 'the', 'Texans', 'ar…']
Page: [1]
Result: [0]
tokens: ['I’ve', 'spent', 'years', 'working', 'in', 'the', '#Houston', 'community,', 'so', 'I’m', 'aware', 'how', 'low', 'income', 'families', 'suffer.', 'I', 'will', 'strengthen…', 'https://t.co/noypWlSqWm']
clean_tweet_text_meta: cleaned tokens: ['I’ve', 'spent', 'years', 'working', 'in', 'the', 'Houston', '

--- Logging error ---
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\logging\__init__.py", line 1037, in emit
    stream.write(msg + self.terminator)
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f4ec' in position 74: character maps to <undefined>
Call stack:
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\rt310\AppData\Roaming\Python\Pyth

In [16]:
# Preview the first rows of the combined tweet table
search_range_results_df.head()

Unnamed: 0,id,created_at,date,time,user,text,favorite_count,year,month,day_of_month,day_of_week
0,1198351595840389121,Sat Nov 23 21:24:19 +0000 2019,2019-11-23,21:24:19,TexansPR,The Texans have signed free agent DE Joel Heat...,63.0,2019,11,23,5
1,1198011151688372224,Fri Nov 22 22:51:31 +0000 2019,2019-11-22,22:51:31,AWalkerColts,Jonathan Williams’ 13-yard touchdown run in th...,9155.0,2019,11,22,4
2,1198009485375225858,Fri Nov 22 22:44:53 +0000 2019,2019-11-22,22:44:53,waemory,"Texans are like Sam Houston, not bought for we...",925.0,2019,11,22,4
3,1198008143642365952,Fri Nov 22 22:39:34 +0000 2019,2019-11-22,22:39:34,Felonies4Less,Do you need to speak with Houston Texas Crimin...,8227.0,2019,11,22,4
4,1198008131822981120,Fri Nov 22 22:39:31 +0000 2019,2019-11-22,22:39:31,Colts_TT,"Colts lose to Texans: The good, the bad and th...",0.0,2019,11,22,4


In [17]:
# Preview the first rows of the combined tweet text metadata table (urls, hash tags, @tags)
search_tweets_text_meta_df.head()

Unnamed: 0,id,date,user,urls,hash_tags,at_tags
0,1198351595840389121,2019-11-23,TexansPR,[https://t.co/lnCpdgpIjZ],[Texans],[]
1,1198011151688372224,2019-11-22,AWalkerColts,[https://t.co/YhxDyyheGC],[],[]
2,1198009485375225858,2019-11-22,waemory,[],[],[@TeamCornyn]
3,1198008143642365952,2019-11-22,Felonies4Less,"[https://t.co/0ac4DjePfY, https://t.co/qRzz2X7...","[Houston, Texas, Lawyer, Drug, Attorney, DWI]",[]
4,1198008131822981120,2019-11-22,Colts_TT,[https://t.co/mVOTcUngWm],[GoColts],[]


In [18]:
# Order the collected tweets by month, day of month and day of week.
# These columns hold strings (convert_str_date returns str values), so the ordering
# is computed on integer casts; sorting the raw strings puts '3' after '22'.
_sort_keys = ['month', 'day_of_month', 'day_of_week']
_numeric_order = (
    search_range_results_df[_sort_keys]
    .astype(int)
    .sort_values(by=_sort_keys, ascending=True)
    .index
)
sorted_search_df = search_range_results_df.loc[_numeric_order]
sorted_search_df.head(20)

Unnamed: 0,id,created_at,date,time,user,text,favorite_count,year,month,day_of_month,day_of_week
1,1198011151688372224,Fri Nov 22 22:51:31 +0000 2019,2019-11-22,22:51:31,AWalkerColts,Jonathan Williams’ 13-yard touchdown run in th...,9155.0,2019,11,22,4
2,1198009485375225858,Fri Nov 22 22:44:53 +0000 2019,2019-11-22,22:44:53,waemory,"Texans are like Sam Houston, not bought for we...",925.0,2019,11,22,4
3,1198008143642365952,Fri Nov 22 22:39:34 +0000 2019,2019-11-22,22:39:34,Felonies4Less,Do you need to speak with Houston Texas Crimin...,8227.0,2019,11,22,4
4,1198008131822981120,Fri Nov 22 22:39:31 +0000 2019,2019-11-22,22:39:31,Colts_TT,"Colts lose to Texans: The good, the bad and th...",0.0,2019,11,22,4
5,1198006766241505280,Fri Nov 22 22:34:05 +0000 2019,2019-11-22,22:34:05,TexansStuff,NFL Houston Texans Stance Socks Size L - New -,6.0,2019,11,22,4
6,1198005780819169281,Fri Nov 22 22:30:10 +0000 2019,2019-11-22,22:30:10,OTHeroics1,Houston Texans Take Control of the AFC South w...,10702.0,2019,11,22,4
7,1198003339352530945,Fri Nov 22 22:20:28 +0000 2019,2019-11-22,22:20:28,avpnews_live,Houston Texans’ DeAndre Hopkins celebrates wit...,1.0,2019,11,22,4
8,1198001802584764421,Fri Nov 22 22:14:22 +0000 2019,2019-11-22,22:14:22,ChronicleTexans,New Texans from Houston Chronicle — McClain: T...,1.0,2019,11,22,4
9,1198000912985407488,Fri Nov 22 22:10:50 +0000 2019,2019-11-22,22:10:50,Texans_TT,3 players the Houston Texans can sign after Dy...,945.0,2019,11,22,4
10,1198000822698889217,Fri Nov 22 22:10:28 +0000 2019,2019-11-22,22:10:28,ThatWhiteboy713,I'm sure your more of a texans hater being fro...,12212.0,2019,11,22,4


In [19]:
# Log the shape of the combined tweet table
logger.info(f'search_range_results_df shape: {search_range_results_df.shape}')

search_range_results_df shape: (29, 11)


In [20]:
# Log the "x-rate-limit-remaining" header reported by the client, then the home timeline.
# NOTE(review): get_home_timeline() presumably issues another API request just for this
# log line — confirm it is worth the quota.
logger.info(f'x-rate-limit-remaining: [{py_tweets.get_lastfunction_header("x-rate-limit-remaining")}]')
logger.info(f'home_timeline: {py_tweets.get_home_timeline()}')          

x-rate-limit-remaining: [157]
home_timeline: []


## Save Full DataFrame of search results to csv

In [21]:
# Write both combined tables as CSV files (without the index column) into this
# run's versioned output directory
outputPath = f'{dataDir}/{nfl_type}/{search_on}/v{search_iteration}'
for _frame, _csv_name in (
    (search_range_results_df, 'search_result_tweet_text_data.csv'),
    (search_tweets_text_meta_df, 'search_result_tweet_text_meta.csv'),
):
    _frame.to_csv(f'{outputPath}/{_csv_name}', index=False)