Author: Ryan Timbrook (RTIMBROO)  
DATE: 12/3/2019 <br>
Topic: 

## 1. Objective:
-----------------------------------------------------------------------------------------------------

In [None]:
import pandas as pd
import numpy as nb
import json
import os
from os import path
import fnmatch
import io
import re
import string
from datetime import date
from datetime import time
from datetime import datetime

In [None]:
# custom python packages
import rtimbroo_utils as br  

In [None]:
# set global properties
notebook_file_name = 'format_raw_twitter_data'
report_file_name = 'format_raw_twitter_data'
app_name = 'format_raw_twitter_data'
log_level = 10 # 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL

# setup working directory structure (all paths are relative to the
# notebook's current working directory)
dataDir = './data'
outputDir = './output'
configDir = './config'
logOutDir = './logs'
imageDir = './images'
modelDir = './models'
corpusDir = './corpus'

# create base output directories if they don't exist
# makedirs(exist_ok=True) avoids the check-then-create race of
# `if not exists: mkdir` and also creates missing parent directories.
for _base_dir in (outputDir, logOutDir, imageDir, modelDir,
                  dataDir, configDir, corpusDir):
    os.makedirs(_base_dir, exist_ok=True)

In [None]:
# File-backed logger used for troubleshooting and data exploration.
# The log directory is passed with a trailing slash, followed by the
# application name; verbosity comes from the global log_level.
logger = br.getFileLogger(f'{logOutDir}/', app_name, level=log_level)

In [None]:
# set what to search on
nfl_type = 'player'
search_on = 'deshaun_watson'
# setup base twitter search query
search_terms = "deshaun watson"

# Location of the raw tweet files, relative to dataDir. Derived from
# nfl_type / search_on (instead of repeating the literal path) so the
# input location always matches the output path built from the same names.
rawDataDir = f'{nfl_type}/{search_on}'
tw_path = f'{dataDir}/{rawDataDir}'

In [None]:
# collect the raw tweet files (json, one tweet per line) to be parsed
def get_raw_tweets_by_version(path, version):
    """Return the paths of the raw tweet files stored for one data version.

    Looks one level below ``{path}/v{version}``: every immediate
    sub-directory is listed and each regular file whose name contains
    ``'raw'`` is collected.

    Args:
        path: Base directory that holds the ``v<version>`` folders.
        version: Version identifier appended as ``v{version}``.

    Returns:
        A list of ``{path}/v{version}/<sub-directory>/<file>`` strings,
        sorted by sub-directory and file name so repeated runs process the
        files in the same order. The list is empty when the version
        directory is missing or cannot be listed.
    """
    version_dir = f'{path}/v{version}'
    raw_files = []

    try:
        entries = sorted(os.listdir(version_dir))
    except OSError as err:
        # A missing/unreadable version directory yields no files rather
        # than an exception.
        logger.warning(f'unable to list {version_dir}: {err}')
        entries = []

    logger.debug(f'{version_dir}')
    logger.debug(f'{entries}')

    for entry in entries:
        sub_dir = f'{version_dir}/{entry}'
        if not os.path.isdir(sub_dir):
            continue
        for file_name in sorted(os.listdir(sub_dir)):
            logger.debug(f'{sub_dir}: files: {file_name}')
            candidate = f'{sub_dir}/{file_name}'
            # Only regular files can be opened and parsed later on; a
            # directory that happens to contain 'raw' in its name is skipped.
            if 'raw' in file_name and os.path.isfile(candidate):
                raw_files.append(candidate)

    logger.info(f'_files: {raw_files}')
    logger.info(f'version:{version}, _files collected: {len(raw_files)}')

    return raw_files

In [None]:
# Raw tweet files of data version 1 found below tw_path.
tweetFiles = get_raw_tweets_by_version(tw_path, version=1)

In [None]:
def _extract_tweet_text(result):
    """Return the text to process for one raw tweet dictionary.

    Preference order: the retweeted status' extended ``full_text``, then the
    top-level ``text``, then the top-level ``full_text`` (only consulted when
    ``text`` is absent). Raises ``KeyError`` when none of them is present.
    """
    try:
        return result['retweeted_status']["extended_tweet"]['full_text']
    except (KeyError, TypeError) as err:
        logger.warning(f'page_search: NO full_text: {err}')

    if 'text' in result:
        return result['text']
    # if tweet_mode='extended', the payload carries 'full_text' instead of 'text'
    return result['full_text']


def process_raw_tweet(result):
    """Convert one raw tweet dictionary into two single-row DataFrames.

    Args:
        result: Tweet dictionary (as loaded from the raw json). The keys read
            are ``id_str``, ``created_at``, ``user.screen_name``,
            ``user.favourites_count`` plus the text fields used by
            ``_extract_tweet_text``.

    Returns:
        A tuple ``(tweets_df, tweets_text_metadata_df)``:

        * ``tweets_df`` columns: id, created_at, date, time, user, text,
          favorite_count, year, month, day_of_month, day_of_week.
        * ``tweets_text_metadata_df`` columns: id, date, user, urls,
          hash_tags, at_tags.

        Each frame holds one row on success. When the tweet cannot be
        processed a warning is logged and both frames are empty (columns
        only), so a bad tweet never produces ragged columns.
    """
    tweets_dict = {'id':[],'created_at':[],'date':[],'time':[],'user':[],'text':[],'favorite_count':[], 'year':[], 'month':[], 'day_of_month':[], 'day_of_week':[]}
    tweets_text_metadata_dict = {'id':[],'date':[],'user':[],'urls':[],'hash_tags':[],'at_tags':[]}

    try:
        try:
            logger.debug(f'{result["id_str"]} | {result["user"]["screen_name"]} | {result["created_at"]} | {result["text"]} | {result["user"]["favourites_count"]}')
        except Exception as err:
            logger.warning(f'page_search: ***WARNING***: Caught BaseException writing debug log file: {err}')

        text = _extract_tweet_text(result)

        tweet_id = result["id_str"]
        created_at = result["created_at"]
        screen_name = result["user"]["screen_name"]
        # NOTE(review): this is the *user's* favourites_count stored under the
        # 'favorite_count' column - confirm whether the tweet-level
        # favorite_count was intended.
        favourites = result["user"]["favourites_count"]

        # call function to parse string date; the result is indexed as
        # 0=date, 1=time, 2=year, 3=month, 4=day_of_month, 5=day_of_week
        date_time = br.convert_str_date(created_at)

        # call function to parse text for metadata; the result is indexed as
        # 0=text stored in the 'text' column, 1=urls, 2=hash_tags, 3=at_tags
        clean_text = br.clean_tweet_text_meta(logger, text)

        # Build both rows completely before touching the column lists, so a
        # failure anywhere above leaves every column empty (equal lengths).
        tweet_row = {
            'id': tweet_id,
            'created_at': created_at,
            'date': date_time[0],
            'time': date_time[1],
            'user': screen_name,
            'text': clean_text[0],
            'favorite_count': favourites,
            # timeseries attributes for granular reporting and visualizations
            'year': date_time[2],
            'month': date_time[3],
            'day_of_month': date_time[4],
            'day_of_week': date_time[5],
        }
        meta_row = {
            'id': tweet_id,
            'date': date_time[0],
            'user': screen_name,
            'urls': clean_text[1],
            'hash_tags': clean_text[2],
            'at_tags': clean_text[3],
        }
    except Exception as err:
        # Exception (not BaseException) so KeyboardInterrupt / SystemExit
        # still stop the run.
        logger.warning(f'**WARNING** Caught BaseException: {err}')
    else:
        for column, value in tweet_row.items():
            tweets_dict[column].append(value)
        for column, value in meta_row.items():
            tweets_text_metadata_dict[column].append(value)

    logger.info('process_raw_tweet: completed processing...')

    return pd.DataFrame.from_dict(tweets_dict), pd.DataFrame.from_dict(tweets_text_metadata_dict)

In [None]:
# Parse every raw tweet file (one json document per line) into the two
# result tables. The per-tweet frames are collected in lists and
# concatenated once after the loop: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole table on every tweet.
search_range_results_df = pd.DataFrame()
search_tweets_text_meta_df = pd.DataFrame()
tweet_cnt = 0
result_df = None
tweet_frames = []
tweet_meta_frames = []

for tweet_file in tweetFiles:
    with io.open(tweet_file, 'r', encoding='utf8') as f:
        # iterate the file lazily instead of loading all lines at once
        for raw_line in f:
            if not raw_line.strip():
                # blank lines carry no tweet
                continue

            tweet_cnt += 1
            logger.info(f'reading tweet: {tweet_cnt}')
            logger.debug(f'tweet {tweet_cnt}:\n {raw_line}')

            # convert string dictionary to dictionary
            try:
                tweet = json.loads(raw_line)
            except json.JSONDecodeError as err:
                # one corrupt line should not abort the whole run
                logger.warning(f'skipping tweet {tweet_cnt} in {tweet_file}: invalid json: {err}')
                continue

            result_df = process_raw_tweet(tweet)

            # complete table of search results collected and written out to csv file in code block below
            tweet_frames.append(result_df[0])
            tweet_meta_frames.append(result_df[1])

# pd.concat raises on an empty list, so keep the empty frames when no
# tweet was read
if tweet_frames:
    search_range_results_df = pd.concat(tweet_frames, ignore_index=True)
    search_tweets_text_meta_df = pd.concat(tweet_meta_frames, ignore_index=True)

logger.info(f'search_range_results_df shape: {search_range_results_df.shape} | head:\n{search_range_results_df.head()}')
logger.debug(f'{search_tweets_text_meta_df.head()}')

## SAVE DATA FRAME of TWEET TIMESERIES TEXT

In [None]:
# Write both result tables as csv files (without the index column) into the
# v1 folder of the searched entity.
outputPath = '/'.join([dataDir, nfl_type, search_on, 'v1'])
for _frame, _csv_name in (
    (search_range_results_df, 'search_result_tweet_text_data.csv'),
    (search_tweets_text_meta_df, 'search_result_tweet_text_meta.csv'),
):
    _frame.to_csv(f'{outputPath}/{_csv_name}', index=False)