Author: Ryan Timbrook (RTIMBROO)  
DATE: 12/3/2019 <br>
Topic: 

## 1. Objective:
-----------------------------------------------------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np
import json
import os
from os import path
import fnmatch
import io
import re
import string
from datetime import date
from datetime import time
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt #2D plotting
%matplotlib inline
#
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# custome python packages
import rtimbroo_utils as br  

In [3]:
# set global properties
notebook_file_name = 'nfl_sentiment_analysis_data_visualizations'
report_file_name = 'nfl_sentiment_analysis_data_visualizations'
app_name = 'nfl_sentiment_analysis_data_visualizations'
log_level = 10 # 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL

# setup working directory structure
# set global properties
dataDir = './data/train'
logOutDir = './logs'
imageDir = './images'
outputDir = './output'

# create base output directories if they don't exist
if not os.path.exists(logOutDir): os.mkdir(logOutDir)
if not os.path.exists(imageDir): os.mkdir(imageDir)
if not os.path.exists(dataDir): os.mkdir(dataDir)
if not os.path.exists(outputDir): os.mkdir(outputDir)

In [4]:
# get a logger for troubleshooting / data exploration
logger = br.getFileLogger(logOutDir+'/',app_name,level=log_level)

In [5]:
#train_nfl_master = pd.read_csv(f'{dataDir}/train_nfl_master.csv', encoding='latin')
sentiment_labeled_train_nfl_clean = pd.read_csv(f'{dataDir}/sentiment_labeled_train_nfl_clean.csv', encoding='latin')
nfl_tweets_master = pd.read_csv(f'{dataDir}/nfl_tweets_master.csv', encoding='latin')

In [6]:
nfl_tweets_master = nfl_tweets_master.sort_values(by=['id'], ascending=False).reset_index(drop=True)


logger.info(f'{nfl_tweets_master.shape}')
nfl_tweets_master.head()

(10148, 13)


Unnamed: 0,id,created_at,date,time,user,text,favorite_count,year,month,day_of_month,day_of_week,nfl_type,nfl_schedule_wk
0,1200925546487504897,Sat Nov 30 23:52:17 +0000 2019,2019-11-30,23:52:17,miamidolphin12,RT SEVENTEEN!,1398.0,2019,11,30,5,team,13
1,1200925257407827968,Sat Nov 30 23:51:08 +0000 2019,2019-11-30,23:51:08,JT_Evans97,You mean the same fan experts who could see La...,28503.0,2019,11,30,5,player,13
2,1200924548700495872,Sat Nov 30 23:48:19 +0000 2019,2019-11-30,23:48:19,MigiziLaFern,Deshaun Watson or Sam Darnold? WhoShouldIStart,161.0,2019,11,30,5,player,13
3,1200924401002303498,Sat Nov 30 23:47:43 +0000 2019,2019-11-30,23:47:43,DelindaTierney,"Since 2012, Tom Brady has won 7 straight games...",3148.0,2019,11,30,5,team,13
4,1200922963375280129,Sat Nov 30 23:42:01 +0000 2019,2019-11-30,23:42:01,TifdanyBrooks,RT Top 5 current NFL quarterbacks 1. Tom Bra...,10198.0,2019,11,30,5,player,13


In [7]:
sentiment_labeled_train_nfl_clean = sentiment_labeled_train_nfl_clean.sort_values(by='id', ascending=False).reset_index(drop=True)

logger.info(f'{sentiment_labeled_train_nfl_clean.shape}')
sentiment_labeled_train_nfl_clean.head()

(10148, 4)


Unnamed: 0,id,text,scores,sentiment
0,1200925546487504897,seventeen,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral
1,1200925257407827968,mean fan experts could see lamar jackson desha...,"{'neg': 0.0, 'neu': 0.584, 'pos': 0.416, 'comp...",positive
2,1200924548700495872,deshaun watson sam darnold whoshouldistart,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral
3,1200924401002303498,since tom brady straight games houston texans ...,"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'comp...",positive
4,1200922963375280129,top current nfl quarterbacks tom brady lamar j...,"{'neg': 0.0, 'neu': 0.878, 'pos': 0.122, 'comp...",positive


In [13]:
sentiment_labeled_train_nfl_clean = sentiment_labeled_train_nfl_clean.rename(columns={'text':'text_clean', 'id':'_id'})

In [14]:
logger.info(f'{sentiment_labeled_train_nfl_clean.shape}')
sentiment_labeled_train_nfl_clean.head()

(10148, 4)


Unnamed: 0,_id,text_clean,scores,sentiment
0,1200925546487504897,seventeen,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral
1,1200925257407827968,mean fan experts could see lamar jackson desha...,"{'neg': 0.0, 'neu': 0.584, 'pos': 0.416, 'comp...",positive
2,1200924548700495872,deshaun watson sam darnold whoshouldistart,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral
3,1200924401002303498,since tom brady straight games houston texans ...,"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'comp...",positive
4,1200922963375280129,top current nfl quarterbacks tom brady lamar j...,"{'neg': 0.0, 'neu': 0.878, 'pos': 0.122, 'comp...",positive


In [23]:
sentiment_labeled_train_nfl_clean[sentiment_labeled_train_nfl_clean.duplicated(['_id'],keep='last')]

dup_sentiment_labeled_train_nfl_clean = sentiment_labeled_train_nfl_clean[sentiment_labeled_train_nfl_clean.duplicated(['_id'],keep='last')]
logger.debug(f'dup_sentiment_labeled_train_nfl_clean: \n{dup_sentiment_labeled_train_nfl_clean}')

non_dup_sent_labeled = sentiment_labeled_train_nfl_clean.drop_duplicates(subset='_id', keep='first', inplace=False)

non_dup_sent_labeled[non_dup_sent_labeled.duplicated(['_id'])]

--- Logging error ---
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\logging\__init__.py", line 1037, in emit
    stream.write(msg + self.terminator)
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\x80' in position 3784: character maps to <undefined>
Call stack:
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\rt310\AppData\Roaming\Python\Python37

Unnamed: 0,_id,text_clean,scores,sentiment


In [24]:
nfl_tweets_master = nfl_tweets_master.rename(columns={'id':'_id'})
nfl_tweets_master[nfl_tweets_master.duplicated(['_id'],keep='last')]

dup_nfl_tweets_master = nfl_tweets_master[nfl_tweets_master.duplicated(['_id'],keep='last')]
logger.debug(f'dup_nfl_tweets_master: \n{dup_nfl_tweets_master}')

non_dup_nfl_tweets_master = nfl_tweets_master.drop_duplicates(subset='_id', keep='first', inplace=False)

non_dup_nfl_tweets_master[nfl_tweets_master.duplicated(['_id'])]


--- Logging error ---
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\logging\__init__.py", line 1037, in emit
    stream.write(msg + self.terminator)
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 7747-7748: character maps to <undefined>
Call stack:
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\rt310\AppData\Roaming\Python\Python37\

Unnamed: 0,_id,created_at,date,time,user,text,favorite_count,year,month,day_of_month,day_of_week,nfl_type,nfl_schedule_wk


In [25]:
logger.info(f'non_dup_sent_labeled.shape: {non_dup_sent_labeled.shape}')
logger.info(f'non_dup_nfl_tweets_master.shape: {non_dup_nfl_tweets_master.shape}')

non_dup_sent_labeled.shape: (9928, 4)
non_dup_nfl_tweets_master.shape: (9928, 13)


### Merge Data Sets

In [27]:
nfl_master_sent_merged = pd.DataFrame()

nfl_master_sent_merged = pd.merge(non_dup_nfl_tweets_master, non_dup_sent_labeled, on='_id', how='inner', sort=False,validate='one_to_one')

logger.info(f'{nfl_master_sent_merged.shape}')
nfl_master_sent_merged.head()

(9928, 16)


Unnamed: 0,_id,created_at,date,time,user,text,favorite_count,year,month,day_of_month,day_of_week,nfl_type,nfl_schedule_wk,text_clean,scores,sentiment
0,1200925546487504897,Sat Nov 30 23:52:17 +0000 2019,2019-11-30,23:52:17,miamidolphin12,RT SEVENTEEN!,1398.0,2019,11,30,5,team,13,seventeen,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral
1,1200925257407827968,Sat Nov 30 23:51:08 +0000 2019,2019-11-30,23:51:08,JT_Evans97,You mean the same fan experts who could see La...,28503.0,2019,11,30,5,player,13,mean fan experts could see lamar jackson desha...,"{'neg': 0.0, 'neu': 0.584, 'pos': 0.416, 'comp...",positive
2,1200924548700495872,Sat Nov 30 23:48:19 +0000 2019,2019-11-30,23:48:19,MigiziLaFern,Deshaun Watson or Sam Darnold? WhoShouldIStart,161.0,2019,11,30,5,player,13,deshaun watson sam darnold whoshouldistart,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral
3,1200924401002303498,Sat Nov 30 23:47:43 +0000 2019,2019-11-30,23:47:43,DelindaTierney,"Since 2012, Tom Brady has won 7 straight games...",3148.0,2019,11,30,5,team,13,since tom brady straight games houston texans ...,"{'neg': 0.0, 'neu': 0.863, 'pos': 0.137, 'comp...",positive
4,1200922963375280129,Sat Nov 30 23:42:01 +0000 2019,2019-11-30,23:42:01,TifdanyBrooks,RT Top 5 current NFL quarterbacks 1. Tom Bra...,10198.0,2019,11,30,5,player,13,top current nfl quarterbacks tom brady lamar j...,"{'neg': 0.0, 'neu': 0.878, 'pos': 0.122, 'comp...",positive


In [28]:
nfl_master_sent_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9928 entries, 0 to 9927
Data columns (total 16 columns):
_id                9928 non-null int64
created_at         9928 non-null object
date               9928 non-null object
time               9928 non-null object
user               9928 non-null object
text               9870 non-null object
favorite_count     9928 non-null float64
year               9928 non-null int64
month              9928 non-null int64
day_of_month       9928 non-null int64
day_of_week        9928 non-null int64
nfl_type           9928 non-null object
nfl_schedule_wk    9928 non-null int64
text_clean         9853 non-null object
scores             9928 non-null object
sentiment          9928 non-null object
dtypes: float64(1), int64(6), object(9)
memory usage: 1.3+ MB


In [33]:
# write merged dataframe to file
nfl_master_sent_merged.to_csv(f'{dataDir}/nfl_master_sent_merged_timeseries.csv', index=False)

del nfl_master_sent_merged
del non_dup_nfl_tweets_master
del non_dup_sent_labeled
del dup_nfl_tweets_master
del sentiment_labeled_train_nfl_clean
del nfl_tweets_master