# Matchday Thread Analyzer

In [1]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models.annotations import Label
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.models.glyphs import VBar
from bokeh.models.formatters import DatetimeTickFormatter

from datetime import datetime
import nltk
import praw
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction import text
import string

In [2]:
# Create the Reddit client. The first argument is passed positionally to
# praw.Reddit (presumably the praw.ini site name holding the credentials -
# confirm against the local PRAW configuration).
reddit = praw.Reddit('match-day-bot', user_agent='match-day-bot user agent')

# Thread under analysis: r/coys match thread, Spurs v Leicester, 13 May 2018.
match_thread_url = (
    'https://www.reddit.com/r/coys/comments/8j3tx0/'
    'match_thread_spurs_v_leicester_pl_13_may_2018/')
coys_matchday_thread = reddit.submission(url=match_thread_url)

# Expand the comment forest before flattening it; limit=None places no cap on
# how many placeholders get replaced, so this can take a while on big threads.
coys_matchday_thread.comments.replace_more(limit=None)

# Flat list holding every comment object of the thread.
matchday_comment_instances = list(coys_matchday_thread.comments.list())

### Collect match thread comments and comment metadata

In [3]:
# Pull the fields of interest out of every fetched comment object.
author = [comment.author for comment in matchday_comment_instances]
body = [comment.body for comment in matchday_comment_instances]
karma = [comment.score for comment in matchday_comment_instances]
# created_utc is interpreted as a POSIX timestamp and turned into a naive
# datetime expressed in UTC.
utc = [datetime.utcfromtimestamp(comment.created_utc)
       for comment in matchday_comment_instances]

match_thread_data = {'username': author, 'comment': body, 'karma': karma, 'utc': utc}

# Passing `columns=` fixes the column order at construction time, so no
# second re-selection of the columns is needed.
df_match_thread = pd.DataFrame(
    data=match_thread_data, columns=['username', 'comment', 'karma', 'utc'])

# The values are already datetimes, so no `unit=` is given: a unit only
# applies to numeric input, and the previous unit='h' would have been wrong
# anyway because the source timestamps are in seconds.
df_match_thread['utc'] = pd.to_datetime(df_match_thread['utc'])


def col_strip_seconds(row):
    """Return the datetime `row` with its seconds field set to zero.

    Kept so that any code still calling it keeps working; the column below is
    truncated with the vectorised `.dt.floor('min')` instead.
    """
    return row.replace(second=0)


# Bucket every comment into the minute it was posted in. Flooring is
# vectorised and also clears any sub-second component, which keeps the
# '%Y-%m-%d %H:%M:%S' string form used later in the notebook parseable.
df_match_thread['utc'] = df_match_thread['utc'].dt.floor('min')

df_match_thread.head()

Unnamed: 0,username,comment,karma,utc
0,akanefive,NBC announcer giving incorrect information abo...,67,2018-05-13 14:31:00
1,charcoil23,Dumb fuck announcers don't know that 4th place...,56,2018-05-13 14:31:00
2,a_magic_wizard,"""Toby is our best defender and the core of our...",45,2018-05-13 14:27:00
3,Keskekun,Sub off lamela and sissoko and bring on Keane ...,46,2018-05-13 14:41:00
4,TheGameIsAboutGlory1,I fucking hate how goalkeepers are a protected...,47,2018-05-13 14:48:00


### Clean comment dataframe

In [4]:
# Disabled alternative input: read the comments from a CSV file instead of
# using the frame built in the previous cell.
# df_match_thread = pd.read_csv('tot_lei_match_thread_comments.csv')

# Drop every row whose comment body is the literal '[deleted]'.
is_not_deleted = df_match_thread['comment'] != '[deleted]'
df_match_thread = df_match_thread.loc[is_not_deleted, :]

# Keep only comments posted between 14:00 and 15:59 (both inclusive) on
# 13 May 2018, i.e. roughly from kick-off to the final whistle.
posted_after_start = df_match_thread['utc'] >= '2018-05-13 14:00:00'
posted_before_end = df_match_thread['utc'] <= '2018-05-13 15:59:00'
df_match_thread = df_match_thread.loc[posted_after_start & posted_before_end, :]

# Preview the most recent comments that survived the filters.
df_match_thread.sort_values(by=['utc'], ascending=False).head()

Unnamed: 0,username,comment,karma,utc
1826,RamonTico,"I agree, I'm not saying we should sell him or ...",1,2018-05-13 15:59:00
1447,AvJ164,Mahrez can dribble better,-2,2018-05-13 15:59:00
1411,Revalie,So close,1,2018-05-13 15:58:00
1685,DotEddie,I’m gonna save my banter till after they lose ...,3,2018-05-13 15:58:00
50,Barbzyy,Now fuck off Wembley,13,2018-05-13 15:58:00


In [5]:
# Number of top comments kept for every minute of the thread.
TOP_COMMENTS_PER_MINUTE = 5

# Order the thread chronologically (oldest minute first) and, inside each
# minute, by karma from highest to lowest. Chronological order matters: the
# per-minute counts produced by `groupby('utc')` are sorted ascending, and the
# plot pairs them with `times_list` / `comment_N_list` by position. Sorting
# newest-first here would mirror the activity curve along the time axis.
df_match_thread = df_match_thread.sort_values(
    by=['utc', 'karma'], ascending=[True, False])

# One 'YYYY-MM-DD HH:MM:SS' string per minute that has at least one comment.
times_list = df_match_thread['utc'].astype(str).unique().tolist()

# top_comment_slots[k] collects, minute by minute, the (k+1)-th ranked comment.
top_comment_slots = [[] for _ in range(TOP_COMMENTS_PER_MINUTE)]

# Grab the top five comments (with username and karma) for each minute.
# Loop names are chosen so they do not overwrite the `karma`, `body` or
# `text` names defined earlier in the notebook.
for minute in times_list:
    minute_rows = df_match_thread[df_match_thread['utc'] == minute].head(
        TOP_COMMENTS_PER_MINUTE)
    labels = [
        '@{0}: {1} [{2}]'.format(user, comment_text, score)
        if comment_text != '' else ''
        for user, comment_text, score in zip(
            minute_rows['username'].tolist(),
            minute_rows['comment'].tolist(),
            minute_rows['karma'].tolist())
    ]
    # Minutes with fewer than five comments are padded with blank entries so
    # every slot list ends up with exactly one entry per minute.
    labels.extend([''] * (TOP_COMMENTS_PER_MINUTE - len(labels)))
    for slot, label in zip(top_comment_slots, labels):
        slot.append(label)

# Expose the slots under the names the plotting cell expects.
(comment_1_list, comment_2_list, comment_3_list,
 comment_4_list, comment_5_list) = top_comment_slots

### Reshape data for time series analysis

In [6]:
# Number of comments posted in each minute (one row per minute, ordered by
# `utc` because groupby sorts its keys).
df_comments_per_minute = df_match_thread.groupby(
    ['utc']).size().reset_index(name='counts')

# Boolean mask marking, for every minute, the comment(s) whose karma equals
# that minute's maximum. The string alias 'max' is used instead of the Python
# builtin `max`, which newer pandas versions warn about when passed to
# transform.
idx = df_match_thread.groupby(['utc'])['karma'].transform(
    'max') == df_match_thread['karma']

df_top_comments = df_match_thread.loc[idx, :].sort_values(by=['utc'])

# Attach the top comment of each minute to that minute's comment count.
df_comments_per_minute = df_comments_per_minute.merge(
    df_top_comments, on='utc')

# Several comments can tie for the maximum karma within a minute; keep a
# single row per minute.
df_comments_per_minute = df_comments_per_minute.drop_duplicates('utc')

### Visualize Comment Data

In [23]:
# Send Bokeh output to the notebook.
output_notebook()

# Base figure that all glyphs below are added to: 1500 wide, no toolbar.
p = figure(
    title="Match Thread Activity Trends",
    toolbar_location=None,
    plot_width=1500,
)

# Times at which the Leicester goals are marked on the chart.
lei_goal_times = ['2018-05-13 14:04:00', '2018-05-13 14:16:00',
                  '2018-05-13 15:05:00', '2018-05-13 15:31:00']

# Goal Indicators
# One entry per Leicester goal: tooltip text, x position (time) and a fixed
# marker height of 60 on the y axis. The height list is derived from the
# number of goal times so the columns cannot drift out of sync.
goal_source_lei = ColumnDataSource(
    data=dict(
        score_description=["Vardy '04: TOT 0-1 LEI", "Mahrez '16: TOT 1-2 LEI",
                           "Iheanacho 47': TOT 1-3 LEI", "Vardy 73': TOT 4-4 LEI"],
        x=[datetime.strptime(goal_time, '%Y-%m-%d %H:%M:%S')
           for goal_time in lei_goal_times],
        y=[60] * len(lei_goal_times)
    )
)

leicester_gold = '#FBBA00'

# Thin, semi-transparent vertical bar from the x axis up to the goal marker.
goal_glyph_lei = VBar(x="x", top="y", bottom=0, width=0.001, fill_alpha=.25,
                      fill_color=leicester_gold, line_color=leicester_gold, line_alpha=.25)

goal_bar_lei = p.add_glyph(goal_source_lei, goal_glyph_lei)

# Circle at the top of each bar; this renderer is the hover target for the
# goal tooltips.
goal_circle_lei = p.circle('x', 'y', source=goal_source_lei, size=15,
                           fill_alpha=1, fill_color="white", line_color=leicester_gold, line_alpha=.5, legend="Leicester Goal")

# Times at which the Tottenham goals are marked on the chart.
tot_goal_times = [
    '2018-05-13 14:07:00',
    '2018-05-13 15:07:00',
    '2018-05-13 15:11:00',
    '2018-05-13 15:18:00',
    '2018-05-13 15:34:00',
]

tot_goal_descriptions = [
    "Kane '07: TOT 1-1 LEI",
    "Lamela 49': TOT 2-3 LEI",
    "Fuchs 53' (og): TOT 3-3 LEI",
    "Lamela 60': TOT 4-3 LEI",
    "Kane 76': TOT 5-4 LEI",
]

# One entry per Tottenham goal: tooltip text, x position (time) and a fixed
# marker height of 60 on the y axis.
goal_source_tot = ColumnDataSource(data=dict(
    score_description=tot_goal_descriptions,
    x=[datetime.strptime(goal_time, '%Y-%m-%d %H:%M:%S')
       for goal_time in tot_goal_times],
    y=[60] * 5,
))

spurs_blue = '#1C2542'

# Thin, semi-transparent vertical bar from the x axis up to the goal marker.
goal_glyph_tot = VBar(
    x="x", top="y", bottom=0, width=0.001, fill_alpha=.25,
    fill_color=spurs_blue, line_color=spurs_blue, line_alpha=.25)

goal_bar_tot = p.add_glyph(goal_source_tot, goal_glyph_tot)

# Circle at the top of each bar; this renderer is the hover target for the
# goal tooltips.
goal_circle_tot = p.circle(
    'x', 'y', source=goal_source_tot, size=15, fill_alpha=1,
    fill_color="white", line_color=spurs_blue, line_alpha=.5,
    legend="Tottenham Goal")

# Comment Lines

# Look the per-minute counts up by their minute key instead of relying on
# `df_comments_per_minute` and `times_list` sharing the same row order. The
# two come from different sources (a groupby, which sorts its keys, and the
# row order of the comment frame), so pairing them by position can attach
# counts to the wrong minute.
counts_by_minute = dict(zip(
    df_comments_per_minute.utc.astype(str).tolist(),
    df_comments_per_minute.counts.tolist()))

# Every column has one entry per minute in `times_list`: the five formatted
# top comments, the number of comments, and the minute as a datetime.
comment_source = ColumnDataSource(
    data=dict(
        comment_1=comment_1_list,
        comment_2=comment_2_list,
        comment_3=comment_3_list,
        comment_4=comment_4_list,
        comment_5=comment_5_list,
        counts=[counts_by_minute[minute] for minute in times_list],
        time=[datetime.strptime(minute, '%Y-%m-%d %H:%M:%S') for minute in times_list]
    )
)


# Activity curve: comments per minute over time.
comment_line = p.line(
    'time', 'counts', legend="Comment Count", source=comment_source)

# Square markers on the same points; this renderer is the hover target for
# the top-comment tooltips.
comment_square = p.square(
    'time', 'counts', legend="Comment Count", source=comment_source, size=10, fill_color="white")


# Add Hover Tooltips

# HTML template for the per-minute markers: lists the five top comments of
# the hovered minute, each filled in from the matching data-source column.
comment_tooltip_html = """
                      <div class="comment_box" style="max-width: 600px">
                        <div><span style="font-size: medium">1: @comment_1</span></div>
                        <div><span style="font-size: medium">2: @comment_2</span></div>
                        <div><span style="font-size: medium">3: @comment_3</span></div>
                        <div><span style="font-size: medium">4: @comment_4</span></div>
                        <div><span style="font-size: medium">5: @comment_5</span></div>
                      </div>                   
                      """

p.add_tools(HoverTool(renderers=[comment_square], tooltips=comment_tooltip_html))

# Each team's goal markers get their own hover tool showing the score line.
for goal_renderer in (goal_circle_lei, goal_circle_tot):
    p.add_tools(HoverTool(renderers=[goal_renderer],
                          tooltips={"Goal": "@score_description"}))


# Format Axis: x tick labels use the hour:minute pattern.
p.xaxis.formatter = DatetimeTickFormatter(hourmin=['%H:%M'])

# Axis Labels: both axes share the same label colour and standoff.
labelled_axes = (
    (p.xaxis, "Time (UTC)"),
    (p.yaxis, "Number of Comments Per Minute"),
)
for axis, axis_title in labelled_axes:
    axis.axis_label = axis_title
    axis.axis_label_text_color = "#aa6666"
    axis.axis_label_standoff = 10

# Grid Lines: make them fully transparent in both directions.
for grid in (p.ygrid, p.xgrid):
    grid.grid_line_alpha = 0

show(p)