# Matchday Thread Analyzer

In [2]:
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from datetime import datetime
import nltk
from nltk import FreqDist
import praw
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from sklearn.feature_extraction import text
import string


# Initiate reddit instance.
# NOTE(review): the first positional argument is presumably the praw.ini site
# name that supplies the API credentials — confirm a 'match-day-bot' section
# exists in the local praw.ini before running.
reddit = praw.Reddit('match-day-bot', user_agent='match-day-bot user agent')

In [3]:
# Permalink of the r/coys match thread analysed in this notebook
match_thread_url = 'https://www.reddit.com/r/coys/comments/8j3tx0/match_thread_spurs_v_leicester_pl_13_may_2018/'

coys_matchday_thread = reddit.submission(url=match_thread_url)

In [4]:
# Resolve the "more comments" placeholders first (limit=None: no cap on how
# many are resolved) so the flattened list below contains only real comments.
coys_matchday_thread.comments.replace_more(limit=None)

# Flatten the comment tree into one list of comment objects
matchday_comment_instances = list(coys_matchday_thread.comments.list())

### Collect match thread comments and comment metadata

In [5]:
# Pull the per-comment fields into parallel lists (one entry per comment).

# Keep the author's name as a plain string instead of the PRAW author object so
# the column can be serialised (JSON/CSV) later on; a missing author stays None.
author = [None if comment.author is None else str(comment.author)
          for comment in matchday_comment_instances]

body = [comment.body for comment in matchday_comment_instances]

karma = [comment.score for comment in matchday_comment_instances]

# Convert each creation timestamp to a (naive, UTC) datetime once, then read
# the individual calendar fields from it.
created_utc = [datetime.utcfromtimestamp(comment.created_utc)
               for comment in matchday_comment_instances]

year = [created.year for created in created_utc]

month = [created.month for created in created_utc]

day = [created.day for created in created_utc]

hour = [created.hour for created in created_utc]

minute = [created.minute for created in created_utc]

### Place comment data into a Pandas dataframe

In [13]:
match_thread_data = {'username': author, 'comment': body, 'karma': karma,
                     'year': year, 'month': month, 'day': day, 'hour': hour, 'minute': minute}

column_order = ['username', 'comment', 'karma',
                'year', 'month', 'day', 'hour', 'minute']

df_match_thread = pd.DataFrame(data=match_thread_data)[column_order]

#Remove deleted comments
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of the unfiltered data.
df_match_thread = df_match_thread[
    df_match_thread['comment'] != '[deleted]'].copy()

#Create Time column
# Clock time is encoded as the float HH.MM (14:04 -> 14.04). The zero-padded
# minute text is built on the fly so that the 'minute' column itself stays
# numeric for the minute-range filters applied later in the notebook.
df_match_thread['time'] = pd.to_numeric(
    df_match_thread['hour'].astype(str)
    + '.'
    + df_match_thread['minute'].astype(str).str.zfill(2))

#Only keep comments made roughly around the game start and finish times
# (13 May, up to 16.1 in the HH.MM encoding, i.e. 16:10)
df_match_thread = df_match_thread[
    (df_match_thread['month'] == 5)
    & (df_match_thread['day'] == 13)
    & (df_match_thread['time'] <= 16.1)]

df_match_thread.sort_values(by=['time'], ascending=False).head()

Unnamed: 0,username,comment,karma,year,month,day,hour,minute,time
1485,ShinyJaker,"Right, not even a hat trick. Fucking scrub /s",1,2018,5,13,16,6,16.06
1327,RGuiscard,"I wanted us to sign him last year, he'll cost ...",2,2018,5,13,16,4,16.04
218,gcast91,if you didn’t rate KWP before you better get y...,8,2018,5,13,16,0,16.0
1826,arryatrick,I know. Most of us have been critical all day ...,1,2018,5,13,16,0,16.0
1449,AvJ164,Mahrez can dribble better,-2,2018,5,13,15,59,15.59


### Reshape data for time series analysis

In [33]:
# Number of comments posted in each time bucket
comment_counts = df_match_thread.groupby('time').size().reset_index(name='counts')

# Boolean mask: True where a comment's karma equals the highest karma seen in
# its own time bucket
idx = df_match_thread['karma'] == df_match_thread.groupby(
    'time')['karma'].transform('max')

df_top_comments = df_match_thread[idx].sort_values(by=['time'])

# Attach the top comment(s) to each bucket's count, then keep a single row per
# time (ties for top karma would otherwise repeat the bucket)
df_comments_per_minute = comment_counts.merge(
    df_top_comments, on='time').drop_duplicates('time')

In [37]:
from bokeh.models.annotations import BoxAnnotation, Label

x = df_comments_per_minute.time
y = df_comments_per_minute.counts

# Send Bokeh output to the notebook
output_notebook()

# Tool list kept for reference; the figure below is created with tools=''
TOOLS = "pan,wheel_zoom,box_zoom,reset,save,box_select"

# Generate graph
p = figure(title="Comments Per Minute", toolbar_location=None, tools='', plot_width=1200)

# Draw the series as markers plus a connecting line
p.circle(x, y, legend="Comment Count")
p.line(x, y, legend="Comment Count")

# Both axes get the same label styling; only the caption differs
for axis, caption in ((p.xaxis, "Time (UTC)"), (p.yaxis, "Number of Comments")):
    axis.axis_label = caption
    axis.axis_label_text_color = "#aa6666"
    axis.axis_label_standoff = 10

# Shaded band over 14.04-14.05 on the time axis, with its caption
center = BoxAnnotation(top=56, bottom=-3, left=14.04, right=14.05,
                       fill_alpha=0.25, fill_color='black')
label1 = Label(x=14.04, y=53, x_offset=12,
               text="Vardy '04: TOT 0-1 LEI", text_baseline="middle")
p.add_layout(center)
p.add_layout(label1)

show(p)

### Find the top 30 words used throughout the match thread

In [7]:
def text_lemmatize(text):
    """
    Lower-case a string, strip ASCII punctuation, split it on whitespace and
    lemmatize each resulting token with WordNet.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.

    Example
    -------
    >>> text_lemmatize('The quick brown fox jumped over the lazy dog.')
    ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']
    """
    # Translation table that deletes every character in string.punctuation
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(cleaned)
    lemmatize = nltk.stem.WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in tokens]


# Default stop-word collection: scikit-learn's built-in English list
# (`text` here is the sklearn.feature_extraction.text module imported above).
stop_words = text.ENGLISH_STOP_WORDS


def word_count(dataframe, column, stop_words=stop_words):
    """
    Count word frequencies across every row of one text column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the text.
    column : str
        Name of the string column to analyse.
    stop_words : collection of str, optional
        Words to exclude; defaults to the module-level ``stop_words``.

    Returns
    -------
    list of (str, int)
        (word, count) pairs ordered from most to least frequent.
    """
    # Join all rows into one document and run it through text_lemmatize
    corpus = dataframe[column].str.cat(sep=" ")
    # Drop words of 3 characters or fewer as well as stop words
    kept = [token for token in text_lemmatize(corpus)
            if not (len(token) <= 3 or token in stop_words)]
    return FreqDist(kept).most_common()


# The 30 most frequent words in the thread's comments
word_count(dataframe=df_match_thread, column='comment')[:30]

[('lamela', 145),
 ('game', 129),
 ('just', 117),
 ('fuck', 108),
 ('goal', 96),
 ('season', 95),
 ('kane', 94),
 ('like', 84),
 ('good', 83),
 ('fucking', 77),
 ('sissoko', 74),
 ('shit', 61),
 ('think', 56),
 ('need', 55),
 ('really', 51),
 ('dont', 48),
 ('player', 48),
 ('rose', 45),
 ('ball', 44),
 ('love', 44),
 ('match', 44),
 ('play', 43),
 ('time', 41),
 ('harry', 40),
 ('look', 40),
 ('commentator', 38),
 ('right', 37),
 ('wanyama', 37),
 ('want', 37),
 ('dier', 37)]

## Tottenham v Leicester started at 14:00 UTC
### Vardy kick-started the match with an early goal at the 4-minute mark: TOT 0-1 LEI

In [57]:
# Show up to 300 characters per cell so long comments are not truncated
pd.set_option('display.max_colwidth', 300)


def comments_game_snapshot(dataframe, hour, minute_start, minute_end):
    """
    Return the comments posted inside a minute window, best-rated first.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'username', 'comment', 'karma', 'hour' and 'minute'.
    hour : int
        Hour the window falls in.
    minute_start, minute_end : int
        First and last minute of the window (both inclusive).

    Returns
    -------
    pandas.DataFrame
        The 'username', 'comment' and 'karma' columns of the matching rows,
        sorted by karma in descending order.
    """
    # 'minute' may hold zero-padded strings such as '04'; coerce it to numbers
    # so the range check works instead of raising on a str/int comparison.
    minute = pd.to_numeric(dataframe['minute'])
    in_window = (dataframe['hour'] == hour) & (
        minute >= minute_start) & (minute <= minute_end)
    df_game_snapshot = dataframe.loc[in_window, :].sort_values(
        by=['karma'], ascending=False)
    return df_game_snapshot[['username', 'comment', 'karma']]


# Ten highest-karma comments posted 14:04-14:06 UTC
comments_game_snapshot(df_match_thread, hour=14, minute_start=4,
                       minute_end=6).head(10)

Unnamed: 0,username,comment,karma
219,buttlovingpanda,KWP gives up the free kick then lets Vardy loose for the goal. Not a great start for him. Come on youngin!,12
138,mrocks301,Jesus we are shit on set pieces,8
374,charcoil23,KWP 100% at fault for that.,8
1401,gobucks2,Lamela floating in no man's land didn't exactly help him out...,6
386,fictional_pulp,That start was sub-optimal.,4
383,Zengoroth,Lol ffs,4
513,TheRcktMan,Not a good start,3
384,shaalth,KWP gave the free kick away and then lost Vardy for the header...,3
514,Bulky_Shepard,"Lamela left him alone, he was marking no one. KWP had a man behind him at least to defend",3
1675,Wurstie_Prurst,Pamela Anderson is the Best Mascot,3


### Kane answers with a 7th-minute goal: TOT 1-1 LEI

In [9]:
# Ten highest-karma comments posted 14:07-14:11 UTC
comments_game_snapshot(df_match_thread, hour=14, minute_start=7,
                       minute_end=11).head(10)

Unnamed: 0,username,comment,karma
140,macrowave2,Big boy Lucas making the interception/assist,9
139,Callum247,That’ll shut up all the miserable whiners in here.,8
141,Spursfan14,It’s just inevitable that he wins the golden boot isn’t it?,7
177,Keskekun,"He's coming for you, he's coming for yooooou. Tiny Egyptian Afroman he's coming for you",7
229,buttlovingpanda,Fucking Kane lol,6
230,alreadymilesaway,"I say it every game, but I fucking love Harry",6
231,SenorQuack,KWP looking solid in attack,6
289,Chroem-,Harry want the Golden boot bois!!!!,6
291,joeypickthall,THIS GAME IS GONNA BE FUCKING CRAZY,6
293,TitanCream,Lucas 2 Kane.\n\nKnew it.,6


### Mahrez scores in the 16th minute: TOT 1-2 LEI

In [10]:
# Ten highest-karma comments posted 14:16-14:20 UTC
comments_game_snapshot(df_match_thread, hour=14, minute_start=16,
                       minute_end=20).head(10)

Unnamed: 0,username,comment,karma
27,TheGameIsAboutGlory1,"That goal is absolutely, 100% on Wanyama. No fucking idea what he's doing there, but he straight up does the dumbest shit at times. Can't blame ""playing through an injury"" on that garbage.",19
37,assassin_9729,Thank fuck we won on wednesday,15
282,Blazing_Frazer,I'll take a 4\-4 if Kane gets the golden boot tbh...,12
112,khj24,What the fuck is wanyama doing,10
142,TheGameIsAboutGlory1,"Fuck, might as well turn the match off now. ""No way this match is gonna end at 2-1."" Whenever commentators say shit like that after a fast start, the goals always stop.",8
179,mikezomfg,what the fuck without jan we are literally shambolic,8
180,highrouleur,Would we take 4-4 with Kane getting enough to beat salah to the Boot?,8
143,ZParis,"Damn Lucas, that touch was dirty.",8
235,GoldenSpurs,Vic and Mousa in midfield is just shithousery. Too clumsy no idea how and when to move the ball on.,7
238,elastic_fantastic,If we just worked on getting the ball to Lucas and Kane and having everybody else only concentrate on clearing the ball we could probably score another 5.,7


### Iheanacho scores in the 47th minute: TOT 1-3 LEI

In [11]:
# Ten highest-karma comments posted 15:05-15:08 UTC
comments_game_snapshot(df_match_thread, hour=15, minute_start=5,
                       minute_end=8).head(10)

Unnamed: 0,username,comment,karma
99,Blazing_Frazer,Just have to laugh really,11
151,AnotherScoutMain,What the fuck is this game,8
150,Undercoverfootmodel,Good thing we are already in champions not having to play in sña mini playoff...,8
247,khj24,Nobody told the announcers at HT about the qualifying rounds?!?! And lol @ third consecutive third placed finish,7
496,stella__art,Why are we the way we are,7
248,akanefive,"Holy shit the announcer just said that if Spurs won it would be their third straight third place finish, and that it would mean they wouldn’t have to play a CL qualifier. FFS.",6
311,mrmunchkin62,Honestly eric what the fuck are you doing with that header,5
312,TehElk,Lmao this game,5
415,CruxMihiGrataQuies,Reminds me of a pre-Poch era match.,4
1280,ClassWarNowII,"One guy actually said ""Maybe Kane was a flash in the pan after all"" in a recent match thread. I was gobsmacked.",4


### Lamela responds with a goal at the 49th minute: TOT 2-3 LEI

In [12]:
# Ten highest-karma comments posted 15:09-15:12 UTC
comments_game_snapshot(df_match_thread, hour=15, minute_start=9,
                       minute_end=12).head(10)

Unnamed: 0,username,comment,karma
38,strawberry_girls,Imagine this sub right now if last week’s game hadn’t ended the way it did,15
82,bterre108,"Commentators have been shit, but shoutout to the camera man finding that dude with two massive tubs of popcorn",12
83,tripstreet,lamela checked this forum at half time and says get fucked haters,11
115,ndphillips,I'm very sorry to report that u/WindyCOYS exploded with happiness after KWP's assist.,10
372,Jackalope117,eRiK lAmELa iSnT gOoD eNoUgH fOr Us,10
114,BrbnDrnkr,Lamela haters go away,8
152,tjakes12,What a ball from Lucas btw,8
153,Cool_Sandwich1,Moura is so good!,8
377,bterre108,Lucas just CREATES. Lad needs to play more,8
250,MaxMhad,Lamela read this fucking thread at half time and wanted to shut everyone up,7


### Fuchs own goal 53': TOT 3-3 LEI

In [13]:
# Ten highest-karma comments posted 15:13-15:17 UTC
comments_game_snapshot(df_match_thread, hour=15, minute_start=13,
                       minute_end=17).head(10)

Unnamed: 0,username,comment,karma
40,PMYOUMYTITS,Lamela reading this thread at halftime.,15
101,warox13,"St. James' Park on the final day will get you, Chelsea.",10
285,TELLS_YOU_TO_FUCKOFF,ALL THE PEOPLE CHATTING SHIT ABOUT LAMELA JUST GOT SILENCED,10
84,HarryWanks,"""Lamela doesn't have the quality to play on an top 4 club""\n\n""Lamela is so shit what does he being to the team? ""\n\n",10
100,ndphillips,Coco's coming for salah,9
154,Tomisnthere,Lamelas desire is amazing,9
185,ClassWarNowII,Maybe we should insult our players more often.,8
186,MidFlightRiot,"6 goals 53 mins, and it's not being shown anywhere ffs",7
254,PMYOUMYTITS,COCO <3,6
255,pay_indigo,In awe of the size of that afro. Absolute unit.,6


### Lamela again 60': TOT 4-3 LEI

In [14]:
# Ten highest-karma comments posted 15:18-15:22 UTC
comments_game_snapshot(df_match_thread, hour=15, minute_start=18,
                       minute_end=22).head(10)

Unnamed: 0,username,comment,karma
26,eric844,"Lamela shutting down the haters, including me. Gg coco",18
30,H2Pcoys,LOL CHELSEA\n\n3-0 to Newcastle,18
41,Thetonn,This wonderful clusterfuck is a lot more fun when it isn't deciding champions league football.,14
56,HoratioMG,LAMELAAAAAAA YOU BEAUTYYYY,12
224,bterre108,"r/coys: Lamela is shit, arguably should be off this team\n\nLamela: hat trick in 15 minues. \n\nWhat even is life?",12
65,TELLS_YOU_TO_FUCKOFF,"LADS, IT'S FUCKING TOTTENHAM",11
85,EdwinJamesPope,"Well, THAT'S why Poch doesn't check Reddit at half-time..",11
102,Jackalope117,Lamela really proved this sub wrong today,10
104,khj24,Lamela catching salah you heard it here first,10
103,warox13,HAHAHAAHAHAHAHAHAHAAH LAMELAAAAAAA,9


### Vardy's equalizer at 73' sandwiched in between Kane's "selfish" play and Kane winning the game: TOT 4-4 LEI

In [15]:
# Ten highest-karma comments posted 15:32-15:33 UTC
comments_game_snapshot(df_match_thread, hour=15, minute_start=32,
                       minute_end=33).head(10)

Unnamed: 0,username,comment,karma
24,jetmora4,Kane it’s too late for the boot now you can’t be doing selfish shit like that,21
42,Xiomaraff,Wow classic ‘18 Spurs. 2 penalties not fucking given and then we let in a goal. Fuck the fuck off with this bullshit,16
44,MertBot,Just because it was an accident doesn't make it not a foul. Clear pen :/,14
67,khj24,That’s on Kane,13
68,alterego87,Oh Kane fuck off,12
89,NeonUprising,"Harry Kane was so selfish there, fuck",11
105,GhostofBobStoops,I feel like I'm watching a fucking Fifa game\n,10
124,albinuss,Nah what the fuck I want third place fuck,9
201,mcicchillo,2 clear penalties no calls,8
268,TheGameIsAboutGlory1,Really not surprised at all that Leicester scored after an absolute stonewall penalty.,6


### Kane scores the winner 76': TOT 5-4 LEI

In [16]:
# Ten highest-karma comments posted 15:34-15:38 UTC
comments_game_snapshot(df_match_thread, hour=15, minute_start=34,
                       minute_end=38).head(10)

Unnamed: 0,username,comment,karma
12,Thetonn,"At this rate, Harry could still get the golden boot and at the same time Spurs still lose.",27
1655,PMYOUMYTITS,Poch wants to make sure Coco doesn't steal Kane's goals. No chance of that with Sissoko.,14
46,scottzander,DID YOU JUST SEE GAZZA DOING THE FORTNITE CELEBRATION,14
221,COYCOYS,"So I went to both drab Watford and Newcastle games, decided to skip this game and it's 5-4 ffffsssss hate myself",14
45,spurs-r-us,Tears in my eyes watching Gazza celebrate,14
57,scottzander,"Sanchez up front for the knock downs, good idea Poch, we need some more goals!",13
69,warox13,[This Sub Today](https://i.imgur.com/x2zNkuV.jpg),12
91,PMYOUMYTITS,Anyone have a gif of gazza doing the dance?,12
92,southcoastyid,Poch had enough. Sanchez now on.\n,12
107,H2Pcoys,America would be 100% in if every game was played like this,11


In [27]:
from IPython.core.display import display, HTML
from string import Template
import pandas as pd
import json, random

# Alias for the per-time-bucket frame; the D3 bar-chart cell further down
# serialises `data` into the page.
data = df_comments_per_minute

In [28]:
# Load D3 from a local copy: expects a d3.min.js file at ./d3.min.js relative
# to the page serving this notebook.
HTML('<script src="./d3.min.js"></script>')

In [29]:
# HTML template: $css_text is placed inside the <style> tag and $js_text inside
# the <script> tag; the empty #graph-div is the element the D3 script selects
# and appends its SVG to.
html_template = Template('''
<style> $css_text </style>
<div id="graph-div"></div>
<script> $js_text </script>
''')

In [30]:
# CSS for the D3 bar chart: bar fill and hover colours plus axis styling.
# Substituted for $css_text in html_template.
css_text = '''

.bar {
  fill: steelblue;}

.bar:hover {
  fill: brown;}

.axis {
  font: 10px sans-serif;}

.axis path,
.axis line {
  fill: none;
  stroke: #000;
  shape-rendering: crispEdges;}

.x.axis path {
  display: none;}

'''

In [31]:
# JavaScript template: D3 bar chart (d3.scale / d3.svg API) of comment counts
# per time bucket. $data is replaced with a JSON array of records; each record
# must expose `time` (x category) and `counts` (bar height).
js_text_template = Template('''

var margin = {top: 20, right: 20, bottom: 30, left: 40},
    width = 500 - margin.left - margin.right,
    height = 300 - margin.top - margin.bottom;

var x = d3.scale.ordinal()
    .rangeRoundBands([0, width], .1);

var y = d3.scale.linear()
    .range([height, 0]);

var xAxis = d3.svg.axis()
    .scale(x)
    .orient("bottom");

var yAxis = d3.svg.axis()
    .scale(y)
    .orient("left");

var svg = d3.select("#graph-div").append("svg")
    .attr("width", width + margin.left + margin.right)
    .attr("height", height + margin.top + margin.bottom)
  .append("g")
    .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

var data = $data ;

  x.domain(data.map(function(d) { return d.time; }));
  y.domain([0, d3.max(data, function(d) { return d.counts; })]);

  svg.append("g")
      .attr("class", "x axis")
      .attr("transform", "translate(0," + height + ")")
      .call(xAxis);

  svg.append("g")
      .attr("class", "y axis")
      .call(yAxis);

  svg.selectAll(".bar")
      .data(data)
    .enter().append("rect")
      .attr("class", "bar")
      .attr("x", function(d) { return x(d.time); })
      .attr("width", x.rangeBand())
      .attr("y", function(d) { return y(d.counts); })
      .attr("height", function(d) { return height - y(d.counts); });

''')

In [32]:
# Serialise only the numeric `time` / `counts` columns. The remaining columns
# carry free-form user text, which is unsafe to embed verbatim inside a
# <script> block, and may hold values json.dumps cannot encode. DataFrame.to_json
# also takes care of converting numpy scalars.
js_text = js_text_template.substitute(
    {'data': data[['time', 'counts']].to_json(orient='records')})
HTML(html_template.substitute({'css_text': css_text, 'js_text': js_text}))

In [24]:
# D3 v4 line chart of comment counts over time, read from comments.csv.
# NOTE(review): comments.csv (columns `time` and `counts`) is not written by
# any cell visible in this notebook — confirm it exists before running.
# Changes from the stock D3 example this was adapted from:
#   * CSV fields arrive as strings, so they are coerced to numbers with unary +
#   * x uses a linear scale because `time` is an HH.MM float, not a Date
#   * the SVG is selected by id so another <svg> on the page is never picked up
#   * the y-axis caption describes this data
display(HTML("""
<!DOCTYPE html>
<svg id="comments-line-chart" width="960" height="500"></svg>
<script src="https://d3js.org/d3.v4.min.js"></script>
<script>

var svg = d3.select("#comments-line-chart"),
    margin = {top: 20, right: 20, bottom: 30, left: 50},
    width = +svg.attr("width") - margin.left - margin.right,
    height = +svg.attr("height") - margin.top - margin.bottom,
    g = svg.append("g").attr("transform", "translate(" + margin.left + "," + margin.top + ")");

var x = d3.scaleLinear()
    .rangeRound([0, width]);

var y = d3.scaleLinear()
    .rangeRound([height, 0]);

var line = d3.line()
    .x(function(d) { return x(d.date); })
    .y(function(d) { return y(d.close); });

d3.csv("comments.csv", function(d) {
  d.date = +d.time;
  d.close = +d.counts;
  return d;
}, function(error, data) {
  if (error) throw error;

  x.domain(d3.extent(data, function(d) { return d.date; }));
  y.domain(d3.extent(data, function(d) { return d.close; }));

  g.append("g")
      .attr("transform", "translate(0," + height + ")")
      .call(d3.axisBottom(x))
    .select(".domain")
      .remove();

  g.append("g")
      .call(d3.axisLeft(y))
    .append("text")
      .attr("fill", "#000")
      .attr("transform", "rotate(-90)")
      .attr("y", 6)
      .attr("dy", "0.71em")
      .attr("text-anchor", "end")
      .text("Number of Comments");

  g.append("path")
      .datum(data)
      .attr("fill", "none")
      .attr("stroke", "steelblue")
      .attr("stroke-linejoin", "round")
      .attr("stroke-linecap", "round")
      .attr("stroke-width", 1.5)
      .attr("d", line);
});

</script>
"""))