# Matchday Thread Analyzer

In [1]:
from datetime import datetime
import nltk
from nltk import FreqDist
import praw
import pandas as pd
from sklearn.feature_extraction import text
import string


#Initiate reddit instance
reddit = praw.Reddit('match-day-bot', user_agent='match-day-bot user agent')

In [2]:
coys_matchday_thread = reddit.submission(url='https://www.reddit.com/r/coys/comments/8j3tx0/match_thread_spurs_v_leicester_pl_13_may_2018/')

In [3]:
coys_matchday_thread.comments.replace_more(limit=None)
matchday_comment_instances = [comment for comment in coys_matchday_thread.comments.list()]    

### Collect match thread comments and comment metadata

In [4]:
author = [comment.author for comment in matchday_comment_instances]
body = [comment.body for comment in matchday_comment_instances]
karma = [comment.score for comment in matchday_comment_instances]
year = [datetime.utcfromtimestamp(comment.created_utc).year for comment in matchday_comment_instances]
month = [datetime.utcfromtimestamp(comment.created_utc).month for comment in matchday_comment_instances]
day = [datetime.utcfromtimestamp(comment.created_utc).day for comment in matchday_comment_instances]
hour = [datetime.utcfromtimestamp(comment.created_utc).hour for comment in matchday_comment_instances]
minute = [datetime.utcfromtimestamp(comment.created_utc).minute for comment in matchday_comment_instances]

In [5]:
match_thread_data = {'username': author, 'comment': body, 'karma': karma, 'year': year, 'month': month, 'day': day, 'hour': hour, 'minute': minute}
df_match_thread = pd.DataFrame(data=match_thread_data)
df_match_thread = df_match_thread[['username', 'comment', 'karma', 'year', 'month', 'day', 'hour', 'minute']]
remove_delated = df_match_thread[(df_match_thread['comment'] != '[deleted]')] 
remove_deleted_row_indices = remove_delated.index
df_match_thread = df_match_thread.loc[remove_deleted_row_indices, :]

In [27]:
df_match_thread.head()

Unnamed: 0,username,comment,karma,year,month,day,hour,minute
0,akanefive,NBC announcer giving incorrect information about the CL right now.,66,2018,5,13,14,31
1,charcoil23,Dumb fuck announcers don't know that 4th place no longer has to go through qualifying rounds. Imagine getting paid to be this bad at your job.,58,2018,5,13,14,31
2,a_magic_wizard,"""Toby is our best defender and the core of our defence.""\n\n*Toby starts, Leicester score 2 in 16 mins*\n\n""Our defence is shit without Jan""",46,2018,5,13,14,27
3,Keskekun,Sub off lamela and sissoko and bring on Keane and Berbatov,45,2018,5,13,14,41
4,TheGameIsAboutGlory1,I fucking hate how goalkeepers are a protected species. Kane just got a foul called against him for literally standing. That was the foul. Standing.,46,2018,5,13,14,48


### Find the top 30 words used throughout the match thread

In [29]:
def text_lemmatize(text):
    """
    tokenize, lemmatize, and remove all punctuation from a string
    
    Example
    -------
    >>>lemmatize_text('The quick brown fox jumped over the lazy dog.')
    ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']
    """
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    translator=str.maketrans('','',string.punctuation)
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text.lower().translate(translator))]

stop_words = text.ENGLISH_STOP_WORDS
def word_count(dataframe, column, stop_words=stop_words):
    #Apply lemmatize_text function to each base column
    lemmatized_docs = text_lemmatize(dataframe[column].str.cat(sep=" "))
    #Only keep words with a length of greater than 3 characters and remove stop words
    lemmatized_docs = [w for w in lemmatized_docs if len(w) > 3 and w not in stop_words]
    #Generate word frequencies and order from greatest to least
    return FreqDist(lemmatized_docs).most_common()

word_count(df_match_thread, 'comment')[:30]

[('lamela', 146),
 ('game', 129),
 ('just', 117),
 ('fuck', 108),
 ('goal', 96),
 ('season', 95),
 ('kane', 94),
 ('like', 84),
 ('good', 83),
 ('fucking', 77),
 ('sissoko', 74),
 ('shit', 61),
 ('think', 56),
 ('need', 55),
 ('really', 51),
 ('dont', 48),
 ('player', 48),
 ('rose', 45),
 ('ball', 44),
 ('love', 44),
 ('match', 44),
 ('play', 43),
 ('time', 41),
 ('harry', 40),
 ('look', 40),
 ('commentator', 38),
 ('right', 37),
 ('wanyama', 37),
 ('want', 37),
 ('dier', 37)]

## Tottenham v Leicester started at 14:00 UTC
### Vardy kick started the match with an early goal at the 4 minute mark

In [30]:
pd.options.display.max_colwidth = 300

def comments_game_snapshot(dataframe, hour, minute_start, minute_end):
    game_snapshot_criteria = dataframe[(dataframe['hour'] == hour) & (dataframe['minute'] >= minute_start) & (dataframe['minute'] <= minute_end)]
    game_snapshot_row_indices = game_snapshot_criteria.index
    df_game_snapshot = dataframe.loc[game_snapshot_row_indices, :]
    df_game_snapshot = df_game_snapshot.sort_values(by=['karma'], ascending=False)
    return df_game_snapshot[['username','comment','karma']]
    
comments_game_snapshot(dataframe=df_match_thread,
                       hour=14,
                       minute_start=4,
                       minute_end=6)[:10]

Unnamed: 0,username,comment,karma
219,buttlovingpanda,KWP gives up the free kick then lets Vardy loose for the goal. Not a great start for him. Come on youngin!,13
1402,gobucks2,Lamela floating in no man's land didn't exactly help him out...,8
138,mrocks301,Jesus we are shit on set pieces,8
374,charcoil23,KWP 100% at fault for that.,8
383,Zengoroth,Lol ffs,3
384,shaalth,KWP gave the free kick away and then lost Vardy for the header...,3
386,fictional_pulp,That start was sub-optimal.,3
511,oysterpirate,Well that about sums up the last two months,3
512,FPnigel,and it begins...,3
513,TheRcktMan,Not a good start,3


### Kane answers with a 7th minute goal

In [25]:
comments_game_snapshot(dataframe=df_match_thread,
                       hour=14,
                       minute_start=7,
                       minute_end=11)[:10]

Unnamed: 0,username,comment,karma
139,Callum247,That’ll shut up all the miserable whiners in here.,8
141,Spursfan14,It’s just inevitable that he wins the golden boot isn’t it?,8
177,Keskekun,"He's coming for you, he's coming for yooooou. Tiny Egyptian Afroman he's coming for you",7
231,SenorQuack,KWP looking solid in attack,7
140,macrowave2,Big boy Lucas making the interception/assist,7
229,buttlovingpanda,Fucking Kane lol,6
230,alreadymilesaway,"I say it every game, but I fucking love Harry",6
290,tripstreet,"THATS A PROPER FINISH HARRY, GO ON",6
292,buttlovingpanda,Nifty ball that from KWP. Looked like a Trippier ball,6
293,TitanCream,Lucas 2 Kane.\n\nKnew it.,6


### Mahrez scores at the 16th minute

In [26]:
comments_game_snapshot(dataframe=df_match_thread,
                       hour=14,
                       minute_start=16,
                       minute_end=20)[:10]

Unnamed: 0,username,comment,karma
27,TheGameIsAboutGlory1,"That goal is absolutely, 100% on Wanyama. No fucking idea what he's doing there, but he straight up does the dumbest shit at times. Can't blame ""playing through an injury"" on that garbage.",18
37,assassin_9729,Thank fuck we won on wednesday,16
282,Blazing_Frazer,I'll take a 4\-4 if Kane gets the golden boot tbh...,12
112,khj24,What the fuck is wanyama doing,9
180,highrouleur,Would we take 4-4 with Kane getting enough to beat salah to the Boot?,8
143,ZParis,"Damn Lucas, that touch was dirty.",8
142,TheGameIsAboutGlory1,"Fuck, might as well turn the match off now. ""No way this match is gonna end at 2-1."" Whenever commentators say shit like that after a fast start, the goals always stop.",7
179,mikezomfg,what the fuck without jan we are literally shambolic,7
238,elastic_fantastic,If we just worked on getting the ball to Lucas and Kane and having everybody else only concentrate on clearing the ball we could probably score another 5.,6
239,gezmaestro,wow this improvised Alderweireld - Dier defence is not working out\n,6
