# Matchday Thread Analyzer

In [12]:
from datetime import datetime
import nltk
from nltk import FreqDist
import praw
import pandas as pd
from sklearn.feature_extraction import text
import string


#Initiate reddit instance
reddit = praw.Reddit('match-day-bot', user_agent='match-day-bot user agent')

In [2]:
coys_matchday_thread = reddit.submission(url='https://www.reddit.com/r/coys/comments/8j3tx0/match_thread_spurs_v_leicester_pl_13_may_2018/')

In [3]:
coys_matchday_thread.comments.replace_more(limit=None)
matchday_comment_instances = [comment for comment in coys_matchday_thread.comments.list()]    

### Collect match thread comments and comment metadata

In [4]:
author = [comment.author for comment in matchday_comment_instances]
body = [comment.body for comment in matchday_comment_instances]
karma = [comment.score for comment in matchday_comment_instances]
year = [datetime.utcfromtimestamp(comment.created_utc).year for comment in matchday_comment_instances]
month = [datetime.utcfromtimestamp(comment.created_utc).month for comment in matchday_comment_instances]
day = [datetime.utcfromtimestamp(comment.created_utc).day for comment in matchday_comment_instances]
hour = [datetime.utcfromtimestamp(comment.created_utc).hour for comment in matchday_comment_instances]
minute = [datetime.utcfromtimestamp(comment.created_utc).minute for comment in matchday_comment_instances]

In [5]:
match_thread_data = {'username': author, 'comment': body, 'karma': karma, 'year': year, 'month': month, 'day': day, 'hour': hour, 'minute': minute}
df_match_thread = pd.DataFrame(data=match_thread_data)
df_match_thread = df_match_thread[['username', 'comment', 'karma', 'year', 'month', 'day', 'hour', 'minute']]

In [6]:
df_match_thread.head()

Unnamed: 0,username,comment,karma,year,month,day,hour,minute
0,akanefive,NBC announcer giving incorrect information abo...,66,2018,5,13,14,31
1,charcoil23,Dumb fuck announcers don't know that 4th place...,55,2018,5,13,14,31
2,a_magic_wizard,"""Toby is our best defender and the core of our...",43,2018,5,13,14,27
3,Keskekun,Sub off lamela and sissoko and bring on Keane ...,44,2018,5,13,14,41
4,TheGameIsAboutGlory1,I fucking hate how goalkeepers are a protected...,45,2018,5,13,14,48


In [7]:
def text_lemmatize(text):
    """
    tokenize, lemmatize, and remove all punctuation from a string
    
    Example
    -------
    >>>lemmatize_text('The quick brown fox jumped over the lazy dog.')
    ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']
    """
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    translator=str.maketrans('','',string.punctuation)
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text.lower().translate(translator))]

# def text_join(text):
#     return ' '.join(text)

# df_match_thread['comment_lemmatized'] = df_match_thread['comment'].apply(text_lemmatize)
# df_match_thread['comment_join'] = df_match_thread['comment_lemmatized'].apply(text_join)
df_match_thread.head()

Unnamed: 0,username,comment,karma,year,month,day,hour,minute,comment_lemmatized
0,akanefive,NBC announcer giving incorrect information abo...,66,2018,5,13,14,31,"[nbc, announcer, giving, incorrect, informatio..."
1,charcoil23,Dumb fuck announcers don't know that 4th place...,55,2018,5,13,14,31,"[dumb, fuck, announcer, dont, know, that, 4th,..."
2,a_magic_wizard,"""Toby is our best defender and the core of our...",43,2018,5,13,14,27,"[toby, is, our, best, defender, and, the, core..."
3,Keskekun,Sub off lamela and sissoko and bring on Keane ...,44,2018,5,13,14,41,"[sub, off, lamela, and, sissoko, and, bring, o..."
4,TheGameIsAboutGlory1,I fucking hate how goalkeepers are a protected...,45,2018,5,13,14,48,"[i, fucking, hate, how, goalkeeper, are, a, pr..."


In [13]:
stop_words = text.ENGLISH_STOP_WORDS

def word_count(dataframe, column, stop_words=stop_words):
    #Apply lemmatize_text function to each base column
    lemmatized_docs = text_lemmatize(dataframe[column].str.cat(sep=" "))
    #Only keep words with a length of greater than 3 characters and remove stop words
    lemmatized_docs = [w for w in lemmatized_docs if len(w) > 3 and w not in stop_words]
    #Generate word frequencies and order from greatest to least
    return FreqDist(lemmatized_docs).most_common()

word_count(df_match_thread, 'comment')

[('lamela', 146),
 ('game', 129),
 ('just', 117),
 ('fuck', 108),
 ('goal', 96),
 ('season', 95),
 ('kane', 94),
 ('like', 84),
 ('good', 83),
 ('fucking', 77),
 ('sissoko', 74),
 ('shit', 61),
 ('think', 56),
 ('need', 55),
 ('really', 51),
 ('dont', 48),
 ('player', 48),
 ('rose', 45),
 ('ball', 44),
 ('love', 44),
 ('match', 44),
 ('play', 43),
 ('time', 41),
 ('harry', 40),
 ('look', 40),
 ('commentator', 38),
 ('right', 37),
 ('wanyama', 37),
 ('want', 37),
 ('dier', 37),
 ('half', 36),
 ('going', 35),
 ('year', 35),
 ('great', 34),
 ('coco', 33),
 ('thats', 33),
 ('today', 32),
 ('lucas', 32),
 ('make', 30),
 ('playing', 30),
 ('minute', 29),
 ('team', 29),
 ('score', 28),
 ('chelsea', 28),
 ('better', 28),
 ('yeah', 28),
 ('know', 27),
 ('toby', 27),
 ('place', 25),
 ('boot', 25),
 ('spur', 25),
 ('people', 25),
 ('finish', 25),
 ('said', 25),
 ('getting', 23),
 ('leicester', 23),
 ('come', 23),
 ('poch', 23),
 ('golden', 23),
 ('league', 23),
 ('salah', 23),
 ('best', 22),
 ('i

### Tottenham v Leicester started at 14:00 UTC
##### Vardy kick started the match with an early goal at the 4 minute mark

In [22]:
vardy_goal_1_criteria = df_match_thread[(df_match_thread['hour'] == 14) & (df_match_thread['minute'] >= 4) & (df_match_thread['minute'] <= 9)]
vardy_goal_1_row_indices = vardy_goal_1_criteria.index
df_vardy_goal_1 = df_match_thread.loc[vardy_goal_1_row_indices, :]
word_count(df_vardy_goal_1, 'comment')

[('just', 10),
 ('start', 7),
 ('harry', 7),
 ('golden', 6),
 ('boot', 6),
 ('game', 6),
 ('goal', 5),
 ('come', 5),
 ('fucking', 5),
 ('kane', 5),
 ('shit', 4),
 ('want', 4),
 ('vardy', 3),
 ('ball', 3),
 ('like', 3),
 ('bang', 3),
 ('that’s', 3),
 ('piece', 2),
 ('lucas', 2),
 ('it’s', 2),
 ('free', 2),
 ('kick', 2),
 ('great', 2),
 ('love', 2),
 ('finish', 2),
 ('gonna', 2),
 ('deleted', 2),
 ('really', 2),
 ('going', 2),
 ('lamela', 2),
 ('left', 2),
 ('marking', 2),
 ('thank', 2),
 ('leicester', 2),
 ('hope', 2),
 ('fuck', 2),
 ('hell', 2),
 ('ouch', 2),
 ('yeah', 2),
 ('spot', 2),
 ('looking', 2),
 ('calm', 2),
 ('season', 2),
 ('stream', 2),
 ('minute', 2),
 ('working', 2),
 ('doesnt', 2),
 ('look', 2),
 ('didnt', 2),
 ('money', 2),
 ('jesus', 1),
 ('that’ll', 1),
 ('shut', 1),
 ('miserable', 1),
 ('whiner', 1),
 ('making', 1),
 ('interceptionassist', 1),
 ('inevitable', 1),
 ('isn’t', 1),
 ('loose', 1),
 ('youngin', 1),
 ('bois', 1),
 ('thats', 1),
 ('proper', 1),
 ('crazy', 1)