# NFL post-game thread sentiment analysis (adapted from the Game of Thrones example from class)

In [None]:
import datetime as dt
import json
import os
import re
import string
from collections import Counter
from datetime import datetime, timezone

import nltk
import numpy as np
import pandas as pd
import praw
import requests
import spacy
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize
from seaborn import boxplot
from segtok.segmenter import split_single
from textblob import TextBlob
from tqdm import tqdm

# Grab Post Game Threads for a specific team

In [None]:
!pip3 install praw
!pip3 install flair

In [None]:


# NLTK resources used in this notebook: the VADER lexicon for
# SentimentIntensityAnalyzer, the punkt models for sent_tokenize/word_tokenize,
# and the stopword lists that cleantext reads via stopwords.words('english').
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

In [None]:

# Maps the team names used throughout this notebook (they double as subreddit
# names) to the abbreviation that appears in pro-football-reference.com URLs.
team_dict_for_url = dict(
    DenverBroncos='den',
    Patriots='nwe',
    Browns='cle',
    KansasCityChiefs='kan',
    nyjets='nyj',
)


def _game_timestamp(date_text, season_year):
    """Convert a schedule date such as "September 14" to a UTC Unix timestamp.

    Returns the string "nan" when the cell is not a "<Month> <day>" date
    (e.g. a bye week or a missing value).
    """
    parts = str(date_text).split()
    if len(parts) != 2:
        return "nan"
    try:
        # The year is derived per game from the season year, never carried
        # over between rows: January games belong to the next calendar year.
        game_year = int(season_year)
        if parts[0] == "January":
            game_year += 1
        parsed = datetime.strptime(
            "{} {}, {}".format(parts[0], parts[1], game_year), "%B %d, %Y")
    except ValueError:
        return "nan"
    return parsed.replace(tzinfo=timezone.utc).timestamp()


def _outcome_code(outcome):
    """Encode a result cell: "W" -> 1, "L" -> 0, anything else -> -1."""
    if outcome == "W":
        return 1
    if outcome == "L":
        return 0
    return -1


def wins_and_losses(team, year, num_weeks=17):
    """Scrape one season of results for ``team`` from pro-football-reference.com.

    Args:
        team: Key of ``team_dict_for_url`` (e.g. "DenverBroncos").
        year: Season year used to build the page URL.
        num_weeks: Number of leading schedule rows to read (default 17).

    Returns:
        dict mapping each game date (Unix timestamp at midnight UTC) to
        1 for a win, 0 for a loss and -1 for any other result cell. Rows
        whose date cannot be parsed all share the single key "nan", so only
        the last such row's value is kept.

    Raises:
        KeyError: if ``team`` is not in ``team_dict_for_url``.
    """
    # generate url
    team_abrev = team_dict_for_url[team]
    url = ('https://www.pro-football-reference.com/teams/'
           + team_abrev + '/' + str(year) + '.htm#games::none')

    # Second table on the page; its columns are addressed by the two-level
    # header labels pandas generates for it.
    season_stats = pd.read_html(url)[1]
    outcomes = season_stats["Unnamed: 5_level_0"]["Unnamed: 5_level_1"]
    dates = season_stats["Unnamed: 2_level_0"]["Date"]

    date_win_loss_dict = {}
    for i in range(num_weeks):
        date_win_loss_dict[_game_timestamp(dates[i], year)] = \
            _outcome_code(outcomes[i])
    return date_win_loss_dict


# Smoke test: fetch the Broncos' 2020 results (wins_and_losses reads the page
# over HTTP via pd.read_html).
wins_and_losses("DenverBroncos", 2020)

In [None]:
def get_cumulative_post_sentiment(post, analyzer="vader"):
    """Average sentence-level sentiment over the comments of one Reddit post.

    The comment bodies are joined, split into sentences with sent_tokenize,
    and each sentence is scored with ``get_sentiment``.

    Args:
        post: PRAW submission; its comment forest is fully expanded in place.
        analyzer: Analyzer name forwarded to ``get_sentiment``.

    Returns:
        (total_sentiment, sentiment_list): the mean "pos"/"neg"/"neu" scores
        over all sentences, and the per-sentence score dicts. When the post
        yields no sentences the means are all 0 and the list is empty.
    """
    post.comments.replace_more(limit=None)

    # NOTE(review): this iterates post.comments directly; per the PRAW comment
    # tutorial that visits top-level comments only. Confirm replies are meant
    # to be excluded (post.comments.list() would include them).
    text = "\n".join(comment.body for comment in post.comments)
    sentiment_list = [get_sentiment(sentence, analyzer)
                      for sentence in tqdm(sent_tokenize(text))]

    total_sentiment = {"pos": 0, "neg": 0, "neu": 0}
    if not sentiment_list:
        # Nothing to average; avoid dividing by zero.
        return total_sentiment, sentiment_list

    sentence_count = len(sentiment_list)
    for label in total_sentiment:
        total_sentiment[label] = sum(
            scores[label] for scores in sentiment_list) / sentence_count
    return total_sentiment, sentiment_list


# Lazily-created VADER model, shared across calls so the analyzer is not
# rebuilt for every sentence.
_VADER_MODEL = None


def get_sentiment(sentence, analyzer):
    """Score a single sentence with the chosen sentiment analyzer.

    Args:
        sentence: Text to score.
        analyzer: "vader", "textblob" or "flair".

    Returns:
        dict with at least the keys "pos", "neg" and "neu". For "vader" this
        is the dict returned by ``polarity_scores``; for "textblob" and
        "flair" "neu" is always 0 and "pos" + "neg" == 1.

    Raises:
        ValueError: if ``analyzer`` is not one of the supported names.
    """
    global _VADER_MODEL

    if analyzer == "vader":
        if _VADER_MODEL is None:
            _VADER_MODEL = SentimentIntensityAnalyzer()
        return _VADER_MODEL.polarity_scores(sentence)

    if analyzer == "textblob":
        # TextBlob polarity is rescaled from [-1, 1] to [0, 1].
        polarity = (TextBlob(sentence).sentiment.polarity + 1) / 2
        return {"pos": polarity, "neg": 1 - polarity, "neu": 0}

    if analyzer == "flair":
        # NOTE(review): `Sentence` and `classifier` are not defined or imported
        # anywhere in the visible notebook - presumably flair's Sentence class
        # and a loaded flair text classifier. They must exist in the kernel
        # before this branch is used.
        text = Sentence(sentence)
        classifier.predict(text)
        value = text.labels[0].to_dict()['value']
        result = round(text.to_dict()['labels'][0]['confidence'], 4)
        # The confidence belongs to the predicted label; the other label gets
        # the complement.
        if value == 'POSITIVE':
            return {"pos": result, "neg": 1 - result, "neu": 0}
        return {"pos": 1 - result, "neg": result, "neu": 0}

    raise ValueError("Unknown analyzer: {!r}".format(analyzer))

In [None]:
# Reddit API credentials are read from the environment so secrets are not
# stored in the notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# running this cell; any credentials that were ever committed in this notebook
# should be revoked.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='Gimme Threads')

# Bound with `=`: a bare `teamDict: {...}` is only an annotation and defines
# no variable.
teamDict = {
    "broncos": "DenverBroncos"
}


def get_team_history(team, depth=5, analyzer="flair"):
    """Collect sentiment summaries for a team's "Post Game Thread" posts.

    Args:
        team: Subreddit name to search.
        depth: Maximum number of posts to analyse; a negative value means
            every search result.
        analyzer: Analyzer name passed to ``get_cumulative_post_sentiment``.

    Returns:
        list of dicts, one per analysed post, with the keys "title",
        "reddit_date" (UTC Unix timestamp of the post), "win" (0 until
        ``add_win_loss_data`` fills it in), "pos_sentiment", "neg_sentiment",
        "neu_sentiment", "sentiment_list" and "text" (the joined comment
        bodies, which the text-cleaning cells read).
    """
    team_history = []
    post_game_threads = reddit.subreddit(team).search("Post Game Thread")
    for post in tqdm(post_game_threads):
        # Stop as soon as enough posts are collected instead of paging
        # through the remaining search results.
        if 0 <= depth <= len(team_history):
            break

        total_sentiment, sentiment_list = get_cumulative_post_sentiment(
            post, analyzer=analyzer)
        team_history.append({
            "title": post.title,
            # created_utc is already a UTC epoch; converting through a naive
            # local datetime and relabelling it as UTC would shift the value
            # by the machine's UTC offset.
            "reddit_date": float(post.created_utc),
            "win": 0,
            "pos_sentiment": total_sentiment["pos"],
            "neg_sentiment": total_sentiment["neg"],
            "neu_sentiment": total_sentiment["neu"],
            "sentiment_list": sentiment_list,
            # Same comment text that the sentiment pass scored.
            "text": "\n".join(comment.body for comment in post.comments),
        })

    return team_history


# Small sample run (three threads) kept in `test` for the cells below; this
# queries Reddit through the `reddit` client.
test = get_team_history("DenverBroncos", depth=3)

In [None]:
def add_win_loss_data(team_history, win_loss_data):
    """Stamp game outcomes onto thread summaries, in place.

    A summary's "win" field is set to a game's outcome when the summary's
    "reddit_date" lies strictly between the game timestamp and 48 hours
    after it. Entries keyed "nan" are ignored. Returns None.
    """
    window_seconds = 48 * 60 * 60
    for kickoff, outcome in win_loss_data.items():
        if kickoff == "nan":
            continue
        for game in team_history:
            window_start = int(kickoff)
            posted_at = game["reddit_date"]
            if window_start < posted_at < window_start + window_seconds:
                game["win"] = outcome


# Fetch the Broncos' 2020 results and stamp each sampled thread with the
# outcome of the game it was posted within 48 hours after.
win_loss_data = wins_and_losses("DenverBroncos", 2020)

# add_win_loss_data updates the summaries in `test` in place.
add_win_loss_data(test, win_loss_data)

for i in test:
    print(i)

In [None]:
neg_sentiment = [summary["neg_sentiment"] for summary in test]
pos_sentiment = [summary["pos_sentiment"] for summary in test]
wins = [summary["win"] for summary in test]


plt.plot(neg_sentiment, label="negative")
plt.plot(pos_sentiment, label="positive")
plt.plot(wins, label="win")
# Labels are set through the Axes. Assigning to plt.title / plt.ylabel /
# plt.xlabel would replace those pyplot functions with strings (breaking every
# later call) without labelling anything.
ax = plt.gca()
ax.set_title("Sentiment Over Time")
ax.set_ylabel("Score")
ax.set_xlabel("Date")
plt.legend(loc='best')
plt.show()

In [None]:
# Subreddits to analyse and the seasons whose results are matched against
# their post-game threads.
teams_list = ["DenverBroncos", "KansasCityChiefs",
              "nyjets", "Browns", "Patriots"]
year_list = [2020, 2019, 2018]
teams_dict = {}

for team in teams_list:
    team_history = get_team_history(team, depth=16)  # Set depth to past games
    # Apply every season's results; add_win_loss_data only touches threads
    # posted within 48 hours after one of that season's games.
    for year in year_list:
        win_loss_data = wins_and_losses(team, year)
        add_win_loss_data(team_history, win_loss_data)

    neg_sentiment = [summary["neg_sentiment"] for summary in team_history]
    pos_sentiment = [summary["pos_sentiment"] for summary in team_history]
    wins = [summary["win"] for summary in team_history]

    teams_dict[team] = {"pos": pos_sentiment,
                        "neg": neg_sentiment, "win": wins}

# Persist the per-team series; only "pos", "neg" and "win" are written.
with open('team_histories_JSON.txt', 'w') as outfile:
    json.dump(teams_dict, outfile)

In [None]:
# Inspect the outcome series collected for the Broncos.
print(teams_dict["DenverBroncos"]["win"])

In [None]:
# Create the sized figure first: calling plt.figure() after plotting opens a
# second, empty figure and leaves the plotted one at the default size.
plt.figure(figsize=(10, 10))
for team in teams_dict.keys():
    plt.plot(teams_dict[team]["pos"], label=team)

# Set the title through the Axes; assigning to plt.title would overwrite the
# pyplot function instead of titling the plot.
plt.gca().set_title("Sentiment Over Time")
plt.legend(loc='best')
plt.show()

In [None]:
# Load a previously saved history file and inspect its structure.
# NOTE(review): this file is read with "date" and "total_sentiment" keys,
# which the dump cell in this notebook (team_histories_JSON.txt) does not
# write - presumably it was produced by a different version of the pipeline.
with open('team_histories_JSON_vicente.txt') as json_file:
    data = json.load(json_file)
    print(type(data))
    print(data.keys())
    print(data['DenverBroncos'].keys())
    print(data['DenverBroncos']['pos'])
    print(data['DenverBroncos']['neg'])
    print(data['DenverBroncos']['win'])
    print(data['DenverBroncos']['date'])
    print(data['DenverBroncos']['total_sentiment'])

In [None]:
# Replace `data` with the contents of team_histories_JSON2.json; the cells
# below read from this `data`.
with open('team_histories_JSON2.json') as json_file:
    data = json.load(json_file)

In [None]:
# Inspect the Patriots' date and positive-sentiment series.
print(data['Patriots']['date'])
print(data['Patriots']['pos'])

In [None]:
# NOTE(review): sorttime is only defined in a later cell, so on a fresh
# kernel run top-to-bottom this cell raises NameError.
x = sorttime('Patriots')
print(x)

In [None]:
# Top-level keys of the loaded history (used as team names by the loops below).
data.keys()

In [None]:
# sort time


def sorttime(team):
    """Return {date: positive score} for ``team`` from the global ``data``.

    Dates and scores are paired by position; a repeated date keeps the score
    of its last occurrence.
    """
    team_record = data[team]
    score_by_date = {}
    for idx, game_date in enumerate(team_record['date']):
        score_by_date[game_date] = team_record['pos'][idx]
    return score_by_date


# Plot each team's positive sentiment in chronological order, all teams on
# one figure.
for team in data.keys():
    # Built before the zero-date removal below, so it still maps every date.
    sdict = sorttime(team)
    #print (sdict)
    x = (data[team]['date'])
    #print (x)
    # NOTE(review): x is the list stored in `data`, so this removal mutates
    # data[team]['date'] in place while data[team]['pos'] keeps its length.
    # Later cells that pair the two lists by index see shifted data, and
    # re-running this cell is not idempotent. sorttime2 appears to depend on
    # this side effect - confirm before changing it.
    while 0 in x:
        x.remove(0)
    print(x)
    #print (team,x)
    # for thing in x:
    #    print (type(thing))
    x = sorted(x)
    #print (team,x)
    y = []
    for date in x:
        y.append(sdict[date])
    plt.plot(y)
    #print (team,x,y)


plt.show()

In [None]:
def sorttime2(team):
    """Return {date: [positive score, outcome]} for ``team`` from ``data``.

    Every -1 is first removed from data[team]["win"] in place; the remaining
    outcomes are then paired by position with the date and score lists.
    """
    team_record = data[team]
    outcomes = team_record["win"]
    # In-place filter: the list stored in `data` itself is modified.
    outcomes[:] = [flag for flag in outcomes if flag != -1]

    paired = {}
    for idx in range(len(team_record['date'])):
        paired[team_record['date'][idx]] = [
            team_record['pos'][idx], outcomes[idx]]
    return paired


def createlineteam(team):
    """Scatter-plot a team's positive sentiment over time, coloured by outcome.

    Uses ``sorttime2`` for the date -> [score, outcome] lookup, removes zero
    dates from data[team]['date'] in place, prints the remaining dates and
    the sorted [date, score, win] rows, then shows the scatter plot.
    """
    lookup = sorttime2(team)

    dates = data[team]['date']
    # In-place filter: the list stored in `data` itself is modified.
    dates[:] = [stamp for stamp in dates if stamp != 0]
    print(dates)

    rows = [[stamp, lookup[stamp][0], lookup[stamp][1]]
            for stamp in sorted(dates)]
    print(rows)

    frame = pd.DataFrame(rows, columns=['date', 'score', 'win'])
    frame.plot.scatter(x='date', y='score', c='win', colormap='viridis')
    plt.show()

In [None]:
#print (len(data['Patriots']['win']))
# Outcomes before plotting; createlineteam (through sorttime2) strips the -1
# entries from this list in place.
print(data['DenverBroncos']['win'])

#print (data['Patriots']['date'],data['Patriots']['win'])

createlineteam('DenverBroncos')

In [None]:
# sort time


def sorttime(team):
    """Map each of ``team``'s dates to its positive score (from ``data``).

    This repeats the definition of ``sorttime`` given in an earlier cell;
    pairs are matched by position and a repeated date keeps its last score.
    """
    return {
        game_date: data[team]['pos'][idx]
        for idx, game_date in enumerate(data[team]['date'])
    }


# Long-format rows [team, date, score] for every team, pivoted so each team
# becomes one column indexed by date.
team_dates = []
# Never filled; kept because its length is printed below.
team_sent = []
for team in data:
    sdict = sorttime(team)
    team_dates.extend([team, key, score] for key, score in sdict.items())

df = pd.DataFrame(team_dates, columns=['team', 'date', 'score']).pivot(
    index='date', columns='team', values='score')
print(df)
print(len(team_dates))
print(len(team_sent))
df.plot.line()


plt.show()


In [None]:
# Number of entries in the Broncos' total_sentiment list (presumably one per
# game - the boxplot cell iterates it that way).
print(len(data['DenverBroncos']['total_sentiment']))

In [None]:
# First entry of the Broncos' total_sentiment list.
x = data['DenverBroncos']['total_sentiment'][0]
print((x))

In [None]:
# First element inside that first entry (the boxplot cell reads a 'pos' key
# from elements at this level).
x = data['DenverBroncos']['total_sentiment'][0][0]
print((x))

In [None]:
plt.figure(figsize=(10, 5), dpi=200)
# One box per game for the first five Patriots entries: the distribution of
# the 'pos' scores stored for that game.
x = data['Patriots']['date'][:5]
box = [[comment['pos'] for comment in game_sentiments]
       for game_sentiments in data['Patriots']['total_sentiment'][:5]]
plt.boxplot(box)

# Label every box with its own game's date. Boxes sit at positions 1..N, and
# the labels stay in the same order as the boxes (a separately sorted date
# list would not line up with them).
tick_labels = x[:len(box)]
ax = plt.gca()
ax.set_xticks(range(1, len(tick_labels) + 1))
ax.set_xticklabels(tick_labels)
ax.set_xlabel("Game date")


plt.show()

In [None]:

# Per team: positive sentiment over time with horizontal reference lines at
# the average score after wins and after losses.
for team in data.keys():
    y = (data[team]['pos'])
    x = (data[team]['date'])
    # Only 1 counts as a win and only 0 as a loss; other outcome codes
    # (e.g. -1 for unknown) are left out of both averages.
    win_scores = [score for score, win in zip(y, data[team]['win'])
                  if win == 1]
    loss_scores = [score for score, win in zip(y, data[team]['win'])
                   if win == 0]

    plt.plot(x, y, label="positive sentiment")
    # Draw a reference line only when there is something to average, which
    # also avoids dividing by zero for winless/lossless teams.
    if loss_scores:
        plt.axhline(sum(loss_scores) / len(loss_scores),
                    color="red", linestyle="--", label="loss average")
    if win_scores:
        plt.axhline(sum(win_scores) / len(win_scores),
                    color="green", linestyle="--", label="win average")
    plt.gca().set_title(team)
    plt.legend(loc='best')
    plt.show()


In [None]:
# Print each team's average positive sentiment after wins (y) and after
# losses (x).
for team in data.keys():
    # Only 1 counts as a win and only 0 as a loss; other outcome codes
    # (e.g. -1 for unknown) are left out of both averages.
    win_scores = [score for score, win
                  in zip(data[team]['pos'], data[team]['win']) if win == 1]
    loss_scores = [score for score, win
                   in zip(data[team]['pos'], data[team]['win']) if win == 0]
    # NaN instead of ZeroDivisionError when a team has no wins or no losses.
    y = sum(win_scores) / len(win_scores) if win_scores else float('nan')
    x = sum(loss_scores) / len(loss_scores) if loss_scores else float('nan')
    print(y)
    print(x)

    # NOTE(review): y is a single number, so this draws one markerless point
    # per team and nothing visible appears - confirm what was meant to be
    # plotted here.
    plt.plot(y)

In [None]:
# `team` is whatever the most recently executed loop left it as.
data[team]['pos']

In [None]:
# `team` is whatever the most recently executed loop left it as.
data[team]['win']

In [None]:
# Narrow the team list to a single team for the cells below (rebinds the
# earlier five-team teams_list).
teams_list = ["DenverBroncos"]

In [None]:
# One list of per-sentence positive scores for every analysed game.
slist = []
for team in teams_list:
    team_history = get_team_history(team, depth=10)
    slist.extend(
        [scores["pos"] for scores in summary["sentiment_list"]]
        for summary in team_history
    )
# Rows are games with differing sentence counts (shorter rows are padded with
# NaN); transpose so each game becomes one column, i.e. one box.
plot = pd.DataFrame(slist).T
#plot.columns = range(1,13)
plot.boxplot()
plt.show()

In [None]:
# Print the raw text collected for each analysed thread.
# NOTE(review): this requires every summary to carry a "text" key - confirm
# that get_team_history provides it, otherwise this raises KeyError.
for team in teams_list:
    team_history = get_team_history(team, depth=10)
    text = [summary["text"] for summary in team_history]
    print(text)

In [None]:


def cleantext(text):
    """Tokenize ``text`` and return its cleaned word list.

    Steps: word-tokenize, lower-case, strip punctuation characters from each
    token, keep only purely alphabetic tokens, and drop English stop words.

    Args:
        text: String to clean.

    Returns:
        list of lower-case words in their original order.
    """
    tokens = [w.lower() for w in word_tokenize(text)]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # keep alphabetic tokens that are not stop words
    stop_words = set(stopwords.words('english'))
    return [w for w in stripped if w.isalpha() and w not in stop_words]

In [None]:
corpus = []
target = []
name = []
# For each analysed thread, add its cleaned word list to the corpus and the
# team it belongs to to the target list (the two lists stay aligned by index).
for team in teams_list:
    team_history = get_team_history(team, depth=10)
    for summary in team_history:
        text = summary["text"]
        text = cleantext(text)
        corpus.append(text)
        target.append(team)

In [None]:
# Each corpus entry is a list of cleaned tokens, so count the tokens inside
# every entry; using the entries themselves as dict keys fails because lists
# are unhashable.
totalVocabUse = Counter(
    token for document in corpus for token in document)
# remove the words that occur only once
totalVocabUse = {word: count for word,
                 count in totalVocabUse.items() if count != 1}

# make the keys into a sorted list so it stays in order when we iterate through them
vocabList = sorted(totalVocabUse)

In [None]:
# Binary bag-of-words vector per corpus entry: 1 when the vocabulary word
# occurs in the entry, else 0.
sentence_vectors = []
for sentence in corpus:
    # Corpus entries are already cleaned token lists, so they are used
    # directly instead of being passed through cleantext again (which
    # expects a string). A set makes the membership tests cheap.
    sentence_tokens = set(sentence)
    # Iterate the sorted vocabList so every vector has the same column order.
    sent_vec = [1 if token in sentence_tokens else 0 for token in vocabList]
    sentence_vectors.append(sent_vec)

In [None]:
# do TFIDF

In [None]:
def get_win_loss_sentiment(team):
    """Average positive sentiment for ``team`` after wins, losses and overall.

    Reads data[team]["win"] and data[team]["pos"], paired by position. Only
    outcomes 1 (win) and 0 (loss) are counted; any other code is ignored.

    Returns:
        (avg_win_sent, avg_loss_sent, avg_total_sent). An average with no
        contributing games is NaN rather than raising ZeroDivisionError.
    """
    win_sentiment = []
    loss_sentiment = []
    for i, outcome in enumerate(data[team]["win"]):
        if outcome == 1:
            win_sentiment.append(data[team]["pos"][i])
        elif outcome == 0:
            loss_sentiment.append(data[team]["pos"][i])

    nan = float('nan')
    decided_games = len(win_sentiment) + len(loss_sentiment)
    avg_win_sent = (sum(win_sentiment) / len(win_sentiment)
                    if win_sentiment else nan)
    avg_loss_sent = (sum(loss_sentiment) / len(loss_sentiment)
                     if loss_sentiment else nan)
    avg_total_sent = ((sum(win_sentiment) + sum(loss_sentiment)) / decided_games
                      if decided_games else nan)
    return avg_win_sent, avg_loss_sent, avg_total_sent


# One row per team with its win / loss / overall average positive sentiment,
# printed and shown as a grouped bar chart.
team_name = list(data.keys())
team_data = [list(get_win_loss_sentiment(team)) for team in team_name]
df = pd.DataFrame(team_data, index=team_name)
df.columns = ['Win Avg', 'Lose Avg', 'Avg']
print(df)
df.plot.bar()

#plt.ylabel('Monetary Value')
# plt.gca().yaxis.set_major_formatter(ticker.FormatStrFormatter('$%.2f'))
# plt.gca().xaxis.set_tick_params(rotation=0)

In [None]:

# Stand-alone example of a grouped bar chart: per-column mean, min and max.
group_labels = ['views', 'orders']
groups = [[23, 135, 3], [123, 500, 1]]

# One column per label, one row per observation.
df = pd.DataFrame(dict(zip(group_labels, groups)))

# Plot.
pd.DataFrame({
    'average': df.mean(),
    'min': df.min(),
    'max': df.max(),
}).plot.bar()


In [None]:
# Display the Patriots' outcome list as currently held in `data`.
(data['Patriots']["win"])

Link for learning how to iterate through comments: https://praw.readthedocs.io/en/stable/tutorials/comments.html