
CS 410/510: NLP Final Project
Students: Patrick Niyongabo, Robert Handy, Hanin Alshalan

## Soccer Events Summarization and Results Prediction

### 0. Introdution and Imports

In [1]:
import nltk
import matplotlib.pyplot as plot 
import collections

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1. Data Processing

In [3]:
import pandas as pd
import io

path = '/content/drive/MyDrive/ColabData/kaggle-world-cup-2018-tweets.csv'
# filename = 'kaggle-world-cup-2018-tweets.csv'
df = pd.read_csv(path)
print(df.shape[0])
print(type(df))

# df = pd.read_csv(io.StringIO(uploaded['kaggle-world-cup-2018-tweets.csv'].decode('utf-8')))
# df = pd.read_csv(io.StringIO('kaggle-world-cup-2018-tweets.csv').decode('utf-8')))
# df

530000
<class 'pandas.core.frame.DataFrame'>



1.1 Remove Low Quality Data

In [4]:
# Kinda just putting a couple filtering things here. It should probably get combined and cleaned up and inplace=True
# only_en = df.drop(df[df['lang'] != 'en'].index, inplace = False)
# longer_than_20 = df.drop(df[df['len'] < 20].index, inplace = False)

# stringify and lower case everything 
df["Tweet"] = df["Tweet"].str.lower()
print(df.shape[0])

# remove duplicate tweets
df.drop_duplicates(subset='Tweet', keep="last", inplace=True)
print(df.shape[0])

# filter out short tweets (less than 60 chars)
df['Tweet'] = df['Tweet'].astype('str')
dd = df[df['Tweet'].apply(lambda x: len(x)>60)]
print(dd.shape[0])

530000
113073
62779


In [5]:
# create dictionary mapping games to tweets based on timestamps & hashtags
import json
json_file = open('/content/drive/MyDrive/ColabData/cupfinals.json') 
world_cup_games = json.load(json_file) 
#print(world_cup_games["URUPOR"]["finalscore"])
#print(type(world_cup_games))
world_cup_games["URUPOR"]["tweets"] = ["test tweet1", "test tweet2"]
#print(world_cup_games["URUPOR"])

# create dictionary mapping tweets by day
tweets_by_date = {}
for index, row in dd.iterrows():
  gametime = row["Date"]
  gamedate = gametime.split(" ")[0]
  if gamedate in tweets_by_date:
    tweets_by_date[gamedate].append(row['Tweet'])
  else:
    tweets_by_date[gamedate] = [row['Tweet']]
print(len(tweets_by_date["2018-07-15"]))  

13357


In [6]:
# print(len(tweets_by_date["2018-07-06"]))  # KeyError
# print(len(tweets_by_date["2018-07-05"])) # KeyError
sum_of_tweets = 0
for key in tweets_by_date:  # there are missing dates. i.e.: no tweets in our data even though games happened
  print(key + " : "+ str(len(tweets_by_date[key])))
  sum_of_tweets += len(tweets_by_date[key])
print(sum_of_tweets)                      

2018-07-02 : 1011
2018-07-01 : 13831
2018-07-03 : 2243
2018-07-04 : 1946
2018-06-30 : 9353
2018-06-29 : 134
2018-07-10 : 11812
2018-07-11 : 9092
2018-07-15 : 13357
62779


In [7]:
# assign tweets to a game if date of tweet match date of game
# more filtering can be added. example: match hashtag in tweet with match hashtag
for game in world_cup_games:
  gametime = world_cup_games[game]["starttime"]
  gamedate = gametime.split(" ")[0]
  print(game, gamedate)
  if gamedate in tweets_by_date:
    world_cup_games[game]["tweets"] = tweets_by_date[gamedate]
# print(len(world_cup_games["FRACRO"]["tweets"]))
# print(len(world_cup_games["CROENG"]["tweets"]))
assert len(tweets_by_date["2018-07-15"]) == len(world_cup_games["FRACRO"]["tweets"])
assert len(tweets_by_date["2018-07-11"]) == len(world_cup_games["CROENG"]["tweets"])

URUPOR 2018-06-30
FRAARG 2018-06-30
ESPRUS 2018-07-01
CRODEN 2018-07-01
BRAMEX 2018-07-02
BELJAP 2018-07-02
SWESUI 2018-07-03
COLENG 2018-07-03
URUFRA 2018-07-06
BRABEL 2018-07-06
RUSCRO 2018-07-07
SWEENG 2018-07-07
FRABEL 2018-07-10
CROENG 2018-07-11
BELENG 2018-07-14
FRACRO 2018-07-15


In [8]:
all_FRACRO_tweets = ". ".join(world_cup_games["FRACRO"]["tweets"])
all_CROENG_tweets = ". ".join(world_cup_games["CROENG"]["tweets"])
all_BRAMEX_tweets = ". ".join(world_cup_games["BRAMEX"]["tweets"])
all_BELJAP_tweets = ". ".join(world_cup_games["BELJAP"]["tweets"])
all_SWESUI_tweets = ". ".join(world_cup_games["SWESUI"]["tweets"])
all_COLENG_tweets = ". ".join(world_cup_games["COLENG"]["tweets"])
all_BEFORE_tweets = ". ".join(tweets_by_date["2018-06-29"])
print("length of all_FRACRO_tweets = " + str(len(all_FRACRO_tweets)))
print("length of all_CROENG_tweets = " + str(len(all_CROENG_tweets)))
print("length of all_BRAMEX_tweets = " + str(len(all_BRAMEX_tweets)))
print("length of all_BELJAP_tweets = " + str(len(all_BELJAP_tweets)))
print("length of all_SWESUI_tweets = " + str(len(all_SWESUI_tweets)))
print("length of all_COLENG_tweets = " + str(len(all_COLENG_tweets)))
print("length of all_BEFORE_tweets = " + str(len(all_BEFORE_tweets)))

length of all_FRACRO_tweets = 1500569
length of all_CROENG_tweets = 969233
length of all_BRAMEX_tweets = 112269
length of all_BELJAP_tweets = 112269
length of all_SWESUI_tweets = 247862
length of all_COLENG_tweets = 247862
length of all_BEFORE_tweets = 13044


### 2. Events Summarization

In [9]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from collections import defaultdict 
from textblob import TextBlob

stopwords = list(STOP_WORDS)
nlp = spacy.load('en')
nlp.max_length = 1600000 # don't increase this or program will crash

def events_score(sent):
  score = 0
  if "score" in sent or "goal" in sent:
    score += 2
  if "red card" in sent or "penal" in sent or "pk" in sent:
    score += 2
  if "yellow card" in sent or "freekick" in sent or "booked" in sent or "booking" in sent:
    score += 2
  if "foul" in sent or "sub" in sent:
    score += 2
  if "half" in sent or "full" in sent or "over" in sent or "extra" in sent or "ht" in sent or "ft" in sent:
    score += 2
  return score 


# Place All As A Function For Reuseability
def text_summarizer(raw_docx):
    raw_text = raw_docx
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)
    # Build Word Frequency
    # word.text is tokenization in spacy
    word_frequencies = defaultdict(int)  
    for word in docx:  
      if word.text not in stopwords:
        word_frequencies[word.text] += 1


    maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies.keys():  
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
    # Sentence Tokens
    sentence_list = [ sentence for sentence in docx.sents ]

    # Calculate Sentence Score and Ranking
    sentence_scores = defaultdict(int)
    for sent in sentence_list:
      sentence = sent.text.lower()
      if len(sentence.split(' ')) < 20 and abs(TextBlob(sentence).sentiment.polarity) < 0.5:
        sentence_scores[sent] = events_score(sentence) 
        for word in sent:
          if word.text.lower() in word_frequencies.keys():
            sentence_scores[sent] += word_frequencies[word.text.lower()]

    # Find N Largest
    summary_sentences = nlargest(5, sentence_scores, key=sentence_scores.get)
    final_sentences = [ w.text for w in summary_sentences ]
    summary = '\n'.join(final_sentences)
    #print("Original Document\n")
    #print(raw_docx)
    #print("Total Length:",len(raw_docx))
    #print('\n\nSummarized Document\n')
    #print("Summary Length:",len(summary))
    #print("Summary: " + summary)
    return summary

### 3. Summarization Results

In [10]:
# text_summarizer(all_BEFORE_tweets)
print("Summary of BRAMEX: " + text_summarizer(all_BRAMEX_tweets))
print("Summary of COLENG: " + text_summarizer(all_COLENG_tweets))
print("Summary of CROENG: " + text_summarizer(all_CROENG_tweets))
print("Summary of FRACRO: " + text_summarizer(all_FRACRO_tweets))

Summary of BRAMEX: croatia denmark on penalties ivan rakitic scores winning spot kick as croatia reach last eight.
beats champions in tense penalty shoot out with final scoreline of http.
goal jorgensen pokes in shot past subasic after scramble from throw in denmark lead inside mins here.
subasic is only the second goalkeeper to save three penalties in shootout at the.
fifa world cup beat in the penalty shoot out after suspenseful draw.
Summary of COLENG: england vs colombia eric dier says he needed to score winning penalty after missing chance in extra time.
goallll harry kane puts the penalty kick straight down the middle cool as cucumber england lead colombia kane wi.
only two england goalkeepers have saved penalty in shootout david seaman jordan pickford ht.
goal carlos sanchez fouls harry kane from corner and the captain takes care of the penalty em.
eric dier says he needed to score winning penalty after missing chance in extra ti.
Summary of CROENG: ffs can not handle extra time

In [11]:
sett = "that pernicious foul smell permeating through twitter is shitting himself after that goal three lions one."
TextBlob(sett).sentiment.polarity

0.0