In [1]:
import pickle
import numpy as np
import re
from utils import clean
from gensim.models import Word2Vec
import codecs, json

In [2]:
import pandas as pd
from utils import clean, split_on_caps, TOP_100_PLAYERS

In [3]:
docs_raw_filename = "docs_raw_saturday2"
with open (docs_raw_filename, 'rb') as fp:
    docs = pickle.load(fp)
    
article_dicts_filename = "article_dicts_saturday2"
all_articles = []
with codecs.open(article_dicts_filename,'rU','utf-8') as f:
    for line in f:
        all_articles.append(json.loads(line))

In [4]:
from nltk import sent_tokenize #### rememeber this
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

In [5]:
def clean_sentence(sentence, title=False):
    if title:
        words = [word for word in sentence.split() if word.isalpha()]
    else:
        words = [word for word in word_tokenize(sentence) if word.isalpha()]
    
    words = [w.lower() for w in words]

    return " ".join(words)

In [6]:
with open("data/coaches.txt", "r") as fp:
    coaches = [team.replace("\n", "").strip().lower() for team in fp.readlines()]
    
with open("data/teams.txt", "r") as fp:
    teams = [team.replace("\n", "").strip().lower() for team in fp.readlines()]
    
df = pd.read_csv("data/cities_teams.csv", header=None)

cities = [v.lower() for v in df[0].values]
team_names = [v.lower().strip() for v in df[1].values]

players_first_names, players_last_names = zip(*[p.split() for p in TOP_100_PLAYERS])

name_dict = {}
for p in TOP_100_PLAYERS:
    name_dict[p.split()[0]] = p
    name_dict[p.split()[1]] = p

name_dict["james"] = "lebron james"

In [7]:
def clean_doc(text, title=False):
    text = clean_sentence(text, title)
    
    common = {"bron": "lebron_james", "kd": "kevin_durant", "steph": "stephen_curry", "russ": "russell_westbrook"}
    
    original_text = " ".join([word.lower() for word in text.split() if word.isalpha()])
    text = original_text # for modification
    
    for fn in players_first_names:
        wholename = name_dict[fn]
        if fn in text and wholename in original_text: # if first name and entire name is somewhere in text
            text = text.replace(wholename, "_".join(wholename.split())) # join wholename with underscore
            text = text.replace(" " + fn + " ", " " + "_".join(wholename.split()) + " ") #join the first name
    
    for ln in players_last_names:
        wholename = name_dict[ln]
        if ln in text and wholename in original_text: # if first name and entire name is somewhere in text
            text = text.replace(wholename, "_".join(wholename.split())) # join wholename with underscore
            text = text.replace(" " + ln + " ", " " + "_".join(wholename.split()) + " ") #join the first name

    for team in teams:
        if team in text:
            text = text.replace(" " + team + " ", " " + "_".join(team.split()) + " ")

    for city in cities:
        if city in text:
            text = text.replace(" " + city + " ", " " + "_".join(city.split()) + " ")

    for name in team_names:
        if name in text:
            text = text.replace(" " + name + " ", " " + "_".join(name.split()) + " ")
            
    for coach in coaches:
        last_name = coach.split()[1]
        if coach in text:
            text = text.replace(" " + coach + " ", " " + "_".join(coach.split()) + " ")
            text = text.replace(" " + last_name + " ", " " + "_".join(coach.split()) + " ")

    
    for acr, real in common.items():
        text = text.replace(" " + acr + " ", " " + real + " ")
    
    return text

In [8]:
%%time
all_sentences = []
for doc in docs:
    all_sentences += [clean_doc(sentence) for sentence in sent_tokenize(doc)]
    
all_sentences = [s.split() for s in all_sentences]
len(all_sentences)

CPU times: user 28.1 s, sys: 180 ms, total: 28.2 s
Wall time: 28.5 s


In [16]:
%%time
model100 = Word2Vec(size=100)
model100.build_vocab(all_sentences)
model100.train(all_sentences, total_examples=model.corpus_count, epochs=50)

CPU times: user 2min 53s, sys: 1.28 s, total: 2min 55s
Wall time: 1min 4s


In [17]:
model100.wv.doesnt_match('houston_rockets utah_jazz golden_state_warriors orlando_magic'.split())

'orlando_magic'

In [18]:
model100.wv.doesnt_match('james_harden chris_paul cappella stephen_curry'.split())

'stephen_curry'

In [19]:
model100.wv.most_similar('lebron_james')  

[('james', 0.7489138245582581),
 ('lebron', 0.6486578583717346),
 ('isaiah_thomas', 0.6015753149986267),
 ('kevin_love', 0.5609138011932373),
 ('kyrie_irving', 0.5215520262718201),
 ('cavaliers', 0.5125935077667236),
 ('cleveland', 0.5111081004142761),
 ('russell_westbrook', 0.5105723142623901),
 ('kanter', 0.508220374584198),
 ('stephen_curry', 0.5009845495223999)]

In [9]:
%%time
model = Word2Vec(size=30)
model.build_vocab(all_sentences)
model.train(all_sentences, total_examples=model.corpus_count, epochs=50)

CPU times: user 2min 34s, sys: 1.02 s, total: 2min 35s
Wall time: 56.3 s


In [13]:
model.wv.doesnt_match('houston_rockets utah_jazz golden_state_warriors orlando_magic'.split())

'orlando_magic'

In [15]:
model.wv.doesnt_match('james_harden chris_paul cappella stephen_curry'.split())

'stephen_curry'

In [10]:
s = model.wv.doesnt_match('thunder westbrook adams kevin_love'.split())
print(s)
model.wv.most_similar('lebron_james')  

kevin_love


[('james', 0.8426184058189392),
 ('lebron', 0.8130149245262146),
 ('kevin_love', 0.7719298601150513),
 ('kyrie_irving', 0.7388380169868469),
 ('isaiah_thomas', 0.711985170841217),
 ('cleveland', 0.7017042636871338),
 ('cleveland_cavaliers', 0.6924428939819336),
 ('nance', 0.6859879493713379),
 ('stephen_curry', 0.6836892366409302),
 ('chris_paul', 0.6620880365371704)]

In [72]:
titles = [clean_doc(article['title'], title=True) for article in all_articles if article['title']]
random_titles = np.random.choice(titles, 20)

In [83]:
def get_average(model, title):
    a = np.zeros(30)
    count = 0
    for word in title.split():
        try:
            a += model.wv[word]
            count += 1
        except:
            continue
    
    return a / count

vectorized_titles = [get_average(model, t) for t in titles]

In [149]:
def get_nearest(i, all_vectors):
    nearest = None
    dist = float('Inf')
    point = all_vectors[i]
    for j, v in enumerate(all_vectors):
        if np.linalg.norm(v - point) == 0:
            continue
        d = np.linalg.norm(v - point)
        if d < dist:
            dist = d
            nearest_index = j
    
    return nearest_index

def find_group_of_3_neighbors(word2vec_model, i1):
    i2 = get_nearest(i1, vectorized_titles)
    
    best_dist, i3 = float("Inf"), None
    for j,v in enumerate(vectorized_titles):
        if np.linalg.norm(vectorized_titles[j] - vectorized_titles[i1]) == 0 or np.linalg.norm(vectorized_titles[j] - vectorized_titles[i2]) == 0:
            continue
        d = np.linalg.norm(vectorized_titles[j] - vectorized_titles[i1])
        d = np.linalg.norm(vectorized_titles[j] - vectorized_titles[i2])
        if d < best_dist:
            best_dist = d
            i3 = j
                
    return [titles[i] for i in [i1, i2, i3]]

In [155]:
for n in np.random.choice(range(len(titles)), 20):
    print(find_group_of_3_neighbors(model, n))
    print()

['nfl draft odds and predictions for top teams to watch', 'nba mock projections and predictions for top prospects', 'nba mock draft projections and stock watch for top prospects']

['knicks coach jeff hornacek first shoved joakim noah during altercation', 'david david_fizdale reportedly turned down suns hc job before knicks hiring', 'david david_fizdale reportedly favorite for hornets hc job after steve clifford firing']

['derrick nba future is officially in jeopardy', 'giannis brother kostas reportedly declaring for nba draft', 'mo bamba working watching film with joel_embiid before nba draft']

['james_harden licked his lips after dropping wesley hitting', 'jordan poole was for ball before hitting houston', 'kyle kuzma says isaiah_thomas was at cavaliers bench after every score']

['nba playoff schedule known dates for each round of bracket', 'nba playoff schedule known dates for bracket', 'nba playoff schedule updated bracket guide and postseason dates']

['isaiah_thomas on scoring

In [None]:
# # persist the model to a disk
# model.save(fname)
# model = Word2Vec.load(fname)