In [16]:
import pandas as pd
import pprint
import urllib
import requests
pp = pprint.PrettyPrinter(indent=4)
from yanytapi import SearchAPI
from gensim.similarities.index import AnnoyIndexer
from gensim.models import Word2Vec
from mittens import GloVe, Mittens
from gensim.matutils import corpus2csc
from gensim.corpora import Dictionary
from collections import defaultdict
import numpy as np
import csv
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
#api = SearchAPI("TjGk9kxFO9ScvfSF8AfeqkXjjujBnz6e")

In [14]:
def process_strings(s):
    s = s.lower().replace(".", "").replace("'s","").replace("?","").replace("!","").replace(",", "").replace(";", "").replace("\"", "").replace("”", "").replace("“", "").replace("(", "").replace(")", "")
    if len(s) > 0 and s[0] == '$':
        return '$'
    return s
    
def csv_name(year):
    return 'articles-' + str(year) + '.csv'

def remove_waste(sentence):
    wasted_words = ['—', '&']
    return [word for word in sentence if word not in wasted_words]

def co_occurrence(df, window=5):
    print("co-occurrence")
    sentences = [remove_waste(list(map(lambda s : process_strings(s), p.split()))) for p in df['text']]
    d = dict()
    for sentence in sentences:
        for i in range(len(sentence)):
            if sentence[i] not in d:
                d[sentence[i]] = defaultdict(int)
            for j in range(-window, window):
                if i+j >= 0 and i+j < len(sentence) and i != j: 
                    d[sentence[i]][sentence[i+j]] += 1
    return d

def trim_d(d):
    print("trimming")
    vocab = list(d.keys())
    print(len(vocab))
    for word in d:
        if sum([v for k, v in dict(d[word]).items()]) < 100:
            vocab.remove(word)
    print(len(vocab))
    return {k:d[k] for k in vocab}

def d_to_matrix(d):
    print("matrixing")
    vocab = list(d.keys())
    matrix = np.zeros((len(vocab), len(vocab)))
    for i in range(len(vocab)):
        for j in range(len(vocab)):
            matrix[i][j] = d[vocab[i]][vocab[j]]
    return vocab, matrix

def generate_embeddings(df):
    d = co_occurrence(df)
    trimmed = trim_d(d)
    vocab, cooccurrence = d_to_matrix(d)
    glove_model = GloVe(n=25, max_iter=100)
    embeddings = glove_model.fit(cooccurrence)
    return vocab, embeddings

def glove2dict(glove_filename):
    with open(glove_filename, encoding="utf8") as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed



In [4]:
df = pd.concat([pd.read_csv(csv_name(year)) for year in [2019, 2018, 2017]], ignore_index=True, sort=False)
d = co_occurrence(df)
d = trim_d(d)
vocab, cooccurrence = d_to_matrix(d)

co-occurrence
trimming
90557
15798
matrixing


In [17]:
original_embedding = glove2dict('./glove.6B/glove.6B.50d.txt')
print("loaded original embeddings")


loaded original embeddings


Iteration 1000: error 48720.1756

NameError: name 'pickle' is not defined

In [None]:
mittens_model = Mittens(n=50, max_iter=1000)
# Note: n must match the original embedding dimension
new_embeddings = mittens_model.fit(
    cooccurrence,
    vocab=vocab,
    initial_embedding_dict=original_embedding)
np.savetxt('embeddings.csv', new_embeddings, delimiter=',')
with open("vocab.txt", "wb") as fp:
    pickle.dump(vocab, fp)

In [22]:
len(vocab)

15798

# 2010s

In [19]:
df = pd.concat([pd.read_csv(csv_name(year)) for year in [2019, 2018, 2017]], ignore_index=True, sort=False)
sentences = [list(map(lambda s : process_strings(s), p.split())) for p in df['text']]
model = Word2Vec(sentences)
print(model)
words = list(model.wv.vocab)
indexer = AnnoyIndexer(model, 2)
print(model.most_similar("marijuana", topn=7, indexer=indexer))
print(model.most_similar("cannabis", topn=7, indexer=indexer))
X = model.wv[model.wv.vocab]
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.show()


Word2Vec(vocab=28697, size=100, alpha=0.025)
[('marijuana', 1.0), ('cannabis', 0.7156508564949036), ('drug:', 0.6136324107646942), ('recreational', 0.5932257771492004), ('hemp', 0.5360226631164551), ('tobacco', 0.5102111399173737), ('juul', 0.4959568381309509)]
[('cannabis', 1.0), ('marijuana', 0.715650886297226), ('hemp', 0.677280455827713), ('drug:', 0.6450612246990204), ('tobacco', 0.6384859085083008), ('e-cigarette', 0.6166323721408844), ('recreational', 0.5828753709793091)]


  import sys
  


# 2000s

In [23]:
df = pd.concat([pd.read_csv(csv_name(year)) for year in [2009, 2008, 2007]], ignore_index=True, sort=False)
d = co_occurrence(df)
d = trim_d(d)
vocab, cooccurrence = d_to_matrix(d)

co-occurrence
trimming
77634
13339
matrixing


In [24]:
mittens_model = Mittens(n=50, max_iter=1000)
# Note: n must match the original embedding dimension
new_embeddings = mittens_model.fit(
    cooccurrence,
    vocab=vocab,
    initial_embedding_dict=original_embedding)
np.savetxt('embeddings-00.csv', new_embeddings, delimiter=',')
with open("vocab-00.txt", "wb") as fp:
    pickle.dump(vocab, fp)

Iteration 1000: error 36949.1956

# 1990s

In [25]:
df = pd.concat([pd.read_csv(csv_name(year)) for year in [1999, 1998, 1997]], ignore_index=True, sort=False)
d = co_occurrence(df)
d = trim_d(d)
vocab, cooccurrence = d_to_matrix(d)

co-occurrence
trimming
59774
8958
matrixing


In [26]:
mittens_model = Mittens(n=50, max_iter=1000)
# Note: n must match the original embedding dimension
new_embeddings = mittens_model.fit(
    cooccurrence,
    vocab=vocab,
    initial_embedding_dict=original_embedding)
np.savetxt('embeddings-90.csv', new_embeddings, delimiter=',')
with open("vocab-90.txt", "wb") as fp:
    pickle.dump(vocab, fp)

Iteration 1000: error 18650.0542

# 1980s

In [27]:
df = pd.concat([pd.read_csv(csv_name(year)) for year in [1989, 1988, 1987]], ignore_index=True, sort=False)
d = co_occurrence(df)
d = trim_d(d)
vocab, cooccurrence = d_to_matrix(d)

co-occurrence
trimming
53220
7809
matrixing


In [28]:
mittens_model = Mittens(n=50, max_iter=1000)
# Note: n must match the original embedding dimension
new_embeddings = mittens_model.fit(
    cooccurrence,
    vocab=vocab,
    initial_embedding_dict=original_embedding)
np.savetxt('embeddings-80.csv', new_embeddings, delimiter=',')
with open("vocab-80.txt", "wb") as fp:
    pickle.dump(vocab, fp)

Iteration 1000: error 15288.0125