In [1]:
import pandas as pd
import pprint
import urllib
import requests
pp = pprint.PrettyPrinter(indent=4)
from yanytapi import SearchAPI
from gensim.similarities.index import AnnoyIndexer
from gensim.models import Word2Vec
from mittens import GloVe, Mittens
from gensim.matutils import corpus2csc
from gensim.corpora import Dictionary
from collections import defaultdict
import numpy as np
import csv
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import os
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from gensim.parsing.preprocessing import remove_stopwords
#api = SearchAPI("TjGk9kxFO9ScvfSF8AfeqkXjjujBnz6e")
% matplotlib inline
from gensim.test.utils import common_texts, get_tmpfile

UsageError: Line magic function `%` not found.


In [2]:
def get_df(years=None):
    if not years:
        return pd.concat([pd.read_csv('articles/' + file) for file in os.listdir('articles') if 'articles' in file], ignore_index=True, sort=False)
    else:
        return pd.concat([pd.read_csv('articles/' + csv_name(year)) for year in years], ignore_index=True, sort=False)

def get_sentence_list(years=None, only_weed=False):
    df = get_df(years=years)
    df['text'] = df['text'].map(process_strings)
    df['text'] = df['text'].map(remove_stopwords)
    if only_weed:
        paragraphs = [remove_waste(p.split()) for p in df['text'] if 'marijuana' in p or 'cannabis' in p]
    else:
        paragraphs = [remove_waste(p.split()) for p in df['text']]
    return paragraphs

def get_list(years=None, only_weed=False):
    return [item for sublist in get_sentence_list(years, only_weed) for item in sublist]

def get_sentence(years=None):
    split_it = get_list(years)
    return ' '.join(split_it)

def process_strings(s):
    s = s.lower()
    s = s.replace("’","'")
    s = s.replace("'s","")
    bad_chars = ".;:''?!,\[]”“()\""
    for char in bad_chars:
        s = s.replace(char, "")
    if len(s) > 0 and s[0] == '$':
        return '$'
    return s
    
def csv_name(year):
    return 'articles-' + str(year) + '.csv'

def remove_waste(sentence):
    wasted_words = ['—', '&', '-']
    return [word for word in sentence if word not in wasted_words]

def co_occurrence(df, window=5):
    print("co-occurrence")
    sentences = [remove_waste(list(map(lambda s : process_strings(s), p.split()))) for p in df['text']]
    d = dict()
    for sentence in sentences:
        for i in range(len(sentence)):
            if sentence[i] not in d:
                d[sentence[i]] = defaultdict(int)
            for j in range(-window, window):
                if i+j >= 0 and i+j < len(sentence) and i != j: 
                    d[sentence[i]][sentence[i+j]] += 1
    return d

def trim_d(d):
    print("trimming")
    vocab = list(d.keys())
    print(len(vocab))
    for word in d:
        if sum([v for k, v in dict(d[word]).items()]) < 100:
            vocab.remove(word)
    print(len(vocab))
    return {k:d[k] for k in vocab}

def d_to_matrix(d):
    print("matrixing")
    vocab = list(d.keys())
    matrix = np.zeros((len(vocab), len(vocab)))
    for i in range(len(vocab)):
        for j in range(len(vocab)):
            matrix[i][j] = d[vocab[i]][vocab[j]]
    return vocab, matrix

def generate_embeddings(df):
    d = co_occurrence(df)
    trimmed = trim_d(d)
    vocab, cooccurrence = d_to_matrix(d)
    glove_model = GloVe(n=25, max_iter=100)
    embeddings = glove_model.fit(cooccurrence)
    return vocab, embeddings

def glove2dict(glove_filename):
    with open(glove_filename, encoding="utf8") as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed


def display_closestwords_tsnescatterplot(model, word):
    
    arr = np.empty((0,100), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)
    plt.show()

def get_model_name(years):
    return "w2v_embeddings/%d-%d.model" % (years[0], years[-1])

def generate_w2v(years):
    path = "w2v_embeddings/%d-%d.model" % (years[0], years[-1])
    sentences = get_sentence_list(years=years)
    model = Word2Vec(sentences,min_count=10)
    model.save(path)

def display_highlights(model):
    if type(model) == range or type(model) == list:
        model = Word2Vec.load(get_model_name(model))
    words = list(model.wv.vocab)
    indexer = AnnoyIndexer(model, 2)
    print(model.wv.most_similar("marijuana", topn=7, indexer=indexer))
    print(model.wv.most_similar("cannabis", topn=7, indexer=indexer))
    display_closestwords_tsnescatterplot(model, "marijuana")

# 2010s

# Comprehensive

In [8]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = 'glove_embeddings/embeddings-00.csv'
tmp_file = get_tmpfile("test_word2vec.txt")
glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

In [10]:
display_highlights(model)

  index = AnnoyIndex(num_features)


KeyError: "word 'marijuana' not in vocabulary"