## Import libraries

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk.data
import nltk
import glob
import os
import re
import string
import pandas as pd
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import numpy as np
from sklearn import preprocessing
from numpy import dot
from numpy.linalg import norm
from pyemd import emd
from sklearn.metrics.pairwise import cosine_similarity

## Compile all .txt file data into a dataframe

In [2]:
# create a list of all the files I want to grab
cleantxt_list = glob.glob("Austin_Transcripts/*_c.txt")

# initiate a dictionary
text_dict = {}

# compile df - Jeremy magic
date_list, text_list = list(), list()
for filename in cleantxt_list:
    with open(filename, 'r') as f:
        data = f.read()
    date = filename.split("/")[1][:-6]
    for element in data.split('_'):
        text_list.append(element)
        date_list.append(date)

In [3]:
text_dict = {'date': date_list, 'text': text_list}
df = pd.DataFrame(text_dict)

In [4]:
df.head()

Unnamed: 0,date,text
0,2015-03-26,Mayor pro tem I had a que stion for Mr Mckinn...
1,2015-03-26,There are 20 that are signed up for the next ...
2,2015-03-26,Okay The other thing is Im going to do everyt...
3,2015-03-26,Thanks Hello My name is Jacquie benastante an...
4,2015-03-26,Good afternoon Three of us are here today and...


In [5]:
#df.to_csv("speakers_df.csv")

In [5]:
# tokeninze text column
df['tokenized_text'] = df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

In [11]:
df.head()

Unnamed: 0,date,text,tokenized_text
0,2015-03-26,Mayor pro tem I had a que stion for Mr Mckinn...,"[Mayor, pro, tem, I, had, a, que, stion, for, ..."
1,2015-03-26,There are 20 that are signed up for the next ...,"[There, are, 20, that, are, signed, up, for, t..."
2,2015-03-26,Okay The other thing is Im going to do everyt...,"[Okay, The, other, thing, is, Im, going, to, d..."
3,2015-03-26,Thanks Hello My name is Jacquie benastante an...,"[Thanks, Hello, My, name, is, Jacquie, benasta..."
4,2015-03-26,Good afternoon Three of us are here today and...,"[Good, afternoon, Three, of, us, are, here, to..."


### Vectorize Text

In [6]:
# Import pre-trained Google News model
filename = 'GoogleNews-vectors-negative300.bin'
emb_model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [7]:
## Make a function that will convert lists to vectors
def get_vectors(list): 
  #For word in list, get embedding, return all words in list
  vectors = []
  for word in list: 
    try:
      vector = emb_model[word]
      vectors.append(vector)

    except KeyError: 
      pass

  return vectors


In [8]:
## Apply the function to the tokenized text, i.e., tokenized text to vectorized text
df['vectors'] = df["tokenized_text"].apply(get_vectors)

In [16]:
df.head()

Unnamed: 0,date,text,tokenized_text,vectors
0,2015-03-26,Mayor pro tem I had a que stion for Mr Mckinn...,"[Mayor, pro, tem, I, had, a, que, stion, for, ...","[[-0.07763672, -0.12792969, 0.103515625, -0.33..."
1,2015-03-26,There are 20 that are signed up for the next ...,"[There, are, 20, that, are, signed, up, for, t...","[[-0.111328125, 0.14355469, 0.18945312, -0.161..."
2,2015-03-26,Okay The other thing is Im going to do everyt...,"[Okay, The, other, thing, is, Im, going, to, d...","[[0.09375, -0.07470703, 0.2109375, 0.22167969,..."
3,2015-03-26,Thanks Hello My name is Jacquie benastante an...,"[Thanks, Hello, My, name, is, Jacquie, benasta...","[[-0.3046875, 0.20703125, -0.00390625, 0.16503..."
4,2015-03-26,Good afternoon Three of us are here today and...,"[Good, afternoon, Three, of, us, are, here, to...","[[-0.10888672, -0.07470703, -0.045410156, -0.0..."


### Normalize vectors
#### Above, we have one vector per word stored in an array. we need to average these so we have one vector per row/sentence/utterance

In [9]:
# Make a function to sum vectors
def sum_vectors(vector_list):
    return np.sum(vector_list, axis=0)

In [10]:
# Apply the function to column "vectors"
df['vector_sum'] = df["vectors"].apply(sum_vectors)

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.


In [11]:
# Make a function to count the length of the vector (roughly how many word in that sentence/utterance)
def list_length(vector_list):
    return len(vector_list)

In [12]:
# Apply the function to column vectors and make the output a column
df["vector_length"] = df["vectors"].apply(list_length)

In [13]:
# Get average by dividing sum by length
df["vector_average"] = df["vector_sum"]/df["vector_length"]

In [14]:
df.head()

Unnamed: 0,date,text,tokenized_text,vectors,vector_sum,vector_length,vector_average
0,2015-03-26,Mayor pro tem I had a que stion for Mr Mckinn...,"[Mayor, pro, tem, I, had, a, que, stion, for, ...","[[-0.07763672, -0.12792969, 0.103515625, -0.33...","[3.370697, 2.1581726, 2.4202385, 9.209576, -5....",100,"[0.03370697, 0.021581726, 0.024202384, 0.09209..."
1,2015-03-26,There are 20 that are signed up for the next ...,"[There, are, 20, that, are, signed, up, for, t...","[[-0.111328125, 0.14355469, 0.18945312, -0.161...","[2.2872353, 3.21389, 0.009590149, 9.508865, -8...",94,"[0.02433229, 0.03419032, 0.00010202286, 0.1011..."
2,2015-03-26,Okay The other thing is Im going to do everyt...,"[Okay, The, other, thing, is, Im, going, to, d...","[[0.09375, -0.07470703, 0.2109375, 0.22167969,...","[2.133606, 1.0718994, 1.0830688, 6.748413, -4....",51,"[0.041835412, 0.021017635, 0.021236643, 0.1323..."
3,2015-03-26,Thanks Hello My name is Jacquie benastante an...,"[Thanks, Hello, My, name, is, Jacquie, benasta...","[[-0.3046875, 0.20703125, -0.00390625, 0.16503...","[5.8538156, 13.34121, 12.888611, 41.340435, -3...",470,"[0.012454927, 0.028385554, 0.027422577, 0.0879..."
4,2015-03-26,Good afternoon Three of us are here today and...,"[Good, afternoon, Three, of, us, are, here, to...","[[-0.10888672, -0.07470703, -0.045410156, -0.0...","[-1.589881, 2.285658, 3.9089699, 5.8369904, -4...",84,"[-0.018927153, 0.027210213, 0.046535354, 0.069..."


In [15]:
type(df['vector_average'][0])

numpy.ndarray

In [20]:
def activistsdigest():
    # accept user input
    text = input("Write a topic of interest here: ")

    # clean and tokenize user input
    tokenized_input = nltk.word_tokenize(text)
    
    # get vectors for input text
    filename = 'GoogleNews-vectors-negative300.bin'
    emb_model = KeyedVectors.load_word2vec_format(filename, binary=True)
    vectorized_input = emb_model[tokenized_input]

    # calculate WMD distance
    input_veclist = []
    for i in df['vector_average']:
        distance = emb_model.wmdistance(i, vectorized_input)
        input_veclist.append(distance)

    # add input_veclist to df
    df['inputvectors'] = input_veclist

    # sort dataframe in ascending order by distance
    df.sort_values(by=['distance'])

    # return url for top 3 rows
    return print(df['text'].head(3))

In [21]:
activistsdigest()

Write a topic of interest here:  bike


TypeError: unhashable type: 'numpy.ndarray'

In [22]:
text = "bike"

# clean and tokenize user input
tokenized_input = nltk.word_tokenize(text)

tokenized_input

['bike']

In [24]:
vectorized_input = emb_model[tokenized_input]
#vectorized_input

In [25]:
# calculate WMD distance
input_veclist = []
for i in df['vector_average']:
    
    cos_sim = cosine_similarity([i], [vectorized_input])
    
    #cos_sim = dot(i, vectorized_input)/(norm(i)*norm(vectorized_input))
    input_veclist.append(cos_sim)

ValueError: Found array with dim 3. check_pairwise_arrays expected <= 2.

## Clean it up and pickle it

In [50]:
# Remove the process columns. Only need vector_average and the identifiers
df_clean = df.drop(['tokenized_text', 'vectors', 'vector_sum', 'vector_length'], axis=1)
#df_clean.head()
df_clean.to_csv("normalized_vectorized_speakers.csv")

In [51]:
df_clean.to_pickle("pickleddf.pkl")

In [52]:
emb_model.to_pickle("pickledmodel.pkl")

AttributeError: 'Word2VecKeyedVectors' object has no attribute 'to_pickle'