## Import libraries

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk.data
import nltk
import glob
import os
import re
import string
import pandas as pd
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

## Compile all .txt file data into a dataframe

In [2]:
# Cleaned transcripts live in Austin_Transcripts/ and are named "<date>_c.txt".
TRANSCRIPT_SUFFIX = "_c.txt"
cleantxt_list = glob.glob(os.path.join("Austin_Transcripts", "*" + TRANSCRIPT_SUFFIX))

# placeholder; the dictionary is filled once the lists below are complete
text_dict = {}


def transcript_date(path):
    """Return the part of a transcript file name that precedes "_c.txt".

    Only the final path component is inspected, so the result does not depend
    on how many directories precede the file or on the path separator of the
    current platform.
    """
    return os.path.basename(path)[:-len(TRANSCRIPT_SUFFIX)]


# Build two parallel lists: one entry per "_"-delimited segment of each file,
# paired with the date taken from that file's name.
date_list, text_list = list(), list()
for filename in cleantxt_list:
    with open(filename, 'r') as f:
        data = f.read()
    date = transcript_date(filename)
    # presumably each "_"-separated segment is one speaker turn -- TODO confirm
    segments = data.split('_')
    text_list.extend(segments)
    date_list.extend([date] * len(segments))

In [3]:
# One row per text segment; columns are the segment's date and its text.
text_dict = dict(date=date_list, text=text_list)
df = pd.DataFrame(data=text_dict)

In [4]:
# Preview the first five rows of the freshly built frame.
df.head(n=5)

Unnamed: 0,date,text
0,2015-03-26,Mayor pro tem I had a que stion for Mr Mckinn...
1,2015-03-26,There are 20 that are signed up for the next ...
2,2015-03-26,Okay The other thing is Im going to do everyt...
3,2015-03-26,Thanks Hello My name is Jacquie benastante an...
4,2015-03-26,Good afternoon Three of us are here today and...


In [5]:
# Write the frame (including its index) to a CSV file in the working directory.
df.to_csv(path_or_buf="speakers_df.csv")

In [6]:
# Tokenize the text column: each row gets the list of word tokens of its text.
# Series.apply on the single column is used instead of a row-wise
# DataFrame.apply so that a frame with no rows still yields an (empty) column
# and no per-row Series objects have to be built.
df['tokenized_sents'] = df['text'].apply(nltk.word_tokenize)

In [7]:
# Preview the first five rows, now including the tokenized_sents column.
df.head(n=5)

Unnamed: 0,date,text,tokenized_sents
0,2015-03-26,Mayor pro tem I had a que stion for Mr Mckinn...,"[Mayor, pro, tem, I, had, a, que, stion, for, ..."
1,2015-03-26,There are 20 that are signed up for the next ...,"[There, are, 20, that, are, signed, up, for, t..."
2,2015-03-26,Okay The other thing is Im going to do everyt...,"[Okay, The, other, thing, is, Im, going, to, d..."
3,2015-03-26,Thanks Hello My name is Jacquie benastante an...,"[Thanks, Hello, My, name, is, Jacquie, benasta..."
4,2015-03-26,Good afternoon Three of us are here today and...,"[Good, afternoon, Three, of, us, are, here, to..."


In [8]:
# Load word vectors from a file in the binary word2vec format.
filename = 'GoogleNews-vectors-negative300.bin'
emb_model = KeyedVectors.load_word2vec_format(fname=filename, binary=True)

In [10]:
def get_vectors(list, model=None):
    """Return the embedding vectors of the tokens that the model knows.

    Parameters
    ----------
    list : iterable of str
        Tokens to look up. (The name shadows the builtin ``list``; it is kept
        because it is the function's existing parameter name.)
    model : mapping-like, optional
        Any object supporting ``model[token]`` that raises ``KeyError`` for
        unknown tokens. Defaults to the notebook-level ``emb_model``.

    Returns
    -------
    list
        One vector per token found in ``model``, in input order. Tokens that
        are missing from the model are skipped, so the result may be shorter
        than the input.
    """
    if model is None:
        # Fall back to the embedding model loaded earlier in the notebook.
        model = emb_model

    vectors = []
    for word in list:
        try:
            vectors.append(model[word])
        except KeyError:
            # Out-of-vocabulary token: deliberately ignored.
            continue
    return vectors

In [11]:
# Look up embeddings for every row's token list; each cell holds a list of vectors.
token_lists = df["tokenized_sents"]
df['vectors'] = token_lists.map(get_vectors)

In [12]:
# Preview the first five rows, now including the vectors column.
df.head(n=5)

Unnamed: 0,date,text,tokenized_sents,vectors
0,2015-03-26,Mayor pro tem I had a que stion for Mr Mckinn...,"[Mayor, pro, tem, I, had, a, que, stion, for, ...","[[-0.07763672, -0.12792969, 0.103515625, -0.33..."
1,2015-03-26,There are 20 that are signed up for the next ...,"[There, are, 20, that, are, signed, up, for, t...","[[-0.111328125, 0.14355469, 0.18945312, -0.161..."
2,2015-03-26,Okay The other thing is Im going to do everyt...,"[Okay, The, other, thing, is, Im, going, to, d...","[[0.09375, -0.07470703, 0.2109375, 0.22167969,..."
3,2015-03-26,Thanks Hello My name is Jacquie benastante an...,"[Thanks, Hello, My, name, is, Jacquie, benasta...","[[-0.3046875, 0.20703125, -0.00390625, 0.16503..."
4,2015-03-26,Good afternoon Three of us are here today and...,"[Good, afternoon, Three, of, us, are, here, to...","[[-0.10888672, -0.07470703, -0.045410156, -0.0..."


In [16]:
# Display the vectors column on its own.
df.loc[:, "vectors"]

0       [[-0.07763672, -0.12792969, 0.103515625, -0.33...
1       [[-0.111328125, 0.14355469, 0.18945312, -0.161...
2       [[0.09375, -0.07470703, 0.2109375, 0.22167969,...
3       [[-0.3046875, 0.20703125, -0.00390625, 0.16503...
4       [[-0.10888672, -0.07470703, -0.045410156, -0.0...
5       [[-0.22558594, 0.13085938, -0.16308594, 0.1181...
6       [[-0.20605469, -0.12695312, 0.12890625, 0.2539...
7       [[-0.015380859, 0.17773438, -0.12597656, -0.09...
8       [[-0.109375, 0.11230469, 0.19140625, 0.078125,...
9       [[-0.20605469, -0.12695312, 0.12890625, 0.2539...
10      [[-0.20605469, -0.12695312, 0.12890625, 0.2539...
11      [[0.09375, -0.07470703, 0.2109375, 0.22167969,...
12      [[0.07861328, 0.13183594, 0.18945312, 0.314453...
13      [[-0.10888672, -0.07470703, -0.045410156, -0.0...
14      [[-0.1796875, 0.18652344, 0.00089645386, -0.06...
15      [[-0.07763672, -0.12792969, 0.103515625, -0.33...
16      [[-0.10888672, -0.07470703, -0.045410156, -0.0...
17      [[-0.1

In [None]:
# Overwrite the CSV with the frame as it now stands, added columns included.
# NOTE(review): tokenized_sents and vectors hold Python lists, which a CSV can
# only store as text -- confirm downstream readers can parse them back.
df.to_csv(path_or_buf="speakers_df.csv")