In [43]:
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer
import torch
from word_embeddings import words_to_embeddings, load_embeddings


In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## First, we check that the functions in `word_embeddings.py` work properly, using a dummy example:

In [27]:
# Basic test to see whether load_embeddings is working.
# Called with no arguments; judging by the displayed result, this yields a small
# dummy table mapping the letters 'a'-'z' to 4-element arrays
# (presumably the function's default -- confirm in word_embeddings.py).
embeddings = load_embeddings()
embeddings


{'a': array([1., 2., 3., 1.]),
 'b': array([1., 2., 3., 2.]),
 'c': array([1., 2., 3., 3.]),
 'd': array([1., 2., 3., 4.]),
 'e': array([1., 2., 3., 5.]),
 'f': array([1., 2., 3., 6.]),
 'g': array([1., 2., 3., 7.]),
 'h': array([1., 2., 3., 8.]),
 'i': array([1., 2., 3., 9.]),
 'j': array([ 1.,  2.,  3., 10.]),
 'k': array([ 1.,  2.,  3., 11.]),
 'l': array([ 1.,  2.,  3., 12.]),
 'm': array([ 1.,  2.,  3., 13.]),
 'n': array([ 1.,  2.,  3., 14.]),
 'o': array([ 1.,  2.,  3., 15.]),
 'p': array([ 1.,  2.,  3., 16.]),
 'q': array([ 1.,  2.,  3., 17.]),
 'r': array([ 1.,  2.,  3., 18.]),
 's': array([ 1.,  2.,  3., 19.]),
 't': array([ 1.,  2.,  3., 20.]),
 'u': array([ 1.,  2.,  3., 21.]),
 'v': array([ 1.,  2.,  3., 22.]),
 'w': array([ 1.,  2.,  3., 23.]),
 'x': array([ 1.,  2.,  3., 24.]),
 'y': array([ 1.,  2.,  3., 25.]),
 'z': array([ 1.,  2.,  3., 26.])}

In [28]:
# Look up a handful of known words in the embedding table loaded above.
words = ['a', 'c', 'd', 'g', 'i']
word_vectors = words_to_embeddings(words, embeddings)

# Make the sanity check executable instead of relying on reading the output:
# there must be one vector per word, and each must equal that word's table entry.
assert len(word_vectors) == len(words), "expected one vector per input word"
assert all(
    np.array_equal(vec, embeddings[word]) for word, vec in zip(words, word_vectors)
), "each vector should match the embedding table entry for its word"

# Still display the vectors as the cell output.
word_vectors

[array([1., 2., 3., 1.]),
 array([1., 2., 3., 3.]),
 array([1., 2., 3., 4.]),
 array([1., 2., 3., 7.]),
 array([1., 2., 3., 9.])]

Both functions return the expected vectors, so everything appears to be working properly. Next, we load the data.

In [56]:
# Read the posts, discard every row that contains a missing value, and renumber
# the remaining rows from 0. reset_index() is called without drop=True, so the
# previous row labels are kept as an `index` column.
df = pd.read_csv("post.csv").dropna().reset_index()
df

Unnamed: 0.1,index,Unnamed: 0,Title,Text,Upvotes,Downvotes,Subreddit,Source
0,9,9,Dear Professors...,My grandmother passed away one week ago. You k...,828,16,uofm,top
1,14,14,Farewell Ann Arbor,"As I sit here typically this, I have tears com...",732,14,uofm,top
2,17,17,HELLO MY FRIEND,"Stephanie, our favorite buddy from Pierpont Pa...",710,7,uofm,top
3,18,18,I've been teaching for almost a decade. This i...,Edit: Wow. Thank you everyone for all your tho...,703,7,uofm,top
4,20,20,how do i make the war in ukraine about me?,i'm not ukrainian and i don't have any friends...,734,63,uofm,top
...,...,...,...,...,...,...,...,...
1975,2968,2968,EECS 419 Syllabus,I’m thinking about adding EECS 419 to my sche...,3,2,uofm,new
1976,2969,2969,Winter FOMO,Hey Reddit! I am feeling pretty lonely again. ...,18,2,uofm,new
1977,2971,2971,Best spot around campus to smoke and chat on a...,A girl and I are going out for the first time ...,0,0,uofm,new
1978,2974,2974,Does someone have an invite on the Faves app?,I saw an add on Snapchat about the faves app a...,0,0,uofm,new


In [57]:
# Merge each post's title and body into a single `Content` string (separated by
# one space, with missing values treated as empty strings), then discard the two
# source columns.
df = (
    df.assign(Content=df['Title'].fillna('') + ' ' + df['Text'].fillna(''))
      .drop(columns=['Title', 'Text'])
)
df

Unnamed: 0.1,index,Unnamed: 0,Upvotes,Downvotes,Subreddit,Source,Content
0,9,9,828,16,uofm,top,Dear Professors... My grandmother passed away ...
1,14,14,732,14,uofm,top,Farewell Ann Arbor As I sit here typically thi...
2,17,17,710,7,uofm,top,"HELLO MY FRIEND Stephanie, our favorite buddy ..."
3,18,18,703,7,uofm,top,I've been teaching for almost a decade. This i...
4,20,20,734,63,uofm,top,how do i make the war in ukraine about me? i'm...
...,...,...,...,...,...,...,...
1975,2968,2968,3,2,uofm,new,EECS 419 Syllabus I’m thinking about adding E...
1976,2969,2969,18,2,uofm,new,Winter FOMO Hey Reddit! I am feeling pretty lo...
1977,2971,2971,0,0,uofm,new,Best spot around campus to smoke and chat on a...
1978,2974,2974,0,0,uofm,new,Does someone have an invite on the Faves app? ...


In [58]:
# Replace each `Content` string with its list of lowercase tokens.
tt = TweetTokenizer()


def _lowercase_tokens(text):
    """Lowercase `text` and split it into tokens with the shared TweetTokenizer."""
    return tt.tokenize(text.lower())


df['Content'] = df['Content'].map(_lowercase_tokens)
df

Unnamed: 0.1,index,Unnamed: 0,Upvotes,Downvotes,Subreddit,Source,Content
0,9,9,828,16,uofm,top,"[dear, professors, ..., my, grandmother, passe..."
1,14,14,732,14,uofm,top,"[farewell, ann, arbor, as, i, sit, here, typic..."
2,17,17,710,7,uofm,top,"[hello, my, friend, stephanie, ,, our, favorit..."
3,18,18,703,7,uofm,top,"[i've, been, teaching, for, almost, a, decade,..."
4,20,20,734,63,uofm,top,"[how, do, i, make, the, war, in, ukraine, abou..."
...,...,...,...,...,...,...,...
1975,2968,2968,3,2,uofm,new,"[eecs, 419, syllabus, i, ’, m, thinking, about..."
1976,2969,2969,18,2,uofm,new,"[winter, fomo, hey, reddit, !, i, am, feeling,..."
1977,2971,2971,0,0,uofm,new,"[best, spot, around, campus, to, smoke, and, c..."
1978,2974,2974,0,0,uofm,new,"[does, someone, have, an, invite, on, the, fav..."


In [59]:
# Load the embedding table from the GloVe file (presumably the 50-dimensional
# "6B" vectors, judging by the filename -- confirm). This rebinds `embeddings`,
# replacing the dummy table used for the sanity checks earlier in the notebook.
embeddings = load_embeddings("glove.6B.50d.txt")

In [60]:
def _embed_tokens(tokens):
    """Look up the embeddings for one post's token list in the loaded table."""
    return words_to_embeddings(tokens, embeddings)


# Store, for every post, the sequence of word vectors for its tokens.
df['Word_Embedding'] = df['Content'].map(_embed_tokens)
df

Unnamed: 0.1,index,Unnamed: 0,Upvotes,Downvotes,Subreddit,Source,Content,Word_Embedding
0,9,9,828,16,uofm,top,"[dear, professors, ..., my, grandmother, passe...","[[-0.29946, 1.172, 0.3289, -0.74413, 1.0811, -..."
1,14,14,732,14,uofm,top,"[farewell, ann, arbor, as, i, sit, here, typic...","[[0.29064, 1.4114, -1.021, 0.044633, 0.30793, ..."
2,17,17,710,7,uofm,top,"[hello, my, friend, stephanie, ,, our, favorit...","[[-0.38497, 0.80092, 0.064106, -0.28355, -0.02..."
3,18,18,703,7,uofm,top,"[i've, been, teaching, for, almost, a, decade,...","[[0.92884, -0.72457, 0.068095, -0.3816, -0.038..."
4,20,20,734,63,uofm,top,"[how, do, i, make, the, war, in, ukraine, abou...","[[0.68938, -0.10644, 0.17083, -0.37583, 0.7517..."
...,...,...,...,...,...,...,...,...
1975,2968,2968,3,2,uofm,new,"[eecs, 419, syllabus, i, ’, m, thinking, about...","[[-0.99965, -1.1125, 0.1582, -0.32839, -1.0551..."
1976,2969,2969,18,2,uofm,new,"[winter, fomo, hey, reddit, !, i, am, feeling,...","[[-0.28054, 1.2422, -0.88838, -0.096132, -0.94..."
1977,2971,2971,0,0,uofm,new,"[best, spot, around, campus, to, smoke, and, c...","[[-0.91572, 0.60345, -0.31077, 0.28433, 0.5461..."
1978,2974,2974,0,0,uofm,new,"[does, someone, have, an, invite, on, the, fav...","[[0.2293, 0.34231, 0.059817, 0.083003, 0.57685..."
