In [1]:
# load libraries
import itertools
import logging
import os
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from gensim.models import word2vec
from nltk.corpus import stopwords

In [2]:
# list the raw data files this notebook reads from
%ls -lh data/

total 265M
-rw-rw-r-- 1 sam sam  15M Feb 21 18:47 nba.json
-rw-rw-r-- 1 sam sam 590K Feb 21 20:52 questions-words.txt
-rw-rw-r-- 1 sam sam 100M Feb 21 18:22 stream.json
-rw-rw-r-- 1 sam sam  96M Jun  9  2006 text8
-rw-rw-r-- 1 sam sam  55M Feb 21 18:47 trump.json


### load json data to dataframe

In [3]:
def load_data(file_name, data_path='./data/'):
    """Load the tweet text from a line-delimited JSON file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the JSON-lines file (one JSON object per line).
    data_path : str, optional
        Directory that contains ``file_name``. Defaults to ``'./data/'``.

    Returns
    -------
    pandas.DataFrame
        A frame with a single ``'text'`` column; every other field of the
        JSON records is discarded.

    Raises
    ------
    ValueError
        If the loaded records have no ``'text'`` field.
    """
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Loading: " + file_name + " ...")
    data_df = pd.read_json(os.path.join(data_path, file_name), lines=True)
    if 'text' not in data_df.columns:
        raise ValueError("no 'text' column found in " + file_name)
    # we only take the 'text' column; .copy() returns an independent frame so
    # later in-place edits by the caller never refer back to the full frame
    text_df = data_df[['text']].copy()
    print("Done loading json file to dataframe.")
    return text_df

# read the streamed tweets, then discard every row that contains a missing value
df = load_data('stream.json')
df.dropna(axis='index', how='any', inplace=True)
df.head()

Loading: stream.json ...
Done loading json file to dataframe.


Unnamed: 0,text
0,RT @GerardAraud: No. Not punished. Normal end ...
1,@Nordstrom stock soars after @realDonaldTrump ...
2,RT @RitaCosby: #GoldStar Father &amp; #Democra...
3,"RT @Corporatocrazy: Topic at #DNCForum: ""How t..."
4,RT @Queen_UK: Kim Jong Un on the phone. Very u...


### pre-processing text

In [4]:
# Remove http/https links. `\S+` consumes the whole link; the previous
# character class `[^(\s|\b)]` treated `(`, `|` and `)` as literal characters
# (and `\b` as a backspace), so links containing them were only partly removed.
# The result is assigned back to the column: calling replace(inplace=True) on
# the `df['text']` selection is not guaranteed to write through to `df`.
df['text'] = df['text'].replace(to_replace=r'https?://\S+', value='', regex=True)
# remove every character that is not a letter or whitespace (digits, punctuation, ...)
df['text'] = df['text'].replace(to_replace=r'[^a-zA-Z\s]', value='', regex=True)
# lower-case and tokenize each tweet; one tweet becomes one token list ("sentence")
df['tokenized'] = df['text'].str.lower().apply(nltk.word_tokenize)
# remove stop words, plus two Twitter artefacts: 'rt' (retweet marker) and
# 'amp' (what is left of the HTML entity '&amp;' once punctuation is stripped)
stop_words = stopwords.words('english')
add_stop_words = ['amp', 'rt']
stop_words += add_stop_words
# single %-formatted argument: prints the same text on Python 2 and Python 3
print("sample stopping words:  %s" % (stop_words[:5],))
# a set gives constant-time membership tests while filtering every token
stop_word_set = set(stop_words)
df['tokenized'] = df['tokenized'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)
# NOTE(review): punctuation is stripped before this filter, so contractions
# lose their apostrophe (e.g. "he's" -> "hes") and are not matched by
# apostrophe-bearing entries in the stop list.
df.head()

sample stopping words:  [u'i', u'me', u'my', u'myself', u'we']


Unnamed: 0,text,tokenized
0,RT GerardAraud No Not punished Normal end of m...,"[gerardaraud, punished, normal, end, term]"
1,Nordstrom stock soars after realDonaldTrump ba...,"[nordstrom, stock, soars, realdonaldtrump, bas..."
2,RT RitaCosby GoldStar Father amp DemocraticCon...,"[ritacosby, goldstar, father, democraticconven..."
3,RT Corporatocrazy Topic at DNCForum How to cre...,"[corporatocrazy, topic, dncforum, create, poor..."
4,RT QueenUK Kim Jong Un on the phone Very upset...,"[queenuk, kim, jong, un, phone, upset, hes, lo..."


In [5]:
# each tokenized tweet is one training sentence; collect them in a plain list
sentences = df['tokenized'].tolist()
print(sentences[1])

[u'nordstrom', u'stock', u'soars', u'realdonaldtrump', u'bashed', u'twitter', u'towards', u'five', u'year', u'low', u'trump', u'nordstrom']


In [6]:
# flatten all sentences into one token list and show the 10 most frequent words
merged = [word for sentence in sentences for word in sentence]
count_words = Counter(merged)
count_words.most_common(10)

[(u'trump', 10241),
 (u'nba', 3905),
 (u'maga', 1467),
 (u'trumps', 1157),
 (u'resist', 1097),
 (u'president', 1055),
 (u'superbowl', 942),
 (u'weather', 873),
 (u'obama', 833),
 (u'immigranttrump', 730)]

In [13]:
# how many times does the token 'queen' occur in the corpus?
count_words['queen']

7

### Train word2vec

In [11]:
# Train word2vec on every tokenized tweet in `sentences`.
# size=100 asks for 100-dimensional vectors, min_count=1 keeps even words that
# occur only once, and workers=4 trains with four worker threads.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size` in
# gensim 4) -- confirm against the installed gensim version.
model = word2vec.Word2Vec(sentences, size=100, min_count=1, workers=4)

In [16]:
# Analogy query: the two words whose vectors lie closest to
# vec('man') + vec('father') - vec('woman').
model.most_similar(positive=['man', 'father'], negative=['woman'], topn=2)

[(u'son', 0.8967860341072083), (u'patrioticchoice', 0.8934655785560608)]

In [17]:
# inspect the raw learned embedding vector for a single word
model['king']

array([-0.08358754, -0.04324149,  0.05222063,  0.13837481, -0.03145804,
        0.01891813, -0.07813929, -0.15625842,  0.16886249, -0.01989665,
       -0.09499143,  0.16616251, -0.29708767,  0.02101407,  0.10199653,
        0.52113682, -0.05090722, -0.21264745, -0.21206836, -0.17166524,
       -0.31643674, -0.04593456, -0.07890805, -0.05841589, -0.20822564,
       -0.01848609, -0.06034333, -0.05641041,  0.40205297,  0.02601048,
       -0.21787888, -0.19130263,  0.11345264,  0.06416356, -0.13391981,
       -0.01941937,  0.11347763, -0.02410239,  0.23810287,  0.05785607,
        0.27114791, -0.66712743,  0.00245408, -0.0384017 ,  0.09137112,
       -0.13361648, -0.30836245,  0.01839138,  0.06830075,  0.36589754,
       -0.19957888, -0.07206552,  0.18127915, -0.01365534, -0.04703069,
       -0.21607758,  0.30693969, -0.11687962, -0.21609247,  0.05312306,
       -0.13578808,  0.05873821,  0.22004935, -0.18714067, -0.0171869 ,
       -0.10642038, -0.07699807,  0.26924366, -0.08447391, -0.19

### Evaluate our model (this is very context dependent, and there is no good way to evaluate the result objectively; evaluation depends on your end application)

In [None]:
# Left disabled: uncomment to score the model against data/questions-words.txt.
# model.accuracy('./data/questions-words.txt')