In [12]:
from __future__ import division, print_function, absolute_import
from past.builtins import basestring

import os
import gzip

import pandas as pd

from twip.constant import DATA_PATH

from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [13]:
import matplotlib
from IPython.display import display, HTML 
%matplotlib inline
np = pd.np
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
pd.set_option('precision', 2)
%precision 4
%pprint

Pretty printing has been turned ON


So the new things are LsiModel and scatmat

In [14]:
from gensim.models import LsiModel
from twip.plot import scatmat

Load cleaned tweet data  
Don't forget to fix up the tokens!  
Can you think of a better way to save a list of lists of strings?
What about the raw, unprocessed unicode tweet text itself?

In [15]:
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    corpus = pd.DataFrame.from_csv(f, encoding='utf8')
corpus['tokens'] = corpus.txt.str.split()
vocab = Dictionary.from_documents(corpus.tokens)
corpus.tokens

87        [python, never, stop, learning, what, you, enj...
88                              [Watching, Boa, vs, Python]
90          [Monty, Python, The, silly, walk, via, YouTube]
                                ...                        
193375    [RT, RealPython, List, of, Python, API, Wrappe...
193376                          [Watching, Boa, vs, Python]
193377              [IT, Digital, Go, Senior, Python, Djan]
Name: tokens, dtype: object

Now load previously compiled vocabulary and TFIDF matrix (transformation)

In [16]:
tfidf = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))
tfidf.num_docs

183070

In [17]:
bows = pd.Series(vocab.doc2bow(toks) for toks in corpus.tokens)
bows

0         [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...
1                        [(8, 1), (9, 1), (10, 1), (11, 1)]
2         [(8, 1), (12, 1), (13, 1), (14, 1), (15, 1), (...
                                ...                        
183067    [(2, 1), (8, 1), (53, 1), (236, 1), (298, 2), ...
183068                   [(8, 1), (9, 1), (10, 1), (11, 1)]
183069    [(8, 1), (25, 1), (652, 1), (1669, 1), (13166,...
dtype: object

This would make a nice, compact sparse matrix representation of our entire corpus...  
Which would mean we could do more in RAM at once.  
Left as an exercise.  (check out `scipy.sparse.coo_matrix`)  

In [18]:
tfidf[bows[0]]

[(0, 0.5290),
 (1, 0.3151),
 (2, 0.0824),
 (3, 0.4033),
 (4, 0.4246),
 (5, 0.3819),
 (6, 0.2910),
 (7, 0.2014)]

In [19]:
dict([(vocab[i], freq) for i, freq in tfidf[bows[0]]])

{u'doing': 0.3819,
 u'enjoy': 0.5290,
 u'learning': 0.2910,
 u'never': 0.4033,
 u'python': 0.0824,
 u'stop': 0.4246,
 u'what': 0.3151,
 u'you': 0.2014}

Notice how "you" didn't get as much weight as "enjoy"  
Let's look at some other tweets  

This is starting to look a lot like a set of vectors that we could use as features  
But wait, if I used the IDs as the vector index (column) numbers, how many features or "columns" would I have?

In [None]:
len(vocab)

100k dimensions isn't a good idea  
Even for a masively parallel deep learning project this would be big  
Like the cat/dog picture classification on 256x256 images  
What about PCA (Principal Component Analysis) like is used on images?  
In NLP PCA is called LSI (Latent Semantic Analysis)  
That sounds cool!  
I want me some latent semantics (hidden meaning)  

In [None]:
lsi = LsiModel(bows, num_topics=100, id2word=vocab, extra_samples=100, power_iters=2)
lsi

## That's Fast!  
What happened to the **GIL**?  
The gilectomy talk isn't until tomorrow!  
Can Python do that?  
With `numpy` and `gensim` it can.  

What's that sound I hear?  
That's the sound mof your fans blowing *hot air* out of those tweets!  
(check out your system monitor or `htop`)  

In [28]:
tweetids = pd.Series(range(6), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
pd.DataFrame([pd.Series([x[1] for x in lsi[bows[i]]], index=topicids,
                        name='tweet') for i in tweetids],
             index=tweetids)


topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
0,0.28,0.65,-0.16,-0.33,0.02,-0.0145,0.48,-0.15,0.24,0.06,-0.12,-0.1,-0.14,0.124,0.19,0.00288,0.07,-0.0261,0.25,0.69,0.421,0.1,0.35,-0.06,0.1,-0.0342,-0.02,0.01,-0.08,-0.127,-0.14,-0.00118,0.09,0.01,0.114,0.12,0.01,0.00388,0.114,-0.0234,0.00872,-0.06,-0.03,0.06,0.06,-0.03,0.04,0.00779,-0.02,-0.06,-0.0921,-0.04,-0.05,-0.00277,-0.05,0.0481,0.1,-0.0677,-0.0599,-0.0441,0.0986,-0.0074,0.0294,0.0666,-0.00587,0.0512,0.00348,0.0441,-0.0336,-0.0298,-0.0479,-0.0454,0.00793,-0.0302,0.0257,-0.0587,0.03,-0.00506,-0.0256,-0.0205,-0.0939,0.0391,0.129,-0.02,-0.0672,-0.13,0.05,-0.09,-0.0417,-0.0532,-0.0456,0.00833,0.0259,0.135,0.152,0.173,-0.0884,0.0654,0.0541,0.0294
1,0.66,-0.58,0.15,0.06,-0.04,0.000183,0.17,-0.13,0.06,-0.02,-0.25,-0.07,-0.17,0.0107,0.06,-0.0809,-0.08,0.0059,-0.09,0.03,0.0728,0.03,0.05,-0.02,-0.08,-0.013,0.03,-0.02,0.04,-0.000506,0.04,0.0539,-0.01,-0.01,-0.0326,0.05,-0.02,-0.0154,0.0084,0.00147,-0.0106,-0.04,0.05,0.08,0.03,-0.02,0.05,-0.0203,-0.03,-0.03,-0.0189,-0.01,-0.05,0.0278,0.02,0.0746,0.0324,-0.03,-0.0086,0.0539,0.00334,-0.0285,-0.0826,0.016,0.0641,-0.054,0.0266,-0.00794,-0.0288,0.000612,-0.0504,0.0497,-0.0644,0.0157,-0.0244,-0.0396,0.1,-0.0395,-0.141,0.0643,0.0419,0.0192,-0.0703,0.02,-0.00387,0.09,-0.02,0.05,0.0181,-0.0228,-0.00153,0.014,0.0116,-0.00898,0.0951,0.00244,0.0553,0.0333,-0.0142,0.0434
2,0.73,-0.59,0.15,0.08,-0.06,-0.0108,0.2,-0.16,0.06,-0.02,-0.34,-0.07,-0.24,0.00588,0.11,-0.0367,-0.09,-0.0682,-0.17,-0.05,0.0664,0.05,0.04,-0.24,-0.09,-0.0262,-0.03,-0.01,-0.18,-0.384,-0.02,-0.227,0.1,0.35,-0.186,-0.45,0.05,-0.0109,0.148,0.0317,-0.0925,-0.02,0.03,0.38,-0.14,0.13,0.3,-0.444,0.62,0.42,-0.0857,0.09,-0.12,-0.0498,0.1,-0.0821,0.0242,-0.102,-0.196,-0.0229,-0.188,-0.111,0.243,0.00744,-0.038,-0.181,-0.0232,0.0296,0.155,-0.0853,-0.0448,0.0222,-0.166,0.00972,-0.0106,0.0115,0.13,0.0263,0.0379,0.0246,-0.00693,-0.00594,-0.0548,0.05,0.0627,0.02,-0.04,0.03,0.0561,0.0142,-0.0029,0.0273,-0.0446,-0.0345,0.0319,-0.0564,0.0414,0.0766,0.0104,0.0429
3,0.88,-1.04,0.49,-1.03,-0.03,-0.0889,0.36,-0.51,0.81,-0.04,1.07,0.71,0.22,-0.101,-0.11,0.184,0.09,-0.0275,-0.28,0.05,0.00324,-0.09,0.12,0.19,0.13,0.0521,0.2,-0.56,0.04,-0.132,0.03,-0.092,0.08,-0.17,-0.00926,-0.08,-0.02,-0.00499,0.0707,-0.106,0.0311,0.1,0.21,0.06,0.25,-0.17,0.01,0.0335,0.2,0.12,0.274,-0.35,0.06,-0.169,-0.05,-0.0805,-0.204,-0.0139,0.0075,-0.274,-0.188,-0.0869,-0.0461,-0.251,0.613,-0.0288,0.439,0.434,-0.463,-0.0364,0.416,0.00883,-0.0895,-0.17,0.00337,0.222,-0.26,-0.0393,-0.202,0.0466,-0.0728,0.00696,0.0146,0.14,-0.174,0.02,-0.17,0.04,0.0477,0.00811,0.0291,-0.0806,0.0502,-0.096,-0.162,-0.109,-0.0973,-0.00502,0.000263,-0.0374
4,1.09,-0.96,0.55,-1.65,0.09,-0.0928,-0.03,-0.08,0.32,-0.02,0.78,0.53,0.24,-0.181,-0.11,0.26,0.18,-0.165,-0.3,0.07,0.0387,-0.03,-0.01,0.06,-0.07,0.0282,0.02,0.18,-0.02,0.0475,-0.14,0.136,0.7,-0.59,-0.15,-0.46,-0.13,0.114,-0.0424,-0.0416,0.0393,-0.05,-0.27,0.31,-0.14,0.03,0.06,0.0145,-0.11,-0.02,-0.035,-0.41,-0.08,-0.0646,-0.1,0.00797,-0.00598,-0.00708,0.0821,0.0232,-0.231,-0.14,0.0116,-0.142,0.174,0.0391,0.254,0.25,-0.402,0.0841,0.473,0.172,-0.284,-0.359,-0.0501,0.0488,0.01,0.204,0.298,-0.18,-0.0301,0.0462,0.368,-0.45,0.104,-0.02,0.26,0.12,0.137,0.204,0.05,0.184,-0.0538,-0.214,0.225,-0.0292,0.0384,0.0468,-0.0103,0.166
5,0.66,-0.57,0.15,0.05,-0.03,-0.000856,0.17,-0.12,0.06,-0.02,-0.24,-0.07,-0.16,0.01,0.06,-0.0742,-0.08,0.00928,-0.08,0.03,0.0694,0.03,0.04,-0.01,-0.08,-0.00925,0.03,-0.02,0.03,-0.0115,0.04,0.0527,-0.01,-0.02,-0.0373,0.04,-0.03,-0.00158,-0.00225,-0.00304,-0.00194,-0.02,0.03,0.05,0.03,-0.04,0.02,-0.00509,-0.07,-0.03,-0.00971,-0.01,-0.03,0.0268,-0.02,0.00711,0.0148,-0.0065,0.00757,-0.00594,0.017,0.00276,0.00583,-0.00527,-0.000832,0.00853,0.0133,-0.00866,-0.00248,-0.0162,-0.0074,-0.00236,0.0233,0.000716,0.00738,-0.00723,0.01,-0.000854,-0.0077,0.00383,-0.000255,0.019,0.00354,-0.02,0.000903,-0.01,0.01,-0.02,-0.00224,0.00541,0.00212,0.00269,-0.00247,0.00107,0.00521,0.0256,-0.00178,-0.00106,-0.00195,0.00443


In [29]:
lsi2 = LsiModel(bows, num_topics=2, id2word=vocab, extra_samples=100, power_iters=2)
lsi2

<gensim.models.lsimodel.LsiModel at 0x7fcedabae6d0>

In [30]:
lsi.save(os.path.join(DATA_PATH, 'lsi100'))
lsi2.save(os.path.join(DATA_PATH, 'lsi2'))

## Hold onto your hat
This will take a lot of RAM!  
(and CPU)  

In [31]:
tweetids = pd.Series(range(len(bows)), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
# `dict()` keeps track of the columns for each topic, in case the lsi model shuffles or skips topics for odd tweets
df = pd.DataFrame([pd.Series(dict(lsi[bows[i]]), name='tweet') for i in tweetids],
                  columns=topicids,
                  index=tweetids)

In [32]:
df

topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
0,0.28,0.65,-0.16,-0.33,0.02,-1.45e-02,0.48,-0.15,0.24,0.06,-0.12,-0.10,-0.14,1.24e-01,0.19,2.88e-03,0.07,-2.61e-02,0.25,0.69,0.42,0.10,0.35,-5.55e-02,0.10,-0.03,-0.02,0.01,-0.08,-1.27e-01,-0.14,-1.18e-03,0.09,0.01,0.11,0.12,0.01,3.88e-03,1.14e-01,-2.34e-02,8.72e-03,-0.06,-3.45e-02,0.06,0.06,-0.03,0.04,7.79e-03,-0.02,-0.06,-9.21e-02,-0.04,-0.05,-2.77e-03,-4.78e-02,0.05,0.10,-0.07,-5.99e-02,-4.41e-02,9.86e-02,-7.40e-03,0.03,6.66e-02,-5.87e-03,5.12e-02,3.48e-03,4.41e-02,-0.03,-2.98e-02,-0.05,-0.05,7.93e-03,-3.02e-02,2.57e-02,-0.06,0.03,-5.06e-03,-2.56e-02,-0.02,-9.39e-02,3.91e-02,0.13,-0.02,-6.72e-02,-0.13,0.05,-0.09,-0.04,-5.32e-02,-4.56e-02,8.33e-03,2.59e-02,1.35e-01,0.15,1.73e-01,-0.09,0.07,0.05,0.03
1,0.66,-0.58,0.15,0.06,-0.04,1.83e-04,0.17,-0.13,0.06,-0.02,-0.25,-0.07,-0.17,1.07e-02,0.06,-8.09e-02,-0.08,5.90e-03,-0.09,0.03,0.07,0.03,0.05,-2.42e-02,-0.08,-0.01,0.03,-0.02,0.04,-5.06e-04,0.04,5.39e-02,-0.01,-0.01,-0.03,0.05,-0.02,-1.54e-02,8.40e-03,1.47e-03,-1.06e-02,-0.04,5.33e-02,0.08,0.03,-0.02,0.05,-2.03e-02,-0.03,-0.03,-1.89e-02,-0.01,-0.05,2.78e-02,2.32e-02,0.07,0.03,-0.03,-8.60e-03,5.39e-02,3.34e-03,-2.85e-02,-0.08,1.60e-02,6.41e-02,-5.40e-02,2.66e-02,-7.94e-03,-0.03,6.12e-04,-0.05,0.05,-6.44e-02,1.57e-02,-2.44e-02,-0.04,0.10,-3.95e-02,-1.41e-01,0.06,4.19e-02,1.92e-02,-0.07,0.02,-3.87e-03,0.09,-0.02,0.05,0.02,-2.28e-02,-1.53e-03,1.40e-02,1.16e-02,-8.98e-03,0.10,2.44e-03,0.06,0.03,-0.01,0.04
2,0.73,-0.59,0.15,0.08,-0.06,-1.08e-02,0.20,-0.16,0.06,-0.02,-0.34,-0.07,-0.24,5.88e-03,0.11,-3.67e-02,-0.09,-6.82e-02,-0.17,-0.05,0.07,0.05,0.04,-2.43e-01,-0.09,-0.03,-0.03,-0.01,-0.18,-3.84e-01,-0.02,-2.27e-01,0.10,0.35,-0.19,-0.45,0.05,-1.09e-02,1.48e-01,3.17e-02,-9.25e-02,-0.02,3.06e-02,0.38,-0.14,0.13,0.30,-4.44e-01,0.62,0.42,-8.57e-02,0.09,-0.12,-4.98e-02,1.01e-01,-0.08,0.02,-0.10,-1.96e-01,-2.29e-02,-1.88e-01,-1.11e-01,0.24,7.44e-03,-3.80e-02,-1.81e-01,-2.32e-02,2.96e-02,0.15,-8.53e-02,-0.04,0.02,-1.66e-01,9.72e-03,-1.06e-02,0.01,0.13,2.63e-02,3.79e-02,0.02,-6.93e-03,-5.94e-03,-0.05,0.05,6.27e-02,0.02,-0.04,0.03,0.06,1.42e-02,-2.90e-03,2.73e-02,-4.46e-02,-3.45e-02,0.03,-5.64e-02,0.04,0.08,0.01,0.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183067,1.43,0.12,-0.84,-0.15,-0.25,-1.56e-01,0.27,-0.58,0.48,-0.18,-1.24,0.63,1.15,4.92e-01,-0.63,1.37e-01,0.13,1.92e-01,-0.05,0.05,-0.02,0.22,0.21,-3.17e-02,-0.08,-0.18,0.09,0.04,0.06,-6.53e-02,0.08,1.47e-01,0.01,0.06,-0.04,0.11,-0.04,-2.39e-02,6.61e-02,-9.21e-02,-9.32e-02,0.06,-3.46e-03,0.16,0.05,0.02,0.08,2.65e-02,-0.09,-0.07,-2.46e-03,0.02,0.04,1.63e-02,-7.69e-02,0.04,0.04,-0.06,1.95e-02,-6.10e-02,-2.33e-02,-1.75e-02,0.01,-1.38e-02,5.04e-02,-3.35e-02,4.59e-02,-4.45e-02,0.06,8.02e-02,-0.04,0.04,4.99e-02,-3.60e-03,4.03e-02,0.03,0.04,2.95e-02,-6.37e-02,-0.07,-6.74e-02,-3.98e-02,0.05,-0.06,-5.73e-02,0.03,0.05,0.02,0.02,9.91e-03,1.63e-02,7.45e-02,2.04e-03,6.69e-02,0.03,3.06e-02,-0.05,-0.03,0.10,-0.08
183068,0.66,-0.58,0.15,0.06,-0.04,1.83e-04,0.17,-0.13,0.06,-0.02,-0.25,-0.07,-0.17,1.07e-02,0.06,-8.09e-02,-0.08,5.90e-03,-0.09,0.03,0.07,0.03,0.05,-2.42e-02,-0.08,-0.01,0.03,-0.02,0.04,-5.06e-04,0.04,5.39e-02,-0.01,-0.01,-0.03,0.05,-0.02,-1.54e-02,8.40e-03,1.47e-03,-1.06e-02,-0.04,5.33e-02,0.08,0.03,-0.02,0.05,-2.03e-02,-0.03,-0.03,-1.89e-02,-0.01,-0.05,2.78e-02,2.32e-02,0.07,0.03,-0.03,-8.60e-03,5.39e-02,3.34e-03,-2.85e-02,-0.08,1.60e-02,6.41e-02,-5.40e-02,2.66e-02,-7.94e-03,-0.03,6.12e-04,-0.05,0.05,-6.44e-02,1.57e-02,-2.44e-02,-0.04,0.10,-3.95e-02,-1.41e-01,0.06,4.19e-02,1.92e-02,-0.07,0.02,-3.87e-03,0.09,-0.02,0.05,0.02,-2.28e-02,-1.53e-03,1.40e-02,1.16e-02,-8.98e-03,0.10,2.44e-03,0.06,0.03,-0.01,0.04
183069,0.68,-0.61,0.17,-0.02,-0.03,-4.67e-03,0.19,-0.15,0.10,-0.02,-0.16,-0.02,-0.13,5.10e-03,0.06,-6.46e-02,-0.09,1.04e-02,-0.03,0.02,0.06,0.03,0.05,5.36e-03,-0.07,-0.01,0.04,-0.07,0.04,-3.81e-02,0.06,5.59e-03,-0.02,-0.04,-0.02,0.05,0.02,-2.08e-02,5.03e-02,-2.35e-02,6.21e-03,0.05,2.02e-01,-0.02,0.17,-0.12,-0.01,-2.01e-03,0.04,-0.02,1.11e-01,0.06,0.04,-4.63e-02,8.09e-03,-0.07,-0.09,0.04,1.58e-02,-6.30e-03,8.22e-02,-1.50e-03,0.11,-4.07e-02,-9.42e-02,-6.63e-03,-2.56e-02,-7.05e-02,0.05,1.81e-02,-0.02,0.03,-3.41e-02,7.00e-02,2.24e-03,-0.03,0.10,1.03e-01,4.00e-03,0.03,-1.28e-02,2.59e-02,0.06,-0.04,3.56e-02,0.03,0.02,0.03,0.05,3.59e-02,1.32e-02,2.59e-02,3.40e-02,-2.45e-02,0.05,2.24e-02,0.02,-0.04,0.03,0.06


What's with the 1.43?  
Aren't they normalize?  
... Nope  

In [33]:
scatmat(df[df.columns[:5]][::100])

TypeError: range() integer end argument expected, got float.

In [None]:
num

In [None]:
with gzip.open(os.path.join(DATA_PATH, 'tweet_topic_vectors.csv.gz'), 'wb') as f:
    df.to_csv(f, encoding='utf8', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)

We built LSI topic vectors for 200k tweets in a few minutes!  
Lets look at the TFIDF vectors for the top 6 tweets

In [None]:
tfidf6 = pd.DataFrame((dict([(vocab[i], freq) for i, freq in tfidf[bows[j]]]) for j in range(6)))
tfidf6 = tfidf6.fillna('')
tfidf6

Notice the small weights on the word "Python"?
Why do you think that is?
(Think back to the definition of TF and DF and TFIDF

Now lets see how far apart they are based only on word frequency (TFIDF)
We'll *"project"* the first tweet onto the second with a dot product  
to see how much of a "shadow" they make on each other  

In [None]:
tfidf6 = pd.DataFrame((dict([(vocab[i], freq) for i, freq in tfidf[bows[j]]]) for j in range(6))).fillna(0).T

In [None]:
np.dot(tfidf6[0], tfidf6[1])

In [None]:
np.dot(tfidf6[1], tfidf6[2])

That looks about right.  
The first 2 share no words.  
The second 2 share only "Python".  
But lets do the cosine similarity correctly by normalizing for length.  

In [None]:
np.dot(tfidf6[1], tfidf6[2]) / np.linalg.norm(tfidf6[1]) / np.linalg.norm(tfidf6[2])

Hmmm, nothing changed  
Can you guess why?  

In [None]:
[round(np.dot(tfidf6[i], tfidf6[i+1]), 4) for i in range(5)]

In [None]:
Now lets look at the topic vectors.  


In [125]:
df.iloc[:6]

topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
tweet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
0,0.28,0.65,-0.16,-0.33,0.02,-0.0145,0.48,-0.15,0.24,0.06,-0.12,-0.09,-0.14,0.122,0.19,0.00311,0.07,-0.0262,0.25,0.69,0.422,0.1,0.35,-0.06,0.1,-0.0333,-0.02,0.01,-0.08,-0.129,-0.14,0.00017,0.09,0.02,-0.109,0.13,0.0098,0.00433,0.114,0.024,0.00873,-0.06,-0.04,0.06,0.06,-0.03,0.0372,0.000747,-0.02,-0.06,-0.1,-0.0184,-0.06,-0.00953,-0.05,0.0652,0.0912,-0.0646,-0.065,-0.0185,0.0949,-0.0086,0.0195,0.0624,0.0134,0.041,-0.00874,0.0443,-0.0296,-0.0497,-0.0303,-0.0685,-0.03,-0.00653,0.0197,-0.065,0.03,0.00762,-0.08,-0.00206,-0.0323,-0.05,0.0741,0.0668,-0.0145,-0.13,0.0338,-0.14,-0.052,0.0653,0.0435,0.106,0.0476,-0.162,0.2,0.05,-0.0402,-0.0839,0.146,0.0102
1,0.66,-0.58,0.15,0.06,-0.04,0.000174,0.17,-0.13,0.06,-0.02,-0.25,-0.07,-0.17,0.0106,0.06,-0.0807,-0.08,0.0067,-0.09,0.03,0.0724,0.03,0.05,-0.02,-0.08,-0.0131,0.03,-0.02,0.04,2.87e-05,0.04,0.0537,-0.01,-0.01,0.034,0.05,-0.017,-0.0152,0.00876,-0.00144,-0.0154,-0.04,0.05,0.08,0.03,-0.02,0.0434,-0.0176,-0.03,-0.02,-0.02,-0.00898,-0.05,0.0213,0.02,0.0826,0.0155,-0.0262,-0.00143,0.0579,0.00135,-0.0504,-0.0788,0.00239,0.0505,-0.0495,0.0367,-0.00773,-0.0367,0.00043,-0.0608,0.0292,0.07,0.0134,-0.021,-0.0342,0.0852,-0.0248,-0.08,-0.00841,0.0292,0.02,-0.112,-0.0191,-0.0197,0.07,0.000565,0.06,0.00959,-0.00716,0.0149,-0.00636,0.00717,-0.0338,0.1,-0.04,0.0281,0.0448,0.0388,0.00053
2,0.73,-0.59,0.15,0.08,-0.06,-0.0107,0.2,-0.16,0.06,-0.02,-0.34,-0.07,-0.24,0.00564,0.11,-0.0367,-0.09,-0.0677,-0.17,-0.05,0.0662,0.05,0.04,-0.24,-0.09,-0.0267,-0.03,-0.01,-0.18,-0.383,-0.01,-0.229,0.1,0.34,0.171,-0.46,0.0517,-0.00479,0.152,-0.0218,-0.11,-0.03,0.02,0.37,-0.14,0.19,0.205,-0.582,0.58,0.34,-0.03,0.108,-0.12,-0.0586,0.12,-0.0797,0.0355,-0.0848,-0.2,0.0053,-0.197,-0.0481,0.26,0.043,0.00547,-0.17,-0.0566,-0.00238,0.113,-0.12,-0.0716,0.0238,0.16,-0.0281,-0.0151,-0.0106,0.13,-0.00274,0.06,0.02,-0.0169,0.01,-0.0821,0.00491,0.0198,0.03,-0.0438,0.06,0.0537,-0.0516,-0.0217,-0.0444,-0.00687,0.0223,-0.01,-0.05,0.0984,0.0553,0.00236,-0.0311
3,0.88,-1.04,0.49,-1.03,-0.03,-0.0888,0.36,-0.51,0.81,-0.04,1.07,0.71,0.22,-0.101,-0.11,0.182,0.09,-0.028,-0.28,0.05,0.00365,-0.09,0.12,0.19,0.13,0.0554,0.21,-0.56,0.03,-0.131,0.03,-0.0944,0.09,-0.17,0.00643,-0.09,-0.0147,-0.00197,0.0697,0.0963,0.0331,0.1,0.21,0.06,0.24,-0.18,0.00301,-0.0187,0.21,0.1,0.19,-0.398,0.08,-0.161,-0.03,-0.141,-0.185,-0.0522,-0.00744,-0.239,-0.192,-0.0978,-0.0408,-0.338,0.642,0.101,0.232,0.654,-0.312,0.0234,0.416,0.0278,0.1,-0.134,-0.00544,0.238,-0.19,0.0362,-0.23,-0.0757,-0.115,0.02,-0.0838,0.126,-0.172,0.12,-0.153,0.06,0.0482,-0.0188,0.0483,0.0319,-0.0305,0.115,-0.16,0.02,0.0343,-0.0469,-0.0353,-0.00431
4,1.09,-0.96,0.55,-1.65,0.09,-0.0928,-0.03,-0.08,0.32,-0.02,0.78,0.53,0.24,-0.181,-0.11,0.258,0.18,-0.165,-0.3,0.07,0.039,-0.03,-0.01,0.06,-0.07,0.0289,0.02,0.18,-0.02,0.0417,-0.14,0.141,0.7,-0.59,0.129,-0.47,-0.133,0.11,-0.0421,0.0507,0.0391,-0.05,-0.26,0.32,-0.14,0.01,0.0716,0.0508,-0.11,0.02,-0.13,-0.392,-0.09,-0.0611,-0.08,-0.00397,-0.00701,-0.0164,0.0866,0.0142,-0.228,-0.117,0.0589,-0.194,0.192,0.106,0.189,0.436,-0.277,0.168,0.437,0.152,0.26,-0.413,-0.0136,-0.0299,0.00508,0.112,0.2,0.139,0.152,-0.13,0.5,-0.201,0.321,-0.02,0.314,0.07,0.159,0.131,-0.098,-0.218,-0.0798,0.0831,0.24,-0.1,0.0434,0.0395,0.193,-0.0391
5,0.66,-0.57,0.15,0.05,-0.03,-0.000851,0.17,-0.12,0.06,-0.02,-0.24,-0.07,-0.16,0.00986,0.06,-0.074,-0.08,0.00962,-0.08,0.03,0.0694,0.03,0.04,-0.01,-0.08,-0.00879,0.03,-0.02,0.03,-0.0113,0.04,0.0525,-0.01,-0.02,0.0391,0.04,-0.0284,-0.00263,-0.00317,0.00173,-0.00328,-0.02,0.03,0.05,0.03,-0.04,0.023,0.00261,-0.07,-0.02,-0.01,-0.0105,-0.03,0.0239,-0.02,0.00869,0.0135,-0.00708,0.00732,-0.00521,0.0181,0.00469,0.00564,-0.00516,-0.000606,0.00493,0.0167,-0.00316,-0.00326,-0.017,-0.00187,0.000605,-0.03,0.00569,0.00501,-0.0068,0.0111,0.00574,-0.01,0.0145,0.00109,-0.01,-0.00212,-0.0136,0.00886,-0.01,0.00595,-0.02,0.00265,0.00724,-0.00299,-0.00286,0.00329,-0.00338,0.02,0.01,-0.00195,-0.00926,0.00224,-0.00817


In [122]:
print([round(np.dot(df.T[i], df.T[i+1]), 4) for i in range(5)])

[0.0105, 1.1037, 0.9981, 6.452, 1.0153]


Better normalize these...

In [123]:
print([round(np.dot(df.T[i], df.T[i+1]) / np.linalg.norm(df.T[i]) / np.linalg.norm(df.T[i+1]), 4) for i in range(5)])
# for comparison the TFIDF scores right below
print([round(np.dot(tfidf6[i], tfidf6[i+1]), 4) for i in range(5)])

[0.0066, 0.5742, 0.1849, 0.6925, 0.325]
[0.0, 0.001, 0.0009, 0.1673, 0.0005]


So the really chummy neighbors are 1 & 2 and 3 & 4  
Surprisingly 2 & 3 didn't hit it off, and no pairing got a zero!   
And the last 2 seem to share a "latent" similarity that TFIDF missed entirely!!!
And LSI picked up on the python<->Python similarity (tweets 0 and 1)

In [133]:
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    text = pd.DataFrame.from_csv(f, encoding='utf8')

In [188]:
for toks, twt in zip(text.txt.iloc[:6], text.text.iloc[:6]):
    print(toks)
    print(twt)
    print('-' * 10)
    

 python never stop learning what you enjoy doing 
#python never stop learning what you enjoy doing. https://t.co/IH5ZSKnU8K
----------
Watching Boa vs Python 
Watching Boa vs. Python — https://t.co/Pivpk02s2A
----------
Monty Python The silly walk via YouTube
Monty Python - The silly walk https://t.co/C0Ja8UHL4t via @YouTube
----------
Senior Software Engineer Full Stack Python Django And Php Jobs jobs jobsearch 
Senior Software Engineer Full Stack Python Django And Php Jobs #jobs #jobsearch https://t.co/EuO3Et4JIT
----------
Architect Django Solr Platform Engineer With Python k Jobs in Manhattan NY Manhattan NY jobs jobsearch 
Architect Django Solr Platform Engineer With Python 230k Jobs in Manhattan, NY #Manhattan #NY #jobs #jobsearch https://t.co/ge0RzBDoSP
----------
peaceful rain Python inevitability
peaceful rain? Python - inevitability
----------


What about a new tweet you are considering?  
Notice how I changed the token spelling (BOW),  
but not the *"semantics"* of the tweet.  

In [169]:
tweet = 'I want to help build django with a job in Chicago'
tweet_bow = vocab.doc2bow(tweet.split())
tweet_tfidf = tfidf[tweet_bow]
tweet_topics = pd.Series(dict(lsi[tweet_tfidf]))
# Now that the math is done let's convert to a friendlier format with words as the keys/index
tweet_tfidf = pd.Series(dict([(vocab[i], x) for (i, x) in tweet_tfidf])) 
print('\nLSI Topic Vector')
tweet_topics


LSI Topic Vector


0     1.41e-01
1     1.18e-01
2     1.45e-01
        ...   
97   -8.82e-03
98    2.28e-02
99   -5.27e-03
dtype: float64

Compare the topic vector above to the TFIDF vector below.  
What's better about TFIDF compared to topic vectors?  
What can we do about it?  

In [170]:
print('TFIDF Frequency Vector')
print(tweet_tfidf)

TFIDF Frequency Vector
Chicago    0.45
I          0.18
a          0.13
           ... 
to         0.12
want       0.37
with       0.16
dtype: float64


Which one is it closest too?  
Can you guess?  
Does LSI understand the words as well as you do?  

In [167]:
print('LSI Topic Similarity')
print([round(np.dot(df.T[i], tweet_topics) / np.linalg.norm(df.T[i]) / np.linalg.norm(tweet_topics), 4) for i in range(6)])

LSI Topic Similarity
[0.0716, -0.014, 0.0025, 0.0716, 0.1484, -0.003]


In [184]:
tfidf7 = tfidf6.copy()
tfidf7[6] = tweet_tfidf
tfidf7 = tfidf7.fillna(0)
tfidf7

Unnamed: 0,0,1,2,3,4,5,6
And,0.00,0.00,0.00,0.35,0.00,0.0,0.0
Architect,0.00,0.00,0.00,0.00,0.25,0.0,0.0
Boa,0.00,0.62,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...
walk,0.00,0.00,0.46,0.00,0.00,0.0,0.0
what,0.32,0.00,0.00,0.00,0.00,0.0,0.0
you,0.20,0.00,0.00,0.00,0.00,0.0,0.0


In [186]:
print([round(np.dot(tfidf7[i], tfidf7[6]), 4) for i in range(6)])

[0.0, 0.0, 0.0, 0.0, 0.0076, 0.0]


In [187]:
tweet

'I want to help build django with a job in Chicago'

Can you find the one word I accidentally share with the other tweets?  
*Hint: use the TFIDF matrix (Dataframe)*  
Play around with the tweet text to make its topic vector more *"orthogonal"*  
Or make it closer in cosine distance.  