In [3]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd
from math import log

import psycopg2
import spacy

# from here -- https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import string
import re


In [18]:
from sklearn.metrics.pairwise import linear_kernel

In [51]:
from timeit import timeit

tasks:
1. compare sklearn cosine_sim to homebaked function
2. iterate homebaked function on a matrix to produce output

#### LEARNED: We might be able to speed things up by pre-norming the tfidf matrix and then using the linear kernel to calculate the cosign sims.

But why is my home-baked function not working?

## DUH!
#### I needed to enclose the denominator in parentheses.

In [33]:
post_vec = np.array([0,.76908,0,0,0,0.1235, 0,0,.087659, .265789]).reshape(1, -1)
j_vec = np.array([0,0.0768,0,0,.789,0, 0,0,.087659, .265789]).reshape(1, -1)
pvn = norm(post_vec)
jn = norm(j_vec)
p = post_vec / pvn
j = j_vec / jn

((1, 10), (1, 10))

In [41]:
def cos_sim(post_v, j_v):
    """calculates cosign similarity between vectorized post and journal entry"""
    return post_v.dot(j_v) / (norm(post_v) * norm(j_v))

In [53]:
cos_sim(post_vec, j_vec.T)

array([[ 0.1974548]])

In [23]:
cosine_similarity(post_vec, j_vec)

array([[ 0.1974548]])

In [34]:
linear_kernel(p, j)

array([[ 0.1974548]])

##### So we can speed things up by:
* Pre-norming the tfidf vector and pickling that
* Norming the journal vector
* Using sklearn.linear_kernel instead of sklearn.cosine_similarity
* Also need to place timers

#### We will know we are on track when the same journal entry returns identical results with both methods.

Uh oh! Couldn't do it! I didn't have enough horsepower to pre-norm the vector! Try on EC2?

In [43]:
# How to norm the rows of a matrix
testM = np.array([0,0,0,.5,.2,.6,0,0,0]).reshape(3,3)

In [55]:
q = testM / norm(testM, axis=0)

In [56]:
q

array([[ 0.,  0.,  0.],
       [ 1.,  1.,  1.],
       [ 0.,  0.,  0.]])

In [2]:
import loseit_nlp as LNLP

In [19]:
# Snag a sample from the database
conn = psycopg2.connect(dbname='loseit', host='localhost')
cur = conn.cursor()

query = """
SELECT post_id
  , topic_id
  , topic
  , name
  , author_id
  , date_posted
  , post_body
  , library
  , page_url
FROM posts
ORDER BY post_idx
LIMIT 10000;
"""

cur.execute(query)
samples = cur.fetchall()

In [20]:
len(samples)

10000

In [48]:
# 2.5 minutes for 10,000 samples
df, arr, model = LNLP.corpus_df(LNLP.TFIDFER, samples)

In [54]:
arr.shape, type(arr)

((10000, 3537), scipy.sparse.csr.csr_matrix)

In [50]:
import cPickle as pickle

In [101]:
with open ('corpus_10K.p', 'wb') as f:
    pickle.dump((df, arr), f) # this makes file with 2.2GB holy shit

In [102]:
with open ('corpus_10K.p', 'wb') as f:
    pickle.dump(arr, f) # dense array only, half the size. Nice!

In [9]:
with open ('corpus_10K.p', 'wb') as f:
    pickle.dump(arr, f) # sparse array only; holy crap. Only 12 MB. That seems like the way to do it.

In [52]:
with open('model.pickle', 'wb') as f:
    pickle.dump(model, f) # trying again to pickle model
    # OMG, it worked!!!
    # NOW we are in BUSINESS!!

In [18]:
type(model)

sklearn.feature_extraction.text.TfidfVectorizer

In [15]:
df.iloc[:3, :5]

Unnamed: 0,limited,valve,forget,pardon,represent
43548-1,0.0,0.0,0.0,0.0,0.0
43198-1,1.764973,0.0,0.0,0.0,0.0
43743-1,0.0,0.0,0.0,0.0,0.0


In [77]:
type(model)

sklearn.feature_extraction.text.TfidfVectorizer

In [22]:
j_entry = """
I think I'm falling off the wagon because I am not getting much exercise. I'm under a lot of stress at work, and I keep binge eating when I get home at night. I'm tired and not getting as much sleep as I would like. And sometimes, logging foods just feels tedious.
"""

In [23]:
test_run = LNLP.MatchPosts(df, model, j_entry, N=2)

In [24]:
X = test_run.match_posts()

In [25]:
X

Unnamed: 0,cos_sim
27610-1,0.366836
35928-1,0.348755


In [26]:
from post_retriever import OutputResults

In [44]:
test_print = OutputResults(X)

In [46]:
test_print.print_results()


Post_ID: 27610-1, Topic: Exersice or Sleep: Which is more important??

Posted by: Paige on 12/13/2012

Over the past few months, I have been insanely busy. My average night's sleep in 4 hours (sometimes just 1 hour), and I will try to get a half hour nap in when I can.

 I started working in NYC from NJ back in September, so I am out of the house for work from 7am-7pm. I also started grad school in September. I am taking double credits so that I can achieve my MBA degree within a year. I feel like I never sleep. I always have work that needs to get done, and I am always ALWAYS stressed out. Luckily this semester is almost over. My finals are next week!

 With this schedule, I try to work out at least twice a week, and some weeks I've been able to get in 5 days at the gym, but I just can't seem to lose any weight. I convince myself that I can just go to the gym for an hour and then my reward is going to bed. Of course after the working out, I can't sleep! Waking up is the worst right n

In [97]:
list(X.index)

['27610-1', '35928-1', '26144-1', '26917-1', '8498-1']

In [82]:
test_run.j_M

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [83]:
test_run.archive_df[:3]

Unnamed: 0,?,!,calorie,weight,lose
3203-1,0.0,1.446856,0.0,0.0,0.0
3105-1,0.0,1.446856,0.0,0.0,0.0
5408-1,0.0,0.0,0.0,2.098812,0.0


In [40]:
test_vec = test_run.journal_tfidf()
test_vec

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  2.09861229,
         2.09861229,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.84729786,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ]])

In [43]:
test_tile = test_run.tile_j_M()
test_tile.shape

(20, 39)

In [45]:
sims = test_run.cos_sim_vector()

In [55]:
test_res = test_run.results()
test_res

Unnamed: 0,cos_sim
36492-7,0.341849
36492-5,0.326868
36492-10,0.294591


In [54]:
norms = norm_array(samples)

NameError: name 'norm_array' is not defined

In [20]:
post_ids = [tup[0] for tup in norms]
norm_figs = [tup[1] for tup in norms]
cols = ['post_id', 'vec_norm']

df = pd.DataFrame(data=norms, columns=cols)

In [66]:
# when you dot multiply two vectors, does the order matter? Apparently not.
a = np.array([2, 5, 6, 7, 8])
b = np.array([45, 36, 45, 78, 94])
a.dot(b), b.dot(a)

(1838, 1838)

In [79]:
# Now, let's add dimensionality, like our situation where a = tiled j
j = np.array([[2, 5, 0, 7, 8], [2, 5, 0, 7, 8], [2, 5, 0, 7, 8]])
b = np.array([[45, 36, 45, 0, 94],[33, 33, 0, 34, 54], [0, 0 ,0, 0, 0]] )
print j.dot(b.T)
print '\n'
print b.dot(j.T)
# It does matter. Although the values are the same, the results are rotated
# 90 degrees
# The only thing left now is to figure out which numbers match which
'''
CONCLUSION: We like multiplying the archive.dot(journal_entry), then taking
any COLUMN as the cosign similarities between each row and the journal.

1. So, to pull this off, we need to tile our vectorized journal entry into
a matrix that is the same entire size as the archive...

2. Run both matrices through sklearn cosine_similarity

3. Harvest any column and add that to the dataframe

4. Retrieve the post ids closest to 1.0

5. Retrieve the cleaned up post body or url.

'''

[[1022  901    0]
 [1022  901    0]
 [1022  901    0]]


[[1022 1022 1022]
 [ 901  901  901]
 [   0    0    0]]


In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
this = np.random.choice(range(2,15), size=(4, 50))
that = np.random.choice(range(2, 15), size=(1, 50))


In [56]:
# make an array of repeated rows
yo = np.tile(that, 4).reshape(4,50)
yo

array([[ 8, 14, 13,  9, 12, 10,  4,  5,  3, 14, 11,  9, 11, 13, 13, 14,  9,
         6,  4, 13,  9, 13, 10,  7,  9,  3,  6, 12,  5,  2,  2,  8, 11,  8,
        13,  2,  3,  6,  2, 13,  6, 14,  7, 13,  2,  7, 12,  9, 13,  3],
       [ 8, 14, 13,  9, 12, 10,  4,  5,  3, 14, 11,  9, 11, 13, 13, 14,  9,
         6,  4, 13,  9, 13, 10,  7,  9,  3,  6, 12,  5,  2,  2,  8, 11,  8,
        13,  2,  3,  6,  2, 13,  6, 14,  7, 13,  2,  7, 12,  9, 13,  3],
       [ 8, 14, 13,  9, 12, 10,  4,  5,  3, 14, 11,  9, 11, 13, 13, 14,  9,
         6,  4, 13,  9, 13, 10,  7,  9,  3,  6, 12,  5,  2,  2,  8, 11,  8,
        13,  2,  3,  6,  2, 13,  6, 14,  7, 13,  2,  7, 12,  9, 13,  3],
       [ 8, 14, 13,  9, 12, 10,  4,  5,  3, 14, 11,  9, 11, 13, 13, 14,  9,
         6,  4, 13,  9, 13, 10,  7,  9,  3,  6, 12,  5,  2,  2,  8, 11,  8,
        13,  2,  3,  6,  2, 13,  6, 14,  7, 13,  2,  7, 12,  9, 13,  3]])

In [76]:
sims = cosine_similarity(yo, this)
sims2 = cosine_similarity(this, yo)
print sims
print '\n'
print sims2
print '\n'
print np.rot90(sims)[:,0]

[[ 0.78957739  0.849316    0.77042722  0.80133563]
 [ 0.78957739  0.849316    0.77042722  0.80133563]
 [ 0.78957739  0.849316    0.77042722  0.80133563]
 [ 0.78957739  0.849316    0.77042722  0.80133563]]


[[ 0.78957739  0.78957739  0.78957739  0.78957739]
 [ 0.849316    0.849316    0.849316    0.849316  ]
 [ 0.77042722  0.77042722  0.77042722  0.77042722]
 [ 0.80133563  0.80133563  0.80133563  0.80133563]]


[ 0.80133563  0.77042722  0.849316    0.78957739]


In [26]:
j = journal_norm()

What's up?I'm tired. I have not been working out, and I'm already hanging off the wagon.


In [36]:
df['vec_norm']

0     11.448127
1     11.539718
2      9.398894
3     26.290084
4     18.403477
5     18.862995
6      7.370715
7     25.845860
8      4.901380
9     17.106235
10    10.851570
11    15.585636
12     3.119232
13     8.331229
14    25.023286
15    33.481427
16    11.492370
17     6.319651
18    19.891184
19     8.096383
Name: vec_norm, dtype: float64

In [219]:
STOPLIST

{"'m",
 "'s",
 u'a',
 u'about',
 u'above',
 'across',
 u'after',
 'afterwards',
 u'again',
 u'against',
 u'ain',
 u'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 u'am',
 'among',
 'amongst',
 'amoungst',
 'amount',
 u'an',
 u'and',
 'another',
 u'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 u'are',
 u'aren',
 'around',
 u'as',
 u'at',
 'back',
 u'be',
 'became',
 u'because',
 'become',
 'becomes',
 'becoming',
 u'been',
 u'before',
 'beforehand',
 'behind',
 u'being',
 u'below',
 'beside',
 'besides',
 u'between',
 'beyond',
 'bill',
 u'both',
 'bottom',
 u'but',
 u'by',
 'ca',
 'call',
 u'can',
 'cannot',
 'cant',
 'co',
 'con',
 'could',
 u'couldn',
 'couldnt',
 'cry',
 u'd',
 'de',
 'describe',
 'detail',
 u'did',
 u'didn',
 u'do',
 u'does',
 u'doesn',
 u'doing',
 u'don',
 'done',
 u'down',
 'due',
 u'during',
 u'each',
 'eg',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'etc',
 'even',
 'ever'

In [227]:
# make a list of all the text samples, tuple(post_id, unicode.body)
wagon_thread = [(sample[0], sample[6].decode('utf-8')) for sample in samples] 
len(wagon_thread)

20

In [228]:
samp_post = wagon_thread[0][1]
samp_post

u'I\'m one of THOSE people - I get all jazzed up about losing weight and exercising, I get my plan all set in place (I love lists and charts), then when it comes to actually DOING it, after about a day, 3 days, a week, I fail. I\'ll have a mountain dew, I\'ll skip a workout, I\'ll order a burger instead of fish, etc.\n\n Now, failure is a part of life. These things happen. Intellectually, I understand this. Emotionally, I do not. As soon as I "screw up", I (figuratively) throw my hands in the air and give up on it all. Suffering from perfectionism, I see this one black mark and decide that everything is ruined, I\'m a failure, I\'ll never achieve my goals - and therefore eat whatever I want, whenever I want, and sit on the couch, binge-watching TV on Netflix in order to escape the feelings of despair, disappointment, and helplessness that come with failure.\n\n I need advice - how do you accept failure? How do you forgive yourself when you "screw up"? How do you pick yourself back up a

In [229]:
cleaned = cleanText(samp_post)
cleaned

u'i am one of those people - i get all jazzed up about losing weight and exercising, i get my plan all set in place (i love lists and charts), then when it comes to actually doing it, after about a day, 3 days, a week, i fail. i will have a mountain dew, i will skip a workout, i will order a burger instead of fish, etc.   now, failure is a part of life. these things happen. intellectually, i understand this. emotionally, i do not. as soon as i "screw up", i (figuratively) throw my hands in the air and give up on it all. suffering from perfectionism, i see this one black mark and decide that everything is ruined, i am a failure, i will never achieve my goals - and therefore eat whatever i want, whenever i want, and sit on the couch, binge-watching tv on netflix in order to escape the feelings of despair, disappointment, and helplessness that come with failure.   i need advice - how do you accept failure? how do you forgive yourself when you "screw up"? how do you pick yourself back up a

In [230]:
tokenized_samp = tokenizeText(cleaned)

In [231]:
tokenized_samp

[u'people',
 u'jazz',
 u'lose',
 u'weight',
 u'exercise',
 u'plan',
 u'set',
 u'place',
 u'love',
 u'list',
 u'chart',
 u'come',
 u'actually',
 u'day',
 u'3',
 u'day',
 u'week',
 u'fail',
 u'mountain',
 u'dew',
 u'skip',
 u'workout',
 u'order',
 u'burger',
 u'instead',
 u'fish',
 u'failure',
 u'life',
 u'thing',
 u'happen',
 u'intellectually',
 u'understand',
 u'emotionally',
 u'soon',
 u'screw',
 u'figuratively',
 u'throw',
 u'hand',
 u'air',
 u'suffer',
 u'perfectionism',
 u'black',
 u'mark',
 u'decide',
 u'ruin',
 u'failure',
 u'achieve',
 u'goal',
 u'eat',
 u'whatev',
 u'want',
 u'want',
 u'sit',
 u'couch',
 u'binge',
 u'watch',
 u'tv',
 u'netflix',
 u'order',
 u'escape',
 u'feeling',
 u'despair',
 u'disappointment',
 u'helplessness',
 u'come',
 u'failure',
 u'ne',
 u'advice',
 u'accept',
 u'failure',
 u'?',
 u'forgive',
 u'screw',
 u'?',
 u'pick',
 u'wagon',
 u'?']

In [234]:
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))

sklearn.feature_extraction.text.CountVectorizer

In [243]:
# Started our pipeline. How do I feed my collection of raw texts into it?
pipe = Pipeline([
    ('cleanText', CleanTextTransformer()),
    ('vectorizer', vectorizer)
     ])

In [444]:
# We're going to take this all the way to a term frequency matrix
blurb1 = """Here is a sample text. Hello. Is there anybody out there. Just nod if you 
can hear me. Is there anyone home? C'mon, now, I hear you're feeling down. I can ease your
pain and get you on your feet again. Relax, it's just a little pin prick. That'll keep you 
going through the show. C'mon it's time to go."""

blurb2 = """When we grew up and went to school, there were certain teachers who would hurt
the children any way they could. By pouring their derision upon everything we did, exposing
every weakness, however carefully hidden by the kids. But it was known when they got home at
night their fat and psychopathic wives would thrash them within inches of their lives.
We don't need no education. We don't need no thought control. No dark sarcasm
the classroom. Teacher leave them kids alone.
"""
blurb3 = """The itsy-bitsy spider went up the waterspout. Down came the rain and washed
the spider out. Out came the sun and dried up all the rain, and the itsy-bitsy spider went
up the spout again.
"""
blurbs = [blurb1, blurb2, blurb3]

In [244]:
pipe.fit(samp_post)

Pipeline(steps=[('cleanText', <loseit_nlp.CleanTextTransformer object at 0x128871e90>), ('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df...n=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenizeText at 0x11ad12de8>, vocabulary=None))])

In [276]:
M = pipe.transform(samp_post).todense().T

In [300]:
# First, clean it...
blurb_clean = cleanText(blurb).decode('utf-8')
blurb_clean

u"here is a sample text. hello. is there anybody out there. just nod if you  can hear me. is there anyone home? c'mon, now, i hear you're feeling down. i can ease your pain and get you on your feet again. relax, it's just a little pin prick. that will keep you  going through the show. c'mon it's time to go."

In [301]:
# now, tokenize...
blurb_bows = tokenizeText(blurb_clean)
blurb_bows

In [445]:
# now, from there to a term frequency matrix, right? 
# Do we use CountVectorizer? That takes in a series of texts, correct? Here, I only have one.
veczr = CountVectorizer(tokenizer=tokenizeText)
veczr.fit(blurbs)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenizeText at 0x11ad12938>, vocabulary=None)

In [446]:
# how do I see the vocabulary?
veczr.vocabulary_

{u'?': 0,
 u'anybody': 1,
 u'bitsy': 2,
 u"c'mon": 3,
 u'carefully': 4,
 u'certain': 5,
 u'child': 6,
 u'classroom': 7,
 u'come': 8,
 u'control': 9,
 u'dark': 10,
 u'derision': 11,
 u'dry': 12,
 u'ease': 13,
 u'education': 14,
 u'expose': 15,
 u'fat': 16,
 u'feel': 17,
 u'foot': 18,
 u'grow': 19,
 u'hear': 20,
 u'hello': 21,
 u'hide': 22,
 u'home': 23,
 u'hurt': 24,
 u'inch': 25,
 u'itsy': 26,
 u'kid': 27,
 u'know': 28,
 u'leave': 29,
 u'life': 30,
 u'little': 31,
 u'need': 32,
 u'night': 33,
 u'nod': 34,
 u'pain': 35,
 u'pin': 36,
 u'pour': 37,
 u'prick': 38,
 u'psychopathic': 39,
 u'rain': 40,
 u'relax': 41,
 u'sample': 42,
 u'sarcasm': 43,
 u'school': 44,
 u'spider': 45,
 u'spout': 46,
 u'sun': 47,
 u'teacher': 48,
 u'text': 49,
 u'thought': 50,
 u'thrash': 51,
 u'time': 52,
 u'wash': 53,
 u'waterspout': 54,
 u'way': 55,
 u'weakness': 56,
 u'wife': 57}

In [447]:
# how do I get the countvector?
M = veczr.transform(blurbs).todense()
M

matrix([[1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 2, 1,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
         1, 1, 1, 1, 0, 2, 1, 1, 1, 0, 2, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
         1, 0, 0, 0, 2, 0, 1, 1, 0, 0, 0, 1, 1, 1],
        [0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
         0, 3, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]])

In [448]:
# or can we just skip all that an go straight to the TfidfVectorizer?
tfidfer = TfidfVectorizer(tokenizer=tokenizeText, 
                          min_df=0.3, 
                          stop_words=STOPLIST,
                          decode_error='ignore',
                          norm=None)

In [449]:
# can skip this step with fit_transform method, but here we can examine the vocab
# with fitz.vocabulary_ 
fitz = tfidfer.fit(blurbs)

In [450]:
M = np.array(fitz.transform(blurbs).todense())

In [451]:
# and here is our tfidf matrix!!
M

array([[ 1.69314718,  1.69314718,  0.        ,  3.38629436,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.69314718,  0.        ,
         0.        ,  0.        ,  1.69314718,  1.69314718,  0.        ,
         3.38629436,  1.69314718,  0.        ,  1.28768207,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.69314718,  0.        ,  0.        ,  1.69314718,
         1.69314718,  1.69314718,  0.        ,  1.69314718,  0.        ,
         0.        ,  1.69314718,  1.69314718,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.69314718,
         0.        ,  0.        ,  1.69314718,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.69314718,
         1.69314718,  1.69314718,  1.69314718,  0.        ,  1.69314718,
   

In [457]:
# Just verify that these documents, though normed, have different vectors (unit vectors)
# Shit! I had to non-normalize the vectorizer to get this to work.
for i, row in enumerate(M):
    print "Vector norm of blurb_{}: {}".format(i+1, np.linalg.norm(M[i]))

Vector norm of blurb_1: 8.2215153558
Vector norm of blurb_2: 10.6518201612
Vector norm of blurb_3: 9.27374903969


In [460]:
# Okay, here's the numerator of the cos-sim formula
cos_numerator = M.dot(M.T)
cos_numerator

array([[  67.59331475,    1.65812512,    0.        ],
       [   1.65812512,  113.46127275,    0.        ],
       [   0.        ,    0.        ,   86.00242125]])

In [492]:
# How do I get the denominator
denom = np.linalg.norm(M, axis=1)*np.linalg.norm(M.T, axis=0)

In [493]:
denom

array([  67.59331475,  113.46127275,   86.00242125])

In [488]:
from itertools import permutations

In [491]:
for t in permutations(g, 2):
    print t

(8.2215153557960257, 10.651820161183689)
(8.2215153557960257, 9.2737490396895446)
(10.651820161183689, 8.2215153557960257)
(10.651820161183689, 9.2737490396895446)
(9.2737490396895446, 8.2215153557960257)
(9.2737490396895446, 10.651820161183689)


In [479]:
# Our cosin similarity matrix!!! Am I going to need it?
cos_numerator / denom

array([[ 1.        ,  0.01461402,  0.        ],
       [ 0.0245309 ,  1.        ,  0.        ],
       [ 0.        ,  0.        ,  1.        ]])

In [454]:
fac = np.array([[2,2,5], [1,12,7]])

In [455]:
fac*fac

array([[  4,   4,  25],
       [  1, 144,  49]])

In [418]:
rows = fac.shape[0]
fac / np.sqrt((fac*fac).sum(axis=1)).reshape(rows, 1)

array([[ 0.34815531,  0.34815531,  0.87038828],
       [ 0.07179582,  0.86154979,  0.50257071]])

In [439]:
M.shape, fac.shape

((2, 48), (2, 3))

In [440]:
(M*M).sum(axis=1)

array([ 46.43263991,  78.03795463])

In [441]:
# next, Mike wants us to normalize this matrix...
rows = M.shape[0]
cols = M.shape[1]
M_normed = M / np.sqrt((M*M).sum(axis=1)).reshape(rows, 1)

In [442]:
M_normed

array([[ 0.20625685,  0.20625685,  0.4125137 ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.20625685,  0.        ,  0.        ,  0.        ,  0.20625685,
         0.20625685,  0.        ,  0.4125137 ,  0.20625685,  0.        ,
         0.14675345,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.20625685,  0.        ,  0.        ,
         0.20625685,  0.20625685,  0.20625685,  0.        ,  0.20625685,
         0.        ,  0.20625685,  0.20625685,  0.        ,  0.        ,
         0.        ,  0.20625685,  0.        ,  0.        ,  0.20625685,
         0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.15909888,  0.15909888,
         0.15909888,  0.15909888,  0.15909888,  0.15909888,  0.15909888,
         0.        ,  0.15909888,  0.15909888,  0.15909888,  0.        ,
         0.        ,  0.15909888,  0.        ,  0.        ,  0.15909888,
   

In [443]:
M_normed.dot(M_normed.T)

array([[ 1.        ,  0.01661251],
       [ 0.01661251,  1.        ]])

In [None]:


# following this tutorial: https://spacy.io/docs/usage/language-processing-pipeline
nlp = spacy.load('en')

In [109]:
bows = [nlp.tokenizer(post[1]) for post in wagon_thread] # A lst of spaCy docs

In [110]:
bow = bows[0]

In [125]:
# This parses, tokenizes and tags the set of texts, yeilding spaCy docs
# for each one in order
bows2 = nlp.pipe(wagon_thread)

In [114]:
import itertools

In [126]:
postsbows = []
for bow in bows2:
    postsbows.append(bow)

In [144]:
spec_1 = postsbows[0]
spec_1

I'm one of THOSE people - I get all jazzed up about losing weight and exercising, I get my plan all set in place (I love lists and charts), then when it comes to actually DOING it, after about a day, 3 days, a week, I fail. I'll have a mountain dew, I'll skip a workout, I'll order a burger instead of fish, etc.

 Now, failure is a part of life. These things happen. Intellectually, I understand this. Emotionally, I do not. As soon as I "screw up", I (figuratively) throw my hands in the air and give up on it all. Suffering from perfectionism, I see this one black mark and decide that everything is ruined, I'm a failure, I'll never achieve my goals - and therefore eat whatever I want, whenever I want, and sit on the couch, binge-watching TV on Netflix in order to escape the feelings of despair, disappointment, and helplessness that come with failure.

 I need advice - how do you accept failure? How do you forgive yourself when you "screw up"? How do you pick yourself back up and get back 

In [147]:
tks = spec_1[125:150]

In [148]:
for tk in tks:
    print "Token: {}, POS: {}".format(tk.orth_, tk.pos_)

Token: air, POS: NOUN
Token: and, POS: CCONJ
Token: give, POS: VERB
Token: up, POS: PART
Token: on, POS: ADP
Token: it, POS: PRON
Token: all, POS: DET
Token: ., POS: PUNCT
Token: Suffering, POS: VERB
Token: from, POS: ADP
Token: perfectionism, POS: NOUN
Token: ,, POS: PUNCT
Token: I, POS: PRON
Token: see, POS: VERB
Token: this, POS: DET
Token: one, POS: NUM
Token: black, POS: ADJ
Token: mark, POS: NOUN
Token: and, POS: CCONJ
Token: decide, POS: VERB
Token: that, POS: ADP
Token: everything, POS: NOUN
Token: is, POS: VERB
Token: ruined, POS: VERB
Token: ,, POS: PUNCT


In [213]:
# Tokenizes a unicode text sample
parser = English
parsedData = parser()
x = parsedData(specimen_1)
x

I'm one of THOSE people - I get all jazzed up about losing weight and exercising, I get my plan all set in place (I love lists and charts), then when it comes to actually DOING it, after about a day, 3 days, a week, I fail. I'll have a mountain dew, I'll skip a workout, I'll order a burger instead of fish, etc.

 Now, failure is a part of life. These things happen. Intellectually, I understand this. Emotionally, I do not. As soon as I "screw up", I (figuratively) throw my hands in the air and give up on it all. Suffering from perfectionism, I see this one black mark and decide that everything is ruined, I'm a failure, I'll never achieve my goals - and therefore eat whatever I want, whenever I want, and sit on the couch, binge-watching TV on Netflix in order to escape the feelings of despair, disappointment, and helplessness that come with failure.

 I need advice - how do you accept failure? How do you forgive yourself when you "screw up"? How do you pick yourself back up and get back 

In [78]:
for i, token in enumerate(x[:10]):
    print "{}: {}".format(token.orth_, token.pos_)


I: PRON
'm: VERB
one: NUM
of: ADP
THOSE: DET
people: NOUN
-: PUNCT
I: PRON
get: VERB
all: DET


In [80]:
sents = []
for span in x.sents:
    sent = ''.join(x[i].string for i in range(span.start, span.end)).strip()
    sents.append(sent)
    
for sentence in sents:
    print sentence + '\n'

I'm one of THOSE people - I get all jazzed up about losing weight and exercising, I get my plan all set in place (I love lists and charts), then when it comes to actually DOING it, after about a day, 3 days, a week, I fail.

I'll have a mountain dew, I'll skip a workout, I'll order a burger instead of fish, etc.

Now, failure is a part of life.

These things happen.

Intellectually, I understand this.

Emotionally, I do not.

As soon as I "screw up", I (figuratively) throw my hands in the air and give up on it all.

Suffering from perfectionism, I see this one black mark and decide that everything is ruined, I'm a failure, I'll never achieve my goals - and therefore eat whatever I want, whenever I want, and sit on the couch, binge-watching TV on Netflix in order to escape the feelings of despair, disappointment, and helplessness that come with failure.

I need advice - how do you accept failure?

How do you forgive yourself when you "screw up"?

How do you pick yourself back up and get

In [158]:
punc = string.punctuation
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [160]:
punc.split(' ').pop(0)

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [165]:
samples[0]

('36492-1',
 36492,
 'Accepting Failure - Falling Off the Band Wagon',
 'Rachael',
 11875157,
 '06/17/2014',
 'I\'m one of THOSE people - I get all jazzed up about losing weight and exercising, I get my plan all set in place (I love lists and charts), then when it comes to actually DOING it, after about a day, 3 days, a week, I fail. I\'ll have a mountain dew, I\'ll skip a workout, I\'ll order a burger instead of fish, etc.\n\n Now, failure is a part of life. These things happen. Intellectually, I understand this. Emotionally, I do not. As soon as I "screw up", I (figuratively) throw my hands in the air and give up on it all. Suffering from perfectionism, I see this one black mark and decide that everything is ruined, I\'m a failure, I\'ll never achieve my goals - and therefore eat whatever I want, whenever I want, and sit on the couch, binge-watching TV on Netflix in order to escape the feelings of despair, disappointment, and helplessness that come with failure.\n\n I need advice - h

In [15]:
for i, sample in enumerate(samples):
    print "Sample Text: {}".format(i+1)
    print sample[6] + '\n\n'


Sample Text: 1
I'm one of THOSE people - I get all jazzed up about losing weight and exercising, I get my plan all set in place (I love lists and charts), then when it comes to actually DOING it, after about a day, 3 days, a week, I fail. I'll have a mountain dew, I'll skip a workout, I'll order a burger instead of fish, etc.

 Now, failure is a part of life. These things happen. Intellectually, I understand this. Emotionally, I do not. As soon as I "screw up", I (figuratively) throw my hands in the air and give up on it all. Suffering from perfectionism, I see this one black mark and decide that everything is ruined, I'm a failure, I'll never achieve my goals - and therefore eat whatever I want, whenever I want, and sit on the couch, binge-watching TV on Netflix in order to escape the feelings of despair, disappointment, and helplessness that come with failure.

 I need advice - how do you accept failure? How do you forgive yourself when you "screw up"? How do you pick yourself back u