# nlp_q3.ipynb Bigram Model

Review Input:
BigramModel 
{u'text': u"Love it!!!!! Love it!!!!!! love it!!!!!!!   Who doesn't love Culver's!"}
BigramModel 
{u'text': u'Everything was great except for the burgers they are greasy and very charred compared to other stores.'}
BigramModel 
{u'text': u'I really like both Chinese restaurants in town.  This one has outstanding crab rangoon.  Love the chicken with snow peas and mushrooms and General Tso Chicken.  Food is always ready in 10 minutes which is accurate.  Good place and they give you free pop.'}
BigramModel 
{u'text': u'Above average takeout with friendly staff. The sauce on the pan fried noodle is tasty. Dumplings are quite good.'}
BigramModel 
{u'text': u"We order from Chang Jiang often and have never been disappointed.  The menu is huge, and can accomodate anyone's taste buds.  The service is quick, usually ready in 10 minutes."}


In [1]:
import simplejson as json
import random

In [44]:
lst_reviews = []
lst_stars = []
lst_tst = []

# build feature lists from random rows
with open("/home/vagrant/miniprojects/questions/3_week/nlp/data/tiny_review.json") as f:
    for line in f:
        tmp = json.loads(line)
        lst_reviews.append(tmp['text'])
        lst_stars.append(tmp['stars'])
        
# reviews come out tokenized- each review is 1 string, but not
# vectorized
y = lst_stars
print len(lst_stars), "reviews"


3000 reviews


In [None]:
## Define stopwords ###

# import nltk
# stops = nltk.corpus.stopwords.words('english')

# nltk corpus may not be available on heroku:
stops = [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now']

In [34]:
## only need if the input is not tokenized
# bigram, lengths 1 and 2 with hashing vectorizer
import nltk.tokenize
from sklearn.feature_extraction.text import HashingVectorizer

parameters = {'input':lst_reviews, 'decode_error':'ignore',
              'ngram_range':(1,2), 'non_negative':True, 'n_features':100000,
              'stop_words':stops}
ng_counter = HashingVectorizer(**parameters)

counts=ng_counter.fit_transform(lst_reviews)

X = counts
print counts.shape

# __ reviews, ___ unique words

<type 'numpy.ndarray'> <class 'scipy.sparse.csr.csr_matrix'>
(3000, 100000)


In [None]:
## Cross Validate ###
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.25)

print X_train.shape, len(y_train)
print X_test.shape, len(y_test)


In [36]:
# truncate this massive sparse matrix by SVD
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=100)

X_trn_tiny = svd.fit_transform(X_trn)

In [37]:
print type(X_trn_tiny), X_trn_tiny.shape
print X_trn_tiny[0,0]

<type 'numpy.ndarray'> (3000, 100)
0.0829893138809


In [38]:
# train the classifier
from sklearn.linear_model import LinearRegression
lnr = LinearRegression().fit(X_trn_tiny, lst_stars)

In [39]:
# save the model
from sklearn.externals import joblib

joblib.dump(lnr,'/home/vagrant/miniprojects/questions/3_week/nlp/pickles/q2_model.pkl')

['/home/vagrant/miniprojects/questions/3_week/nlp/pickles/q2_model.pkl',
 '/home/vagrant/miniprojects/questions/3_week/nlp/pickles/q2_model.pkl_01.npy',
 '/home/vagrant/miniprojects/questions/3_week/nlp/pickles/q2_model.pkl_02.npy']

In [47]:
X_tst = lst_tst
X_tst = ng_counter.fit_transform(X_tst).toarray()

print X_tst.shape

(1000, 100000)


In [48]:
# if this was raw strings, it would need to be hash vectorized
# since it's already in this form, we can svd it, then run predict
X_tst_tiny = svd.transform(X_tst)

print X_tst_tiny.shape

(1000, 100)


In [49]:
p = lnr.predict(X_tst_tiny)

In [50]:
print p

[ 2.87064916  3.25740874  3.21098448  3.58275315  4.15341595  2.17057596
  3.43758947  2.46798879  3.59993311  2.04720853  4.85214216  3.69692738
  2.41675093  3.52719658  2.79235823  2.84885614  3.86368597  3.49994699
  3.79819504  3.77745403  2.87364152  4.31719149  3.07291298  3.5758968
  4.96167515  2.79290435  1.89914632  3.542675    4.99239213  3.84442084
  2.43657226  3.72134614  3.73407898  2.87970679  4.06141634  3.24574102
  2.8887286   4.07996691  4.31824134  1.84470265  3.04631997  3.41363337
  4.51585433  4.56391276  4.26518821  3.52655678  3.28357723  4.38392247
  4.26349294  4.46233032  2.62094397  3.12754667  2.84641709  4.31719149
  3.30855275  3.65357261  3.71242116  3.05625629  3.6277285   3.05013919
  4.02434239  3.7433118   3.24103483  3.5758968   4.56101521  4.86162379
  3.54489123  4.25206442  3.35128282  2.84954176  2.88977188  4.13585638
  4.02451749  3.89241406  2.05482908  3.37966117  2.77152674  2.8414837
  3.22012299  2.91927728  3.80895915  2.71910234  4.0