<a href="https://colab.research.google.com/github/mutherr/CS6120-PS1/blob/master/PS1_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import json
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,KFold,LeaveOneOut
from sklearn.preprocessing import scale
import numpy as np

In [0]:
#read in the movie review corpus
def readReviews():
  raw = requests.get("https://raw.githubusercontent.com/mutherr/CS6120-PS1-data/master/cornell_reviews.json").text.strip()
  corpus = [json.loads(line) for line in raw.split("\n")]
  return corpus

In [0]:
#Here is where you will featurize the data.
#NB: The current contents are for testing only
#This function should return: 
#  -a numpy matrix of document features
#  -a list of the correct class for each document
#  -a list of the vocabulary used by the features, such that the ith term of the
#    list is the word whose counts appear in the ith column of the matrix. 
def createFeatures(corpus):
  from sklearn.feature_extraction.text import CountVectorizer
  import string

  texts = [entry["text"] for entry in corpus]
  genres = [entry["class"] for entry in corpus]

  vectorizer = CountVectorizer()
  texts = vectorizer.fit_transform(texts).todense()
  vocab = vectorizer.get_feature_names()

  return texts,genres,vocab

In [0]:
#given a numpy matrix representation of the features for the training set, the 
# vector of true classes for each example, and the vocabulary as described 
# above, this computes the accuracy of the model using leave 10-fold cv
# validation and reports the most indicative features for each class
def evaluateModel(X,y,vocab,penalty="l1"):
  X = scale(X)
  #create and fit the model
  #This model literally will not fit to this data. The liblinear solver doesn't 
  # even try to update its parameters at all, saga and lbfgs run out of
  # iterations and learn nothing. Even when I use a smaller dataset (10%) of the
  # data (the same url but with cornell_reviews_small.json), it doesn't work.
  model = LogisticRegression(penalty=penalty,max_iter=1000,solver="saga",verbose=True)
  results = cross_validate(model,X,y,cv=KFold(n_splits=5,shuffle=True))

  #determine the average accuracy
  scores = results["test_score"]
  avg_score = sum(scores)/len(scores)
  print(scores)

  #determine the most informative features
  # this requires us to fit the model to everything, because we need a
  # single model to draw coefficients from, rather than 5
  model.fit(X,y)
  print(model.classes_)
  print(X.shape)
  print(model.coef_[0])
  print(model.n_iter_)
  neg_class_prob_sorted = model.coef_[0, :].argsort()
  pos_class_prob_sorted = (-model.coef_[0, :]).argsort()
  
  termsToTake = 20
  pos_indicators = [vocab[i] for i in neg_class_prob_sorted[:termsToTake]]
  neg_indicators = [vocab[i] for i in pos_class_prob_sorted[:termsToTake]]

  return avg_score,pos_indicators,neg_indicators

def evaluateModelL2(X,y,vocab):
  return evaluateModel(X,y,vocab,penalty="l2")

In [72]:
corpus = readReviews()

X,y,vocab = createFeatures(corpus)

# print("L1 norm")
# avg_score,pos_indicators,neg_indicators = evaluateModel(X,y,vocab)

# print("The model's average accuracy is %f"%avg_score)
# print("The most informative terms for neg are: %s"%pos_indicators)
# print("The most informative terms for pos are: %s"%neg_indicators)

print("L2 norm")
avg_score,pos_indicators,neg_indicators = evaluateModelL2(X,y,vocab)

print("The model's average accuracy is %f"%avg_score)
print("The most informative terms for neg are: %s"%pos_indicators)
print("The most informative terms for pos are: %s"%neg_indicators)

L2 norm


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 6 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 6 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 7 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 6 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 6 seconds
[0.1   0.075 0.1   0.075 0.075]


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 8 seconds
['neg' 'pos']
(200, 9141)
[-4.27271805e-11  8.01476382e-10  9.33485950e-11 ... -3.33940203e-10
 -4.93283427e-11  2.06499432e-10]
[400]
The model's average accuracy is 0.085000
The most informative terms for neg are: ['find', 'add', 'camera', 'lawyer', 'total', 'erotic', 'friend', 'named', 'futile', 'homage', 'runs', 'open', 'jeff', 'below', 'jobs', 'maybe', 'realize', 'vengeance', 'band', 'interesting']
The most informative terms for pos are: ['redundant', 'shelves', 'proceeds', 'door', 'derivative', 'implied', 'jay', 'obsesses', 'receives', 'maryam', 'stalk', 'typically', 'inexpensive', 'stake', 'interspersed', 'gleason', 'proliferation', 'guesswork', 'narrator', 'stalkers']


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.8s finished
