Import libraries

In [21]:
import os

import numpy as np
import pandas as pd
import sklearn.feature_selection
import sklearn.linear_model as sklm
import sklearn.model_selection as skms
import sklearn.pipeline
from scipy.stats import uniform
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

# Import our own helper modules
from load_train_data import load_data 
from clean_data import tokenize_text, create_word_list, create_dict, make_feature_vector

In [22]:
# Load the raw training data. load_data returns three values that are unpacked
# as websites, review texts and ratings (presumably parallel, one entry per
# example -- confirm against load_train_data.py).
website_list, review_list, rating_list = load_data('x_train.csv', 'y_train.csv')

# Hand-rolled token statistics, kept so they can be compared with the
# CountVectorizer vocabulary below.
tokens = tokenize_text(review_list)
word_count_dict = create_word_list(tokens)  # dictionary of word counts
# Tokens ordered from most to least frequent (sorted() already returns a list).
sorted_tokens = sorted(word_count_dict, key=word_count_dict.get, reverse=True)

# Bag-of-words vocabulary built by scikit-learn instead of the hand-rolled tokens:
#   stop_words='english' drops common English function words,
#   min_df=3             ignores terms that occur in fewer than 3 reviews,
#   max_df=0.08          ignores terms that occur in more than 8% of the reviews.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=3, max_df=0.08, binary=False)

# fit_transform learns the vocabulary and returns the sparse document-term
# count matrix. The previous `X = vectorizer.fit(...)` only bound X to the
# vectorizer itself, and the bare build_preprocessor()/build_tokenizer() calls
# returned callables that were thrown away, so they have been removed.
# CountVectorizer ignores labels, so rating_list is not passed.
X = vectorizer.fit_transform(review_list)

# Save both vocabularies to text files for inspection. The encoding is explicit
# so non-ASCII review text does not depend on the platform default.
with open('wordFreq.txt', 'w', encoding='utf-8') as f:
    f.write(str(word_count_dict))

with open('afterCountVectorizer.txt', 'w', encoding='utf-8') as f:
    f.write(str(vectorizer.get_feature_names_out().tolist()))


# with open('output.txt', 'r') as f:
#     print(f.read())


In [None]:
#print(vectorizer.get_feature_names_out()[703])
#print(review_list)
# print(vectorizer.get_feature_names_out()[0:100])

In [6]:
# Order the CountVectorizer vocabulary by raw corpus frequency, most frequent
# first. A vocabulary term is not guaranteed to be a key of word_count_dict
# (the two are produced by different tokenizers), so a missing term is counted
# as 0. With the bare `word_count_dict.get` key a missing term yields None and
# sorted() raises TypeError; `word_count_dict[w]` would raise KeyError.
sorted_tokens = sorted(
    vectorizer.vocabulary_,
    key=lambda w: word_count_dict.get(w, 0),
    reverse=True,
)

print(type(sorted_tokens)) # list of tokens sorted by frequency

# Print all words in list and their frequencies from the dictionary
for w in sorted_tokens:
    print("%5d %s" % (word_count_dict.get(w, 0), w))

# Create a dictionary of the sorted tokens list
vocab_dict = create_dict(sorted_tokens)
print(vocab_dict)



<class 'list'>
 1560 the
  916 and
  707 a
  700 i
  609 is
  542 to
  534 it
  493 of
  493 this
  447 was
  328 in
  257 for
  244 not
  231 that
  212 with
  202 very
  201 my
  183 good
  176 on
  163 you
  162 great
  158 but
  147 have
  143 are
  141 so
  140 movie
  137 phone
  136 as
  119 film
  115 be
  115 all
  111 one
  109 had
  103 at
  100 place
   98 food
   95 like
   92 were
   90 an
   89 just
   86 there
   84 service
   84 time
   83 if
   82 we
   79 bad
   79 really
   78 out
   78 it's
   76 they
   76 from
   75 would
   69 has
   69 about
   68 well
   66 your
   64 only
   63 even
   63 ever
   63 best
   62 by
   62 back
   62 or
   61 don't
   60 -
   59 here
   57 also
   57 will
   56 no
   54 up
   53 go
   52 than
   51 quality
   51 when
   51 love
   50 me
   50 what
   49 can
   49 he
   48 made
   48 more
   47 product
   47 because
   47 excellent
   45 better
   45 which
   44 recommend
   44 some
   42 work
   42 i'm
   42 could
   42 i've
   4

In [None]:
# Smoke test for make_feature_vector.
# NOTE(review): testWords is defined but never used -- the call below passes
# sorted_tokens (the whole vocabulary list) instead. If the intent was to
# vectorize this short sample, pass testWords; confirm against clean_data.py.
testWords = ['the', 'a', 'and', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
# Feature vector produced for the full sorted vocabulary, looked up through vocab_dict.
count_V = make_feature_vector(sorted_tokens, vocab_dict)

print(count_V)

In [None]:
# Turning reviews into feature vectors: one row per review, one column per
# token in sorted_tokens.
N = len(review_list)      # number of reviews
V = len(sorted_tokens)    # vocabulary size

x_tr_NV = np.zeros((N,V))   # N x V matrix of feature vectors (float64 zeros, filled row by row below)


# Each review is converted by make_feature_vector and written into its row.
# NOTE(review): this assumes make_feature_vector returns something that can be
# assigned to a length-V row, with positions given by vocab_dict -- confirm in
# clean_data.py.
for nn, review_line in enumerate(review_list):
    x_tr_NV[nn] = make_feature_vector(review_line, vocab_dict)
# TODO are we using the right inputs and functions here?
# NOTE(review): vocab_dict is built from the CountVectorizer vocabulary, while
# review_line is processed by make_feature_vector; if the two tokenize
# differently, some columns may never be populated -- verify.
print(x_tr_NV.shape)

In [None]:
# Show the dictionary returned by create_dict(sorted_tokens); the cell that
# builds vocab_dict already prints the same value.
print(vocab_dict)

# sklearn.model_selection.RandomizedSearchCV
Check whether there is an industry-standard rule of thumb for what percentage of the training-set size the number of iterations should be. If there is one, use it and explain in the write-up that this is the criterion we applied, rather than trying to demonstrate that the search has reached an optimal minimum.

In [None]:
# Randomized cross-validated search over the hyper-parameters of a
# logistic-regression classifier. The search samples the inverse regularization
# strength C and the penalty type; max_iter is fixed at 300 and is NOT part of
# the search space.
logistic = sklm.LogisticRegression(solver='liblinear', max_iter=300)
# NOTE(review): C in [1e-9, 1e-6] is extremely strong regularization -- confirm
# this range is intended before trusting the selected model.
distributions = dict(C=np.logspace(-9, -6, 100), penalty=['l2', 'l1'])
# cv=None uses scikit-learn's default cross-validation splitter; random_state
# fixes which 100 candidates are sampled.
randClassifier = skms.RandomizedSearchCV(logistic, distributions, n_iter=100, cv=None, verbose=0, random_state=0,
                                         error_score='raise', return_train_score=True)

# Shuffle features and labels with the same permutation before the search
# (this gave only a tiny improvement in fit). The result goes into NEW names:
# rebinding x_tr_NV / rating_list made this cell non-idempotent and left
# rating_list out of step with review_list, so re-running the feature cell
# would have paired rows with the wrong labels.
x_tr_shuffled_NV, rating_shuffled_list = shuffle(x_tr_NV, rating_list, random_state=0)

# shuffle() permutes both inputs consistently, so every row keeps its own rating.
randClassifier.fit(x_tr_shuffled_NV, rating_shuffled_list)

In [None]:
# Just experimenting to ensure that if I pass 2 arrays in that shuffle shuffles both in the same way
# a = np.arange(18)
# b = np.arange(18)+7

# a, b = shuffle(a, b, random_state=0)

# print(a)
# print(b-7)

In [None]:
# Predicted class probabilities for the training rows: one row per example,
# one column per class (class 0, then class 1).
yhat = randClassifier.predict_proba(x_tr_NV)
print(type(yhat))

# Dump the probabilities as a nested-list literal so they can be inspected offline.
yhat_text = str(yhat.tolist())
with open('yhat.txt', 'w') as out_file:
    out_file.write(yhat_text)

In [None]:
# Area under the ROC curve of the classifier on the training data, scored from
# the second column of yhat (probability of class 1). This is an in-sample
# figure, and it is AUROC rather than accuracy, so the printed label says so.
train_auroc = roc_auc_score(rating_list, yhat[:, 1])
acc = train_auroc  # legacy name, kept in case later cells still read `acc`
print("Training AUROC: %.3f" % train_auroc)

In [None]:
# make_feature_vector()

In [None]:
# from sklearn.datasets import load_iris
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform
# iris = load_iris()
# logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, random_state=0)
# distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
# clf = RandomizedSearchCV(logistic, distributions, random_state=0)
# search = clf.fit(iris.data, iris.target)
# search.best_params_