In [4]:
import os
import numpy as np
import pandas as pd
import sklearn.linear_model as sklm
import sklearn.pipeline
import sklearn.model_selection as skms
import sklearn.feature_selection 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
# Import our files
from load_train_data import load_data 


Loading data from CSV files.

In [5]:
# Load the training data through the project-local load_data helper and unpack
# its three return values. review_list and rating_list are later passed to the
# pipeline's fit() as inputs and targets respectively.
# NOTE(review): presumably three parallel, equal-length lists (one entry per
# review) -- confirm against load_train_data.load_data.
website_list, review_list, rating_list = load_data('x_train.csv', 'y_train.csv')

# PROBLEM 1:
Defining pipeline for predicting and fitting:

In [6]:
# Base classifier whose regularisation strength and penalty type are tuned below.
logistic = sklm.LogisticRegression(solver='liblinear', max_iter=100)

# Discrete search space: 31 log-spaced C values x 2 penalties = 62 candidates.
distributions = dict(C=np.logspace(-9, 6, 31), penalty=['l2', 'l1'])

# The space is finite, so asking for more iterations than there are candidates
# (the previous n_iter=100) only triggers a scikit-learn warning before it
# falls back to sampling every candidate once. Requesting exactly the grid
# size evaluates the same candidates without the warning.
n_candidates = len(distributions['C']) * len(distributions['penalty'])

# Bag-of-words features followed by a cross-validated hyperparameter search.
# NOTE: the vectorizer is fitted on all of review_list before the search
# splits it into folds, so each validation fold shares its vocabulary with
# the training folds.
my_bow_classifier_pipeline = sklearn.pipeline.Pipeline([
    ('my_bow_feature_extractor', CountVectorizer(min_df=1, max_df=1.0, ngram_range=(1, 1))),
    ('cross validation', skms.RandomizedSearchCV(
        logistic,
        distributions,
        n_iter=n_candidates,
        cv=10,
        verbose=0,
        random_state=0,
        error_score='raise',
        return_train_score=True,
    )),
])

my_bow_classifier_pipeline.fit(review_list, rating_list)

# Class-probability estimates on the training reviews (one column per class).
# The former bare predict()/score() calls discarded their results and were
# removed as dead work.
probs = my_bow_classifier_pipeline.predict_proba(review_list)



In [7]:
# Handles to the two fitted pipeline stages.
cv_search = my_bow_classifier_pipeline['cross validation']
bow_vectorizer = my_bow_classifier_pipeline['my_bow_feature_extractor']

# Coefficients of the best logistic-regression model found by the search.
weights = cv_search.best_estimator_.coef_

# getting CountVectorizer dictionary (the fitted vocabulary_ attribute)
dictionary = bow_vectorizer.vocabulary_

print(cv_search.best_params_)

# Recompute the training probabilities here instead of reading the global
# `probs`: the test-prediction cell rebinds `probs` to a 1-D vector, which
# would make `probs[:, 1]` fail if this cell were re-run afterwards.
train_probs = my_bow_classifier_pipeline.predict_proba(review_list)

# roc_auc_score yields the area under the ROC curve, not accuracy, so the
# printed label says AUROC.
train_auroc = roc_auc_score(rating_list, train_probs[:, 1])
acc = train_auroc  # original variable name, kept in case later cells read it
print("Training AUROC: %.3f" % train_auroc)

{'penalty': 'l2', 'C': 1.0}
Training accuracy: 0.994


This model scored 0.89106 on the Gradescope submission.

In [8]:
# Location of the held-out test inputs.
data_dir = 'data_reviews'
x_te_data = 'x_test.csv'
x_te_path = os.path.join(data_dir, x_te_data)

# Read the test CSV and pull out the two columns as plain Python lists.
x_te_df = pd.read_csv(x_te_path)
te_website_list = x_te_df['website_name'].values.tolist()
te_text_list = x_te_df['text'].values.tolist()

# Keep only the second column of predict_proba for every test review.
# NOTE(review): presumably this is the probability of the positive label --
# confirm against the fitted classifier's classes_.
te_proba_matrix = my_bow_classifier_pipeline.predict_proba(te_text_list)
probs = te_proba_matrix[:, 1]
print(probs)

# Write one probability per line to the submission file.
np.savetxt('q1.txt', probs, fmt='%s')


[0.53394516 0.4760095  0.10120415 0.05030966 0.32383209 0.05893481
 0.04369099 0.20793626 0.23058191 0.47927785 0.40941855 0.61267748
 0.06951167 0.11076701 0.25786707 0.05449893 0.00735609 0.14522679
 0.38067303 0.63967442 0.44020526 0.46111084 0.26931365 0.24878015
 0.42375413 0.39155014 0.21319605 0.47269212 0.20008731 0.25979086
 0.54861097 0.22845893 0.38094212 0.00490693 0.33457314 0.35640788
 0.26650577 0.14854029 0.41721891 0.44249259 0.09374761 0.09317715
 0.51424534 0.21361626 0.19960758 0.03375936 0.23507921 0.05202372
 0.01614569 0.20008731 0.12203483 0.15485532 0.88203374 0.3400139
 0.32262402 0.22818082 0.15380822 0.04289929 0.66804447 0.10089488
 0.46964958 0.1109564  0.13282705 0.09016455 0.05959207 0.05211388
 0.0391277  0.61634297 0.06944637 0.51318222 0.0735873  0.44880093
 0.06685663 0.01573909 0.28335287 0.33613814 0.24898975 0.03503635
 0.54091134 0.1640306  0.20943812 0.0224517  0.15591127 0.3041604
 0.0098755  0.31128656 0.13127829 0.05961275 0.29768897 0.048148

# Problem 2

In [38]:
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

#punkt and stopwords downloaded

Tokenizing words again, but using nltk this time.

In [48]:
# Punctuation tokens that should never appear in the final token list.
punc = ['.', '..', '...', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}','%']

# Build the lookup sets once: stopwords.words() re-reads the corpus on every
# call, and set membership is O(1) versus a linear scan of a list.
stop_words = set(stopwords.words('english'))
punc_set = set(punc)

# Flat list of lower-cased tokens across all reviews, with English stopwords
# and punctuation removed.
#   * word_tokenize is applied to the whole review; iterating over the review
#     string directly would yield single characters.
#   * Tokens are lower-cased BEFORE the stopword test so that capitalised
#     stopwords ("The", "It") are filtered as well.
#   * Filtering happens while building the list, because removing items from
#     a list that is being iterated over skips elements.
tokenz = [
    token
    for review in review_list
    for token in (raw_token.lower() for raw_token in word_tokenize(review))
    if token not in stop_words and token not in punc_set
]


['oh', 'forgot', 'also', 'mention', 'weird', 'color', 'effect', 'phone', 'one', "n't", 'work', 'either', 'waste', '13', 'bucks', 'product', 'useless', 'since', 'enough', 'charging', 'current', 'charge', '2', 'cellphones', 'planning', 'use', 'none', 'three', 'sizes', 'sent', 'headset', 'would', 'stay', 'ears', 'worst', 'customer', 'service', 'ngage', 'still', 'lacking', 'earbuds', 'always', 'cuts', 'makes', 'beep', 'beep', 'beep', 'sound', 'says', 'signal', 'failed', 'disappointing', 'thing', 'speakerphone', 'disappointed', 'accessoryone', 'basically', 'service', 'bad', 'bad', 'choice', 'thing', 'disappoint', 'infra', 'red', 'port', 'irda', 'horrible', 'switch', '3', 'times', 'feels', 'poorly', 'constructed', 'menus', 'difficult', 'navigate', 'buttons', 'recessed', 'difficult', 'push', "n't", 'make', 'mistake', 'muddy', 'low', 'quality', 'sound', 'casing', 'around', 'wire', "'s", 'insert', 'poorly', 'super', 'glued', 'slid', 'advise', 'everyone', 'fooled']


In [49]:
# Preview the first 100 tokens as a quick sanity check of the tokenisation.
print(tokenz[:100])

['oh', 'forgot', 'also', 'mention', 'weird', 'color', 'effect', 'phone', 'one', "n't", 'work', 'either', 'waste', '13', 'bucks', 'product', 'useless', 'since', 'enough', 'charging', 'current', 'charge', '2', 'cellphones', 'planning', 'use', 'none', 'three', 'sizes', 'sent', 'headset', 'would', 'stay', 'ears', 'worst', 'customer', 'service', 'ngage', 'still', 'lacking', 'earbuds', 'always', 'cuts', 'makes', 'beep', 'beep', 'beep', 'sound', 'says', 'signal', 'failed', 'disappointing', 'thing', 'speakerphone', 'disappointed', 'accessoryone', 'basically', 'service', 'bad', 'bad', 'choice', 'thing', 'disappoint', 'infra', 'red', 'port', 'irda', 'horrible', 'switch', '3', 'times', 'feels', 'poorly', 'constructed', 'menus', 'difficult', 'navigate', 'buttons', 'recessed', 'difficult', 'push', "n't", 'make', 'mistake', 'muddy', 'low', 'quality', 'sound', 'casing', 'around', 'wire', "'s", 'insert', 'poorly', 'super', 'glued', 'slid', 'advise', 'everyone', 'fooled']
