In [22]:
import os
import numpy as np
import pandas as pd
import sklearn.linear_model as sklm
import sklearn.pipeline
import sklearn.model_selection as skms
import sklearn.feature_selection 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
import nltk

# Import our files
from load_train_data import load_data 


In [40]:
# Load the three parallel columns (source website, review text, rating).
website_list, review_list, rating_list = load_data('x_train.csv', 'y_train.csv')
numReviewsTotal = len(website_list)

# Shuffle the rows jointly with a fixed seed so that the three columns stay
# aligned and the split below is the same on every run.
random_state = np.random.RandomState(0)
dataZip = list(zip(website_list, review_list, rating_list))
random_state.shuffle(dataZip)
website_list, review_list, rating_list = zip(*dataZip)

# The first 80% of the shuffled rows are used for training, the rest is held out.
numReviewsTrain = int(0.8 * numReviewsTotal)

website_TR, review_TR, rating_TR = (
    column[:numReviewsTrain]
    for column in (website_list, review_list, rating_list)
)
website_TE, review_TE, rating_TE = (
    column[numReviewsTrain:]
    for column in (website_list, review_list, rating_list)
)


('yelp', 'amazon', 'yelp', 'yelp', 'amazon', 'imdb', 'amazon', 'imdb', 'yelp', 'amazon', 'amazon', 'imdb', 'yelp', 'imdb', 'imdb', 'amazon', 'yelp', 'yelp', 'amazon', 'amazon', 'amazon', 'amazon', 'yelp', 'amazon', 'yelp', 'amazon', 'amazon', 'yelp', 'amazon', 'yelp', 'imdb', 'yelp', 'amazon', 'imdb', 'imdb', 'imdb', 'yelp', 'imdb', 'yelp', 'imdb', 'yelp', 'imdb', 'imdb', 'yelp', 'imdb', 'yelp', 'imdb', 'imdb', 'yelp', 'amazon', 'amazon', 'yelp', 'imdb', 'amazon', 'amazon', 'amazon', 'amazon', 'amazon', 'imdb', 'imdb', 'imdb', 'amazon', 'yelp', 'yelp', 'yelp', 'yelp', 'imdb', 'yelp', 'yelp', 'amazon', 'amazon', 'amazon', 'amazon', 'amazon', 'yelp', 'yelp', 'yelp', 'amazon', 'amazon', 'yelp', 'amazon', 'imdb', 'imdb', 'imdb', 'amazon', 'imdb', 'imdb', 'imdb', 'amazon', 'yelp', 'imdb', 'amazon', 'imdb', 'imdb', 'amazon', 'imdb', 'amazon', 'amazon', 'imdb', 'imdb', 'amazon', 'amazon', 'yelp', 'amazon', 'imdb', 'yelp', 'yelp', 'amazon', 'imdb', 'imdb', 'amazon', 'yelp', 'yelp', 'amazon', '

In [41]:
# Base classifier and hyperparameter search space shared by every pipeline.
logistic = sklm.LogisticRegression(solver='liblinear', max_iter=1000)
# 31 values of C times 2 penalties = 62 distinct settings.
# NOTE(review): that is fewer than n_iter=100 below; scikit-learn is expected
# to warn and evaluate the whole grid instead - confirm against the installed
# version.
distributions = dict(C=np.logspace(-9,6,31), penalty = ['l2', 'l1'])


def fit_bow_pipeline(max_ngram, reviews, ratings):
    """Fit a bag-of-words + logistic-regression pipeline and return it.

    Parameters
    ----------
    max_ngram : int
        Upper end of the CountVectorizer ``ngram_range``; the lower end is 1.
    reviews : sequence of str
        Raw review texts used for fitting.
    ratings : sequence
        Labels aligned with ``reviews``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline. Its steps are named 'my_bow_feature_extractor'
        and 'cross validation'; other cells index the pipeline by those names.
    """
    # NOTE(review): the vectorizer sits outside the randomized search, so its
    # vocabulary is built from all of `reviews` before the CV folds are made.
    pipeline = sklearn.pipeline.Pipeline([
        ('my_bow_feature_extractor', CountVectorizer(min_df=1, max_df=1.0, ngram_range=(1, max_ngram))),
        ('cross validation', skms.RandomizedSearchCV(logistic, distributions, n_iter=100, cv=10, verbose=0, random_state=0, error_score='raise', return_train_score=True))
    ])
    pipeline.fit(reviews, ratings)
    return pipeline


# Unigrams only.
my_bow_classifier_pipeline1 = fit_bow_pipeline(1, review_TR, rating_TR)
probs1 = my_bow_classifier_pipeline1.predict_proba(review_TR)
probs1TE = my_bow_classifier_pipeline1.predict_proba(review_TE)

# Unigrams and bigrams.
my_bow_classifier_pipeline2 = fit_bow_pipeline(2, review_TR, rating_TR)
probs2 = my_bow_classifier_pipeline2.predict_proba(review_TR)
probs2TE = my_bow_classifier_pipeline2.predict_proba(review_TE)

# Unigrams, bigrams and trigrams.
my_bow_classifier_pipeline3 = fit_bow_pipeline(3, review_TR, rating_TR)
probs3 = my_bow_classifier_pipeline3.predict_proba(review_TR)
probs3TE = my_bow_classifier_pipeline3.predict_proba(review_TE)



In [43]:
# For each n-gram pipeline: pull out the coefficients of the best estimator
# and the fitted vocabulary, print the selected hyperparameters, and print the
# ROC AUC on the training split and on the held-out split.
# Column 1 of each predict_proba matrix is used as the score
# (presumably the positive-class probability - verify against classes_).

# --- Pipeline 1: ngram_range=(1,1) ---
weights1 = my_bow_classifier_pipeline1['cross validation'].best_estimator_.coef_

#getting CountVectorizer dictionary
dictionary1 = my_bow_classifier_pipeline1['my_bow_feature_extractor'].vocabulary_

print(my_bow_classifier_pipeline1['cross validation'].best_params_)

acc1TR = roc_auc_score(rating_TR, probs1[:,1])
print("Training AUROC: %.3f" % acc1TR)
acc1TE = roc_auc_score(rating_TE, probs1TE[:,1])
print("Test AUROC: %.3f" % acc1TE)


# --- Pipeline 2: ngram_range=(1,2) ---
weights2 = my_bow_classifier_pipeline2['cross validation'].best_estimator_.coef_

#getting CountVectorizer dictionary
dictionary2 = my_bow_classifier_pipeline2['my_bow_feature_extractor'].vocabulary_

print(my_bow_classifier_pipeline2['cross validation'].best_params_)

acc2TR = roc_auc_score(rating_TR, probs2[:,1])
print("Training AUROC: %.3f" % acc2TR)
acc2TE = roc_auc_score(rating_TE, probs2TE[:,1])
print("Test AUROC: %.3f" % acc2TE)



# --- Pipeline 3: ngram_range=(1,3) ---
weights3 = my_bow_classifier_pipeline3['cross validation'].best_estimator_.coef_

#getting CountVectorizer dictionary
dictionary3 = my_bow_classifier_pipeline3['my_bow_feature_extractor'].vocabulary_

print(my_bow_classifier_pipeline3['cross validation'].best_params_)

acc3TR = roc_auc_score(rating_TR, probs3[:,1])
# Report this pipeline's own training score (acc3TR, not acc1TR).
print("Training AUROC: %.3f" % acc3TR)
acc3TE = roc_auc_score(rating_TE, probs3TE[:,1])
print("Test AUROC: %.3f" % acc3TE)

{'penalty': 'l2', 'C': 31.622776601683793}
Training accuracy: 1.000
{'penalty': 'l1', 'C': 1000.0}
Training accuracy: 1.000
{'penalty': 'l1', 'C': 3162.2776601683795}
Training accuracy: 1.000


In [28]:
# Read the unlabeled test reviews, predict a class label for each with every
# fitted pipeline, and write one prediction file per n-gram setting.
x_te_data = 'x_test.csv'
data_dir = 'data_reviews'
x_te_df = pd.read_csv(os.path.join(data_dir, x_te_data))
te_website_list = x_te_df['website_name'].values.tolist()
te_text_list = x_te_df['text'].values.tolist()

# .predict() returns hard class labels, not probabilities. They are stored
# under their own names so this cell does not overwrite probs1/probs2/probs3
# (the training-split probability matrices that the metrics cell indexes as
# probs[:,1]); the cell can then be re-run in any order.
te_labels1 = my_bow_classifier_pipeline1.predict(te_text_list)
np.savetxt('q2_1ngram.txt', te_labels1, fmt='%s')
te_labels2 = my_bow_classifier_pipeline2.predict(te_text_list)
np.savetxt('q2_2ngram.txt', te_labels2, fmt='%s')
te_labels3 = my_bow_classifier_pipeline3.predict(te_text_list)
np.savetxt('q2_3ngram.txt', te_labels3, fmt='%s')
