In [None]:
import os
import numpy as np
import pandas as pd
import sklearn.linear_model as sklm
import sklearn.pipeline
import sklearn.model_selection as skms
import sklearn.feature_selection 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
import nltk

# Import our files
from load_train_data import load_data 


In [None]:
# Load the labelled reviews as three parallel sequences.
website_list, review_list, rating_list = load_data('x_train.csv', 'y_train.csv')
numReviewsTotal = len(website_list)

# Shuffle the three sequences together (seeded) so that every website/review/
# rating triple stays aligned after the reordering.
random_state = np.random.RandomState(0)
dataZip = list(zip(website_list, review_list, rating_list))
random_state.shuffle(dataZip)
website_list, review_list, rating_list = zip(*dataZip)

# The first 80% of the shuffled data is the training split (TR); the
# remaining 20% is the held-out split (TE).
numReviewsTrain = int(numReviewsTotal * 0.8)

website_TR, review_TR, rating_TR = (
    column[:numReviewsTrain]
    for column in (website_list, review_list, rating_list)
)
website_TE, review_TE, rating_TE = (
    column[numReviewsTrain:]
    for column in (website_list, review_list, rating_list)
)


In [None]:
# Shared estimator and hyperparameter space for every search below.
# random_state is fixed on the estimator because the liblinear solver uses it
# to shuffle the data; leaving it unset makes repeated runs pick different
# hyperparameters even though the search itself is seeded.
logistic = sklm.LogisticRegression(solver='liblinear', max_iter=1000, random_state=0)
distributions = dict(C=np.logspace(-9, 6, 31), penalty=['l2', 'l1'])


def fit_bow_classifier_pipeline(max_ngram, train_texts, train_labels, heldout_texts):
    """Fit a bag-of-words + logistic-regression search for n-grams up to `max_ngram`.

    The pipeline counts 1..max_ngram word n-grams with CountVectorizer and then
    runs a RandomizedSearchCV (10-fold) over the module-level `distributions`
    using the module-level `logistic` estimator.

    Args:
        max_ngram: upper bound of the CountVectorizer `ngram_range` (lower bound is 1).
        train_texts: sequence of review strings to fit on.
        train_labels: sequence of labels aligned with `train_texts`.
        heldout_texts: sequence of review strings to score after fitting.

    Returns:
        (pipeline, probs_train, probs_heldout): the fitted pipeline and the
        `predict_proba` outputs for `train_texts` and `heldout_texts`.
    """
    pipeline = sklearn.pipeline.Pipeline([
        ('my_bow_feature_extractor',
         CountVectorizer(min_df=1, max_df=1.0, ngram_range=(1, max_ngram))),
        # The step name is looked up by key in the evaluation cell; keep it as is.
        ('cross validation',
         skms.RandomizedSearchCV(logistic, distributions, n_iter=100, cv=10,
                                 verbose=0, random_state=0, error_score='raise',
                                 return_train_score=True)),
    ])
    pipeline.fit(train_texts, train_labels)
    probs_train = pipeline.predict_proba(train_texts)
    probs_heldout = pipeline.predict_proba(heldout_texts)
    return pipeline, probs_train, probs_heldout


# Unigrams only.
my_bow_classifier_pipeline1, probs1TR, probs1TE = fit_bow_classifier_pipeline(
    1, review_TR, rating_TR, review_TE)

# Unigrams + bigrams.
my_bow_classifier_pipeline2, probs2TR, probs2TE = fit_bow_classifier_pipeline(
    2, review_TR, rating_TR, review_TE)

# Unigrams + bigrams + trigrams.
my_bow_classifier_pipeline3, probs3TR, probs3TE = fit_bow_classifier_pipeline(
    3, review_TR, rating_TR, review_TE)

In [None]:
def summarize_bow_pipeline(pipeline, probs_train, probs_heldout, train_labels, heldout_labels):
    """Print the selected hyperparameters and AUROC scores of a fitted pipeline.

    Args:
        pipeline: fitted pipeline with the steps 'my_bow_feature_extractor'
            and 'cross validation'.
        probs_train: `predict_proba` output for the training split.
        probs_heldout: `predict_proba` output for the held-out split.
        train_labels: true labels of the training split.
        heldout_labels: true labels of the held-out split.

    Returns:
        (weights, vocabulary, auroc_train, auroc_heldout): the best estimator's
        `coef_`, the CountVectorizer `vocabulary_` dict, and the two AUROC values.
    """
    search = pipeline['cross validation']
    weights = search.best_estimator_.coef_

    # CountVectorizer vocabulary dictionary
    vocabulary = pipeline['my_bow_feature_extractor'].vocabulary_

    print(search.best_params_)

    # Column 1 of predict_proba is used as the score passed to roc_auc_score.
    # The values are AUROC, so they are labelled as such, and the held-out
    # score is no longer printed under a "Training" label.
    auroc_train = roc_auc_score(train_labels, probs_train[:, 1])
    print("Training AUROC: %.3f" % auroc_train)
    auroc_heldout = roc_auc_score(heldout_labels, probs_heldout[:, 1])
    print("Heldout AUROC: %.3f" % auroc_heldout)

    return weights, vocabulary, auroc_train, auroc_heldout


weights1, dictionary1, acc1TR, acc1TE = summarize_bow_pipeline(
    my_bow_classifier_pipeline1, probs1TR, probs1TE, rating_TR, rating_TE)

weights2, dictionary2, acc2TR, acc2TE = summarize_bow_pipeline(
    my_bow_classifier_pipeline2, probs2TR, probs2TE, rating_TR, rating_TE)

weights3, dictionary3, acc3TR, acc3TE = summarize_bow_pipeline(
    my_bow_classifier_pipeline3, probs3TR, probs3TE, rating_TR, rating_TE)

# Results - the recorded runs are not reproducible (the selected hyperparameters vary from run to run)
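Performance of the 1-, 2- and 3-gram models, each fit on a random 80% of the training data with 10-fold CV used to select the hyperparameters. For each model the output shows the selected hyperparameters, then the AUROC on that 80% training split, then the AUROC on the 20% held-out split (in the recorded output both lines carry the label "Training accuracy").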

```python
{'penalty': 'l2', 'C': 31.622776601683793}
Training accuracy: 1.000
Training accuracy: 0.858
{'penalty': 'l1', 'C': 31622.776601683792}
Training accuracy: 1.000
Training accuracy: 0.885
{'penalty': 'l1', 'C': 3162.2776601683795}
Training accuracy: 1.000
Training accuracy: 0.884
```


Attempt 2:
```python
{'penalty': 'l2', 'C': 31.622776601683793}
Training accuracy: 1.000
Training accuracy: 0.858
{'penalty': 'l1', 'C': 3162.2776601683795}
Training accuracy: 1.000
Training accuracy: 0.888
{'penalty': 'l1', 'C': 1000.0}
Training accuracy: 1.000
Training accuracy: 0.890
```

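Performance of the 1-, 2- and 3-gram models, each fit on a random 80% of the training data with 5-fold CV used to select the hyperparameters. For each model the output shows the selected hyperparameters, then the AUROC on that 80% training split, then the AUROC on the 20% held-out split (in the recorded output both lines carry the label "Training accuracy").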

```python
{'penalty': 'l2', 'C': 31.622776601683793}
Training accuracy: 1.000
Training accuracy: 0.858
{'penalty': 'l1', 'C': 10000.0}
Training accuracy: 1.000
Training accuracy: 0.883
{'penalty': 'l1', 'C': 1000.0}
Training accuracy: 1.000
Training accuracy: 0.885
```

In [None]:
# Read the test-set CSV and pull out its website and text columns as lists.
x_te_data = 'x_test.csv'
data_dir = 'data_reviews'
x_te_df = pd.read_csv(os.path.join(data_dir, x_te_data))
te_website_list, te_text_list = (
    x_te_df[column_name].values.tolist()
    for column_name in ('website_name', 'text')
)

# NOTE(review): te_website_list / te_text_list are not used in this cell, and
# the arrays written below are the probabilities for the 20% held-out split
# (review_TE), not for x_test.csv - confirm whether predictions on
# te_text_list were intended here.
for _out_path, _heldout_probs in (
    ('q2_1ngram.txt', probs1TE),
    ('q2_2ngram.txt', probs2TE),
    ('q2_3ngram.txt', probs3TE),
):
    np.savetxt(_out_path, _heldout_probs, fmt='%s')
