In [1]:
import os
import numpy as np
import pandas as pd
import sklearn.linear_model as sklm
import sklearn.pipeline
import sklearn.model_selection as skms
import sklearn.feature_selection 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

# Import our local helper modules
from load_train_data import load_data 


In [2]:
# Unpack the three values load_data returns for the training CSVs.
# review_list and rating_list are the inputs and targets handed to the
# pipeline's fit call further down.
(
    website_list,
    review_list,
    rating_list,
) = load_data('x_train.csv', 'y_train.csv')

In [3]:
# Base classifier and hyper-parameter search space: 31 log-spaced values of C
# crossed with the two penalties tried here.
logistic = sklm.LogisticRegression(solver='liblinear', max_iter=1000)
distributions = dict(C=np.logspace(-9, 6, 31), penalty=['l2', 'l1'])

# Every entry in `distributions` is a finite sequence, so the randomized
# search samples candidates without replacement. Cap the budget at the number
# of distinct candidates (31 * 2 = 62): requesting 100 only makes scikit-learn
# emit a warning before it falls back to evaluating the full grid.
n_candidates = int(np.prod([len(values) for values in distributions.values()]))
n_search_iter = min(100, n_candidates)

# Bag-of-words features feeding a 10-fold cross-validated logistic regression.
# NOTE(review): the vectorizer is fit once on all training reviews before the
# search splits them into folds, so each validation fold shares the vocabulary
# built from the full set. Wrap the whole pipeline in the search instead if
# the vectorizer settings are ever tuned.
my_bow_classifier_pipeline = sklearn.pipeline.Pipeline([
    ('my_bow_feature_extractor', CountVectorizer(min_df=1, max_df=1.0, ngram_range=(1, 1))),
    ('cross validation', skms.RandomizedSearchCV(
        logistic,
        distributions,
        n_iter=n_search_iter,
        cv=10,
        verbose=0,
        random_state=0,
        error_score='raise',  # fail loudly if any candidate cannot be fit
        return_train_score=True,
    )),
])

my_bow_classifier_pipeline.fit(review_list, rating_list)

# Per-class probability estimates on the training reviews (one column per
# class). The previous predict()/score() calls were dropped: their results
# were never stored or displayed.
probs = my_bow_classifier_pipeline.predict_proba(review_list)



In [4]:
# Coefficients of the best logistic-regression model found by the search.
weights = my_bow_classifier_pipeline['cross validation'].best_estimator_.coef_

# Vocabulary learned by the CountVectorizer (token -> feature column index).
dictionary = my_bow_classifier_pipeline['my_bow_feature_extractor'].vocabulary_

print(my_bow_classifier_pipeline['cross validation'].best_params_)

# roc_auc_score yields the area under the ROC curve, not accuracy, so the
# value is named and labelled as AUROC. Column 1 of `probs` holds the
# probability assigned to the second class.
train_auroc = roc_auc_score(rating_list, probs[:, 1])
print("Training AUROC: %.3f" % train_auroc)

{'penalty': 'l2', 'C': 1.0}
Training accuracy: 0.994


In [5]:
# Read the held-out test reviews.
x_te_data = 'x_test.csv'
data_dir = 'data_reviews'
x_te_df = pd.read_csv(os.path.join(data_dir, x_te_data))
te_website_list = x_te_df['website_name'].tolist()
te_text_list = x_te_df['text'].tolist()

# Probability of the second class for each test review. Stored under its own
# name so the training-set `probs` matrix is not overwritten; otherwise
# re-running the training-metric cell after this one would index a 1-D array
# with [:, 1] and fail.
te_probs = my_bow_classifier_pipeline.predict_proba(te_text_list)[:, 1]
print(te_probs)

# One probability per line, in the same order as the rows of x_test.csv.
np.savetxt('q1.txt', te_probs, fmt='%s')
