In [7]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_union

In [3]:
# One indicator-column name per star rating, 1 through 5.
class_names = ['stars_{}'.format(star) for star in range(1, 6)]

In [69]:
# Load only the review text and the star rating from the train/test CSVs.
# NOTE(review): these are absolute paths; consider a configurable data
# directory so the notebook runs on other machines.
train = pd.read_csv('/Yelp/dataset/model_train.csv', usecols=['text', 'stars'])
test = pd.read_csv('/Yelp/dataset/model_test.csv', usecols=['text', 'stars'])

# One-hot encode the rating: `stars` is replaced by one indicator column per
# distinct value, each prefixed with `stars_`.
# NOTE(review): train and test are encoded independently, so a rating value
# missing from one file yields a missing column there -- confirm both files
# contain every rating.
train = pd.get_dummies(train, columns=['stars'])
test = pd.get_dummies(test, columns=['stars'])

# Subsample the training set to keep fitting tractable. The seed is fixed so
# that re-running the notebook trains on the same rows, and the sample size
# is capped at the number of available rows so a smaller file does not raise.
train = train.sample(n=min(100000, len(train)), random_state=42)

In [70]:
# Pull the raw review text column out of each frame for vectorization.
train_text, test_text = train['text'], test['text']

In [71]:
# Stack the train and test review text into a single Series.
# NOTE(review): `all_text` is not referenced by any other cell visible here.
all_text = pd.concat(objs=[train_text, test_text])

In [72]:
# Word-level TF-IDF settings: unigrams only, tokens are runs of one or more
# word characters, accents stripped, sublinear tf scaling, and at most
# 10,000 features.
word_tfidf_params = dict(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
)
word_vectorizer = TfidfVectorizer(**word_tfidf_params)

In [73]:
# Character-level TF-IDF settings: character n-grams of length 1 to 4,
# accents stripped, sublinear tf scaling, and at most 10,000 features.
char_tfidf_params = dict(
    analyzer='char',
    ngram_range=(1, 4),
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
)
char_vectorizer = TfidfVectorizer(**char_tfidf_params)

In [74]:
# Combine the word-level and character-level TF-IDF transformers into one
# feature union, fitted/applied with up to 4 parallel jobs.
vectorizer = make_union(
    word_vectorizer,
    char_vectorizer,
    n_jobs=4,
)

In [75]:
# Fit the combined vectorizer on the training text only; the test text is
# not passed in here, so it does not influence what is learned.
vectorizer.fit(train_text)

FeatureUnion(n_jobs=4,
       transformer_list=[('tfidfvectorizer-1', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=...
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))],
       transformer_weights=None)

In [76]:
# Apply the fitted vectorizer to both splits (train first, then test).
train_features, test_features = (
    vectorizer.transform(texts) for texts in (train_text, test_text)
)

In [77]:
# Display the training feature matrix summary (shape, dtype, stored elements).
train_features

<100000x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 97577500 stored elements in Compressed Sparse Row format>

In [78]:
# Display the test feature matrix summary (shape, dtype, stored elements).
test_features

<70000x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 68473934 stored elements in Compressed Sparse Row format>

In [65]:
# NOTE(review): this cell reads `preds`, which is only created in the
# training-loop cell further down. On a fresh kernel with Run All this line
# raises NameError; move it below the training loop or delete it.
# Shows column index 1 of `preds` (the second entry of `class_names`).
preds[:,1]

array([0., 0., 0., ..., 0., 0., 0.])

In [79]:
# Train one binary logistic-regression classifier per star-rating column.
# For each class: estimate ROC AUC with 3-fold cross-validation on the
# training features, then refit on all training rows and store the predicted
# probability for every test row in the matching column of `preds`.
scores = []  # mean cross-validated ROC AUC, one entry per class
preds = np.zeros((len(test), len(class_names)))  # one row per test review, one column per class
for i, class_name in enumerate(class_names):
    train_target = train[class_name]
    # The 'sag' solver shuffles the data stochastically; pin random_state so
    # CV scores and test predictions are reproducible across runs.
    classifier = LogisticRegression(solver='sag', random_state=42)

    cv_score = np.mean(cross_val_score(classifier,
                                       train_features,
                                       train_target,
                                       cv=3,
                                       scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # cross_val_score fits clones, so the estimator must be fitted here
    # before it can predict on the test features.
    classifier.fit(train_features, train_target)
    # Column 1 of predict_proba is the second entry of classifier.classes_
    # (presumably the "review has this rating" indicator -- TODO confirm).
    preds[:, i] = classifier.predict_proba(test_features)[:, 1]

CV score for class stars_1 is 0.968511164774012
CV score for class stars_2 is 0.8921356001238253
CV score for class stars_3 is 0.8458568250243662
CV score for class stars_4 is 0.7697406309894004
CV score for class stars_5 is 0.9040553809867156


In [80]:
# Average the per-class CV scores into a single summary figure.
total_cv_score = np.mean(scores)
print('Total CV score is {}'.format(total_cv_score))

Total CV score is 0.876059920379664
