In [1]:
import numpy as np 
import json
import time

In [2]:
def json_reader(fname):
    """
        Lazily read a JSON-lines file (one JSON document per line).

        The file is opened in a ``with`` block so the handle is released as
        soon as the generator is exhausted or closed, instead of being left
        open. It is decoded as UTF-8 explicitly rather than with the
        platform's locale encoding, and whitespace-only lines are skipped so
        a stray blank line does not raise ``json.JSONDecodeError``.

        Args:
            fname: str: input file
        Returns:
            generator: iterator over documents
        Raises:
            OSError: if the file cannot be opened (raised on first iteration)
            json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    with open(fname, mode="r", encoding="utf-8") as handle:
        for line in handle:
            # Blank separators carry no document; json.loads would reject them.
            if not line.strip():
                continue
            yield json.loads(line)

In [3]:
# Locations of the review dumps; json_reader parses each file one line at a time.
train_json = "C:/IITD/sem5/col774-ml/datasets/col774_yelp_data/col774_yelp_data/train.json"
test_json = "C:/IITD/sem5/col774-ml/datasets/col774_yelp_data/col774_yelp_data/test.json"

# Parallel accumulators: y holds the integer star label, docs the raw review text.
y, docs = [], []

for review in json_reader(train_json):
    rating = int(review["stars"])
    y.append(rating)
    docs.append(review["text"])

# Labels as an array; docs stays a plain list of strings for the vectorizer.
y_train = np.asarray(y)

In [4]:
# Read the held-out reviews the same way as the training set.
test_labels, docs_test = [], []

for review in json_reader(test_json):
    rating = int(review["stars"])
    test_labels.append(rating)
    docs_test.append(review["text"])

# y_test is bound once, directly as an array, so re-running the cell is harmless.
y_test = np.asarray(test_labels)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [8]:
# Naive Bayes baseline: unigram+bigram counts -> TF-IDF weighting -> MultinomialNB.
nb_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=nb_steps)

# fit() stays the final expression so the notebook displays the fitted pipeline.
text_clf.fit(docs, y_train)

Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [9]:
# Test-set accuracy of the Naive Bayes pipeline: fraction of exact label matches.
predicted = text_clf.predict(docs_test)
nb_matches = predicted == y_test
np.mean(nb_matches)

0.46974229348330065

In [8]:
# NOTE(review): svm_clf is only created by the LinearSVC cells further down this
# notebook, so on a fresh kernel this cell must be run after them.
# Test-set accuracy of the SVM pipeline: fraction of exact label matches.
predicted = svm_clf.predict(docs_test)
svm_matches = predicted == y_test
np.mean(svm_matches)

0.6697527632779431

In [19]:
from sklearn.linear_model import SGDClassifier

# SGD baseline with default vectorizer/TF-IDF settings; verbose=10 logs training progress.
sgd_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(verbose=10)),
]
sgd_clf = Pipeline(steps=sgd_steps)

# fit() stays the final expression so the notebook displays the fitted pipeline.
sgd_clf.fit(docs, y_train)

In [11]:
# Test-set accuracy of the baseline SGD pipeline: fraction of exact label matches.
predicted = sgd_clf.predict(docs_test)
sgd_matches = predicted == y_test
np.mean(sgd_matches)

0.6332580505242376

In [12]:
from sklearn import metrics

# Rows are the true labels, columns the predicted ones, for whichever model
# most recently wrote `predicted`.
metrics.confusion_matrix(y_true=y_test, y_pred=predicted)

array([[18123,   216,   314,   152,  1364],
       [ 5865,   727,  1738,   983,  1525],
       [ 2655,   419,  3078,  4393,  3986],
       [ 1073,   118,  1002,  6268, 20897],
       [  773,    39,   194,  1334, 56482]], dtype=int64)

In [25]:
# Search grid for the SGD text pipeline. Keys follow the "<step>__<param>"
# convention so GridSearchCV can route each value to the right pipeline step.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-3, 1e-4, 1e-5),
)

sgd_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(verbose=1)),
]
sgd_clf = Pipeline(steps=sgd_steps)

# 5-fold cross-validation over the grid, using every available core.
gs_clf = GridSearchCV(estimator=sgd_clf, param_grid=parameters, cv=5, n_jobs=-1)

# (gs_clf.get_params().keys() lists every parameter name that can be tuned.)
# Search on the 10,000 training documents at positions 2000..11999 rather than
# the whole training set.
search_rows = slice(2000, 12000)
gs_clf = gs_clf.fit(docs[search_rows], y_train[search_rows])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
-- Epoch 1
Norm: 51.11, NNZs: 152236, Bias: -1.088210, T: 10000, Avg. loss: 0.221475
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 45.98, NNZs: 186141, Bias: -0.819930, T: 20000, Avg. loss: 0.105143
Total training time: 0.02 seconds.
-- Epoch 3
Norm: 44.42, NNZs: 199504, Bias: -0.777342, T: 30000, Avg. loss: 0.082523
Total training time: 0.03 seconds.
-- Epoch 4
Norm: 44.21, NNZs: 208083, Bias: -0.731726, T: 40000, Avg. loss: 0.072955
Total training time: 0.05 seconds.
-- Epoch 5
Norm: 43.93, NNZs: 212479, Bias: -0.706129, T: 50000, Avg. loss: 0.065632
Total training time: 0.05 seconds.
-- Epoch 6
Norm: 43.66, NNZs: 215714, Bias: -0.704410, T: 60000, Avg. loss: 0.061367
Total training time: 0.07 seconds.
-- Epoch 7
Norm: 43.63, NNZs: 217816, Bias: -0.702810, T: 70000, Avg. loss: 0.058718
Total training time: 0.08 seconds.
-- Epoch 8
Norm: 43.61, NNZs: 219511, Bias: -0.683058, T: 80000, Avg. loss: 0.0

In [30]:
# Best mean cross-validated score, then each winning parameter with its Python type.
print(gs_clf.best_score_)

for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print(f"{param_name}: {best_value!r}: {type(best_value)}")

0.6453
clf__alpha: 0.0001: <class 'float'>
tfidf__use_idf: True: <class 'bool'>
vect__ngram_range: (1, 2): <class 'tuple'>


In [32]:
# Pull the winning grid-search settings into plain variables for the final refit.
best_params = gs_clf.best_params_
vect__ngram_range = best_params['vect__ngram_range']
tfidf__use_idf = best_params['tfidf__use_idf']
clf__alpha = best_params['clf__alpha']
print(type(vect__ngram_range), tfidf__use_idf, clf__alpha)

<class 'tuple'> True 0.0001


In [35]:
# Refit the SGD pipeline on the full training set with the tuned settings,
# and report the wall-clock training time in seconds.
t0 = time.time()

tuned_sgd_steps = [
    ('vect', CountVectorizer(ngram_range=vect__ngram_range)),
    ('tfidf', TfidfTransformer(use_idf=tfidf__use_idf)),
    ('clf', SGDClassifier(alpha=clf__alpha)),
]
sgd_clf = Pipeline(steps=tuned_sgd_steps)

sgd_clf.fit(docs, y_train)

t1 = time.time()
elapsed = t1 - t0
print(elapsed)

184.72967553138733


In [36]:
# Test-set accuracy of the tuned SGD pipeline: fraction of exact label matches.
predicted = sgd_clf.predict(docs_test)
tuned_sgd_matches = predicted == y_test
np.mean(tuned_sgd_matches)

0.6333627484706621

In [7]:
# Grid-search a LinearSVC text pipeline and time the whole search.
t0 = time.time()

svm_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(verbose=1)),
]
svm_clf = Pipeline(steps=svm_steps)

# Keys follow the "<step>__<param>" convention used by GridSearchCV.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    clf__C=(1e-3, 1, 5),
)

# 5-fold cross-validation over the grid, using every available core.
gs_clf = GridSearchCV(estimator=svm_clf, param_grid=parameters, cv=5, n_jobs=-1)

# Tune on the first fifth of the training documents only.
n_search = int(len(docs)/5)
gs_clf = gs_clf.fit(docs[:n_search], y_train[:n_search])

t1 = time.time()
elapsed = t1 - t0
print(elapsed)

print(gs_clf.best_score_)

for param_name in sorted(parameters):
    print(f"{param_name}: {gs_clf.best_params_[param_name]!r}")

[LibLinear]432.71066641807556
0.6765382238661704
clf__C: 1
vect__ngram_range: (1, 2)


In [8]:
# Train the LinearSVC pipeline on the full training set (bigrams, IDF on, C=1)
# and report the wall-clock training time in seconds.
t0 = time.time()
final_svm_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LinearSVC(verbose=1, C=1)),
]
svm_clf = Pipeline(steps=final_svm_steps)

svm_clf.fit(docs, y_train)
t1 = time.time()
elapsed = t1 - t0
print(elapsed)

# Test-set accuracy: fraction of exact label matches, shown as the cell output.
predicted = svm_clf.predict(docs_test)
final_svm_matches = predicted == y_test
np.mean(final_svm_matches)

[LibLinear]