In [33]:
import pandas as pd

import re

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

In [18]:
# Location of the reviews dump, relative to the notebook's working directory.
reviews_json_file = 'dataset/goodreads_reviews_poetry.json'

# lines=True: the file is read as one JSON object per line.
reviews_json = pd.read_json(
    reviews_json_file,
    lines=True,
    convert_dates=True,
)

# Preview the first rows of the loaded frame.
reviews_json.head()

Unnamed: 0,book_id,date_added,date_updated,n_comments,n_votes,rating,read_at,review_id,review_text,started_at,user_id
0,402128,Tue Jun 12 08:59:04 -0700 2012,Fri Jun 15 11:41:12 -0700 2012,0,0,5,,28423ff309bc896c071a8d9df4a10e8a,I have three younger siblings and we grew up w...,,3ca7375dba942a760e53b726c472a7dd
1,92270,Mon Apr 14 18:42:40 -0700 2014,Mon Apr 14 18:43:05 -0700 2014,0,0,5,Wed Jan 01 00:00:00 -0800 1997,2db1180992e2b0b1631a3ac5644bde84,This is my favorite collection of poetry.,,0ef32090550901ead25cb0ea21c4d36b
2,908708,Tue Apr 22 13:58:10 -0700 2008,Tue Apr 22 13:58:33 -0700 2008,0,0,4,,bca57fa40e92c9261b00b03dbebd96fe,"He's so disturbing. So very, very disturbing.",,0ef32090550901ead25cb0ea21c4d36b
3,253264,Wed Sep 27 19:08:08 -0700 2017,Sat Sep 30 06:39:45 -0700 2017,0,1,5,Wed Sep 27 00:00:00 -0700 2017,cb1ebc02d8b2aff15735d513877463ce,I just reread this play for a class I am takin...,Tue Sep 26 00:00:00 -0700 2017,d37b46b2190ed7c518259f29b47a9b36
4,70885,Thu Jun 18 20:00:03 -0700 2015,Thu Jun 18 20:01:29 -0700 2015,0,0,5,Thu Jun 18 00:00:00 -0700 2015,8dca128b8e869048a7442c18659dbece,"Cuanto mas leo, mas me gusta. Su poesia es env...",Tue Jun 16 00:00:00 -0700 2015,af157d0205b8a901dee6d4a2aed7e6ad


In [19]:
# Keep only the free-text review column; at this point `reviews` is a Series.
reviews = reviews_json['review_text']
# Display the Series as the cell output.
reviews

0         I have three younger siblings and we grew up w...
1                 This is my favorite collection of poetry.
2             He's so disturbing. So very, very disturbing.
3         I just reread this play for a class I am takin...
4         Cuanto mas leo, mas me gusta. Su poesia es env...
5         Adorablemente pesimista, deprimente y repulsiv...
6         Poemas crudos y reales, sin tanto adorno aun u...
7         This ain't a book with to die for characters, ...
8                      This is why kids don't like to read.
9         Odysseus is such an arrogant, power-hungry man...
10        Toinen tekija on Olavi Lauri = Paavolainen. Lu...
11                          sTHy wmml w 'Glb lmfrdt tkrrt .
12        k`d@ frwq jwyd@ y'sr lqr bqSy'dh l`dhb@ lmlyy'...
13        jmyl jd [?][?] \n " lmdh 'rki `l~ klW shy bqy ...
14        ry'` sh`ran ... mml nthran \n mjl@ wmlHq ldwH@...
15        Beyond the Words is an anthology of poetry and...
16        Thoughts of a Pure Mind by Cal

In [20]:
# Turn the Series into a single-column DataFrame (rebinds the same name).
reviews = reviews.to_frame()

In [22]:
# Sanity check: show the current type of `reviews`.
type(reviews)

pandas.core.frame.DataFrame

In [23]:
# Binary sentiment label: 1 when the star rating is above 3, otherwise 0.
# `.values` keeps the assignment positional, matching row order in reviews_json.
# NOTE(review): a rating of 0 may mean "no rating given" in this dataset rather
# than a negative review — confirm before treating it as class 0.
reviews['pos_neg'] = (reviews_json['rating'] > 3).astype(int).values

In [24]:
# Display the frame, now holding the review text and its pos_neg label.
reviews

Unnamed: 0,review_text,pos_neg
0,I have three younger siblings and we grew up w...,1
1,This is my favorite collection of poetry.,1
2,"He's so disturbing. So very, very disturbing.",1
3,I just reread this play for a class I am takin...,1
4,"Cuanto mas leo, mas me gusta. Su poesia es env...",1
5,"Adorablemente pesimista, deprimente y repulsiv...",1
6,"Poemas crudos y reales, sin tanto adorno aun u...",1
7,"This ain't a book with to die for characters, ...",1
8,This is why kids don't like to read.,0
9,"Odysseus is such an arrogant, power-hungry man...",0


In [36]:
def preprocess_text(text):
    """Normalise a raw review string before vectorisation.

    Steps, in order:

    1. Replace web links (``www.…`` or ``http(s)://…``) with the token ``URL``.
    2. Replace ``@mentions`` with the token ``USER``.
    3. Lower-case the text (so the placeholders end up as ``url`` / ``user``)
       and fold the Cyrillic letter "ё" into "е".
    4. Replace every run of characters that are not Latin letters, Cyrillic
       letters or digits with a single space.
    5. Strip leading and trailing whitespace.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lower-cased text with tokens separated by single spaces.
    """
    # Raw string literals: `\.` and `\s` are regex escapes, not Python string
    # escapes, and non-raw literals trigger invalid-escape warnings.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub(r'@[^\s]+', 'USER', text)
    text = text.lower().replace("ё", "е")
    # The digit range is 0-9; the previous 1-9 range silently deleted every
    # zero (e.g. "10" became "1" and "2020" became "2 2").
    # Each maximal run of disallowed characters becomes exactly one space, so
    # no further space-collapsing pass is needed.
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9]+', ' ', text)
    return text.strip()

# Clean every review in place by running it through preprocess_text.
reviews['review_text'] = reviews['review_text'].map(preprocess_text)

In [46]:
# Model pipeline: token counts -> TF-IDF transform -> multinomial Naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Hyper-parameter grid; each key is "<pipeline step name>__<parameter name>".
tuned_parameters = dict(
    vect__ngram_range=[(1,1), (1,2), (2,2)],
    tfidf__use_idf=(True, False),
    tfidf__norm=('l1', 'l2'),
    clf__alpha=[1, 1e-1, 1e-2],
)

In [44]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    reviews['review_text'],
    reviews['pos_neg'],
    test_size=0.2,
    random_state=42,
)
# Display the training texts as the cell output.
x_train

85137     i read the first two books in the september se...
33589     i couldn t put this book down i m a huge fan o...
81053     i love this book so much every poem i m saving...
103909                                          so much fun
33586     i watched the movie months ago but only last n...
80346     wow what a journey from the beggining to the l...
130810    you haven t read don mckay yet and you call yo...
47537     gilgamesheposet ar ett oerhort intressant verk...
9720      rqyq hlm wtkhrj m f lnfs bklmth lt ttwjd dkhl ...
53081     finished at my parents house eating birthday cake
102956    i ve been reading a bunch of lauren springer l...
69569     abbas kiarostami is one of my favouritest film...
138818    when you find yourself highlighting the majori...
81877     3 5 stars very densely packed metaphors and lo...
44170     really really wanted to like this one but some...
54247     lsrwu fy m stn srkhn dkhmu mn swtn wsyfun twlu...
77687     this is so uncreative and bori

In [47]:
# Grid search over tuned_parameters for the text_clf pipeline, using 10-fold
# cross-validation on the training split.
clf = GridSearchCV(estimator=text_clf, param_grid=tuned_parameters, cv=10)
# Fit on the training data; the fitted search object is the cell output.
clf.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                           