In [1]:
import json

In [2]:
review_coll = json.load(open("reviews.json","r"))

In [3]:
len(review_coll)

1128

In [4]:
import pandas as pd
review_df = pd.DataFrame(review_coll)

In [5]:
review_df.shape

(1128, 8)

In [6]:
review_df.columns

Index([u'author', u'body', u'colour', u'date', u'header', u'rating', u'size',
       u'verified_purchase'],
      dtype='object')

In [7]:
data_df = review_df[["body","header","rating"]]

In [8]:
data_df.shape

(1128, 3)

In [43]:
data_df.groupby("rating")["rating"].count()

rating
1.0    158
2.0     28
3.0     44
4.0    104
5.0    794
Name: rating, dtype: int64

### Train - Validation - Test split

In [9]:
from sklearn.model_selection import train_test_split
train_val_df, test_df = train_test_split(data_df, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, test_size=0.25)

In [10]:
len(train_df), len(val_df), len(test_df)

(676, 226, 226)

### Cleaning Pipeline

In [11]:
# 1. Lowercase
# 2. Remove numbers and special characters
# 3. Remove stop words
# 4. Stemming

In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))

ps = PorterStemmer()
def clean(text):
    if not text:
        return ""
    text = text.lower()
    text = re.sub(r'[^A-Za-z]+', ' ', text)
    word_tokens = word_tokenize(text)
    filtered_sentence = list([ps.stem(w) for w in word_tokens if (not w in stop_words and len(w)>1)])
    return " ".join(filtered_sentence)

### Pipeline

In [25]:
def train_pipeline(data_df, vectorizer_, model_):
    data_df["cleaned_text"] = data_df.apply(lambda x: clean(x["text"]),axis=1)
    features_ = vectorizer_.fit_transform(data_df["cleaned_text"])
    model_.fit(features_,data_df["class"])
    return vectorizer_, model_

In [36]:
def test_pipeline(data_df, vectorizer_, model_):
    data_df["cleaned_text"] = data_df.apply(lambda x: clean(x["text"]),axis=1)
    features = vectorizer_.transform(data_df["cleaned_text"])
    return model_.predict(features)

In [27]:
from sklearn.metrics import accuracy_score
def get_accuracy(actual_vals_, predicted_vals_):
    return accuracy_score(actual_vals_, predicted_vals_)

#### Logistic regression - validation

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [29]:
train_df["text"] = train_df.apply(lambda x: x["header"]+"\n"+x["body"],axis=1)
train_df = train_df[["text","rating"]]
train_df.columns = ["text", "class"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
vectorizer, lr = train_pipeline(train_df, vectorizer, lr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [31]:
test_df["text"] = test_df.apply(lambda x: x["header"]+"\n"+x["body"],axis=1)
test_df = test_df[["text","rating"]]
test_df.columns = ["text", "class"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [37]:
predictions = test_pipeline(test_df, vectorizer, lr)

In [38]:
predictions.shape

(226,)

In [39]:
get_accuracy(test_df["class"],predictions)

0.71238938053097345

In [42]:
list(test_df["class"])

[u'5.0',
 u'1.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'1.0',
 u'2.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'3.0',
 u'1.0',
 u'5.0',
 u'5.0',
 u'4.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'5.0',
 u'4.0',
 u'1.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'5.0',
 u'4.0',
 u'4.0',
 u'4.0',
 u'5.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'4.0',
 u'5.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'4.0',
 u'5.0',
 u'4.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'4.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'1.0',
 u'5.0',
 u'5.0',
 u'4.0',
 u'5.0',
 u'5.0',
 u'4.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'4.0',
 u'5.0',
 u'4.0',
 u'5.0',
 u'5.0',
 u'5.0',
 u'1.0',
 u'1.0',
 u'5.0',
 u'1.0',
 u'5.0',
 

In [41]:
predictions

array([u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'1.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
       u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0', u'5.0',
      