In [42]:
import numpy as np
import pandas as pd
import nltk
import nltk.corpus
import gzip
import json
import re
from nltk.corpus import wordnet

In [12]:
def parse(path):
  """
  Lazily read a gzip-compressed file that holds one JSON document per line.

  Input: Path to the gzip file.
  Output: A generator yielding the decoded JSON value of each non-blank line,
          in file order.
  """
  # The context manager closes the file as soon as the generator is exhausted
  # or closed; previously the handle was opened and never closed.
  with gzip.open(path, 'r') as g:
    for l in g:
      # A blank line (e.g. a trailing newline) is not valid JSON; skip it
      # instead of letting json.loads raise.
      if l.strip():
        yield json.loads(l)
    
    
def getDF(path):
  """
  Collect every record produced by parse(path) into a DataFrame.

  Input: Path forwarded unchanged to parse().
  Output: A DataFrame with one row per record, labelled 0, 1, 2, ... in the
          order the records were read; the record keys become the columns.
  """
  # enumerate() supplies the running row label that keys the dict.
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Load the gzipped review file into a DataFrame, one row per JSON record.
# NOTE(review): the path is relative to the notebook's working directory and
# points outside the project tree -- consider a configurable data directory.
df = getDF("../../Downloads/AMAZON_FASHION_5.json.gz")

In [37]:
# Preview the first rows of the raw reviews to see which columns are present.
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Orange'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
1,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Black (3746...",Tonya B.,Great product and price!,Five Stars,1441324800,,
2,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Gray L...",Tonya B.,Great product and price!,Five Stars,1441324800,,
3,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue (37867...",Tonya B.,Great product and price!,Five Stars,1441324800,,
4,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Pink'}",Tonya B.,Great product and price!,Five Stars,1441324800,,


In [31]:
# (rows, columns) of the raw review table.
np.shape(df)

(3176, 12)

In [120]:
# Drop rows whose 'reviewText' repeats an earlier row. Only 'reviewText' is
# compared, so rows that differ in other columns (e.g. 'style') but share the
# same text collapse into the first occurrence. `df` itself is left unchanged.
df_nodup = df.drop_duplicates(subset = ['reviewText'])

In [128]:
def tokenize_text(doc):
    """
    Split a document into lowercase tokens.

    Input: A string of words.
    Output: The tokens produced by nltk.word_tokenize, each converted to
            lowercase, in their original order.
    """

    return [token.lower() for token in nltk.word_tokenize(doc)]


def wordnet_pos(tag):
    """
    Map a POS tag to a WordNet POS constant. This is for lemmatization.

    Only the first character of the tag is inspected, so any tag beginning
    with N, V, R or J is accepted (nltk.pos_tag emits Penn Treebank tags such
    as "NN", "VBD", "RB", "JJ" by default).

    Input: A POS tag string.
    Output: wordnet.NOUN, wordnet.VERB, wordnet.ADV or wordnet.ADJ. Tags with
            any other first character, and the empty tag, map to wordnet.NOUN.
    """

    table = {"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}

    # Default to a noun. Slicing (tag[:1]) instead of indexing (tag[0]) lets an
    # empty tag fall through to the default rather than raising IndexError.
    return table.get(tag[:1], wordnet.NOUN)


def lemmatize_text(words):
    """
    Reduce each token to its lemma, using its part of speech as a hint.

    Input: A list of tokenized words.
    Output: A list of the same length holding the lemmatized form of each word.
    """

    # Tag the whole sequence in one call, then translate every tag to the
    # WordNet POS constant the lemmatizer expects.
    tagged = nltk.pos_tag(words)
    wnl = nltk.WordNetLemmatizer()

    lemmas = []
    for token, pos in tagged:
        lemmas.append(wnl.lemmatize(token, wordnet_pos(pos)))

    return lemmas


def remove_stopwords(words):
    """
    Filter English stopwords out of a token list.

    Input: A list of tokenized words.
    Output: A list of tokenized words that have stopwords removed; the order
            of the remaining words is preserved.
    """

    # Build a set once so each membership test is a hash lookup instead of a
    # linear scan over the whole stopword list for every token.
    stop_set = frozenset(nltk.corpus.stopwords.words("english"))

    return [w for w in words if w not in stop_set]

def clean_text(doc):
    """
    Normalize one review for bag-of-words style processing.

    Input: A string of words.
    Output: A single space-joined string of the tokens that remain after
            simple lowercase tags (e.g. "<br>", "</p>") and newlines are
            stripped, the text is tokenized and lowercased, lemmatized, and
            stopwords are removed. Tokens that are not purely alphanumeric
            (punctuation, and tokens containing an apostrophe or hyphen) are
            dropped.
    """

    # Substitute a space, not the empty string: tags and newlines separate
    # words, and deleting them outright fused the neighbours together
    # ("good\nfit" became "goodfit"). Extra whitespace is harmless because the
    # text is tokenized next.
    words = re.sub("< ?/?[a-z]+ ?>|\n", " ", doc)
    words = tokenize_text(words)
    words = lemmatize_text(words)
    words = remove_stopwords(words)
    doc = [w for w in words if w.isalnum()]
    doc = ' '.join(doc)

    return doc

def clean_df(df):
    """
    Add a cleaned-text column to a copy of a review dataframe.

    Input: A dataframe with a column of reviews called 'reviewText'.
    Output: A copy of the input with an extra column called 'text' which has
            the cleaned 'reviewText'. Missing reviews (None / NaN) get an
            empty string. The input dataframe is not modified.
    """

    reviews = df['reviewText']
    df_clean = df.copy()

    # str(NaN) is the literal "nan", which would otherwise be cleaned into a
    # real token and leak into any vocabulary built from 'text'; map missing
    # reviews to '' instead.
    df_clean['text'] = [
        '' if is_missing else clean_text(str(review))
        for review, is_missing in zip(reviews, reviews.isna())
    ]

    return df_clean

In [44]:
# Spot-check clean_text: print the raw review in the row labelled 10, then its
# cleaned form.
print(df['reviewText'][10])
print(clean_text(df['reviewText'][10]))

Relieved my Plantar Fascitis for 3 Days. Then the unbearable pain returned in full force. These were recommended by my Podiatrist.
relieve plantar fascitis 3 day unbearable pain return full force recommend podiatrist


In [132]:
# Second spot-check of clean_text, this time on the row labelled 300.
print(df['reviewText'][300])
print(clean_text(df['reviewText'][300]))

Love these sneakers. Light weight and comfortable even without socks.
love sneaker light weight comfortable even without sock


In [131]:
# Run clean_df on the de-duplicated reviews and show the raw and cleaned text
# side by side. The result is only displayed; it is not stored in a variable.
clean_df(df_nodup)[['reviewText', 'text']]

Unnamed: 0,reviewText,text
0,Great product and price!,great product price
5,Waaay too small. Will use for futur children!,waaay small use futur child
6,Stays vibrant after many washes,stay vibrant many wash
8,My son really likes the pink. Ones which I was...,son really like pink one nervous
9,Waaay too small. Will use for future child.,waaay small use future child
10,Relieved my Plantar Fascitis for 3 Days. Then ...,relieve plantar fascitis 3 day unbearable pain...
11,This is my 6th pair and they are the best thin...,6th pair best thing ever plantar fasciitis res...
12,We have used these inserts for years. They pr...,use insert year provide great support
13,Pinnacle seems to have more cushioning so my h...,pinnacle seem cushioning husband like well try...
14,Excellent insole with good support.,excellent insole good support


In [135]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word uni-, bi- and tri-grams with accents stripped and each row
# L2-normalized.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

# clean_df returns a copy, so `df` itself never gains a 'text' column. Build
# the cleaned column here so this cell also runs on a freshly restarted kernel
# instead of relying on leftover state.
tmp = clean_df(df)['text']

vectorizer.fit(tmp)

tmp_tfidf = vectorizer.transform(tmp)

In [136]:
# Show the summary repr of the sparse TF-IDF matrix.
tmp_tfidf

<3176x9341 sparse matrix of type '<class 'numpy.float64'>'
	with 101439 stored elements in Compressed Sparse Row format>