In [3]:
# Import packages.
import numpy as np
import pandas as pd
import nltk
import nltk.corpus
import gzip
import json
import re
from nltk.corpus import wordnet

#### Build function to clean text with test data.

In [4]:
def parse(path):
  """
  Input: Path to a gzip-compressed file holding one JSON document per line.
  Output: A generator that yields each non-blank line decoded with json.loads.
  """

  # The context manager closes the file once the generator is exhausted or closed,
  # instead of leaving the handle open for the life of the kernel.
  with gzip.open(path, 'r') as g:
    for l in g:
      # A whitespace-only line (e.g. a trailing newline) is never valid JSON; skip it.
      if not l.strip():
        continue
      yield json.loads(l)
    
    
def getDF(path):
  """
  Input: Path to a gzipped JSON-lines file readable by parse().
  Output: A DataFrame with one row per parsed record, indexed 0, 1, 2, ... in file order.
  """

  # Key every record by its position so from_dict builds one row per record.
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Load the gzipped Amazon Fashion reviews into a DataFrame (one row per JSON record).
df = getDF("../../../Downloads/AMAZON_FASHION_5.json.gz")

In [5]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Orange'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
1,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Black (3746...",Tonya B.,Great product and price!,Five Stars,1441324800,,
2,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Gray L...",Tonya B.,Great product and price!,Five Stars,1441324800,,
3,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue (37867...",Tonya B.,Great product and price!,Five Stars,1441324800,,
4,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Pink'}",Tonya B.,Great product and price!,Five Stars,1441324800,,


In [6]:
np.shape(df)

(3176, 12)

In [7]:
# Drop duplicate reviews: rows with identical 'reviewText' are collapsed,
# keeping the first occurrence (the drop_duplicates default).
df_nodup = df.drop_duplicates(subset = ['reviewText'])

In [8]:
def tokenize_text(doc):
    """
    Input: A string of words.
    Output: List of the tokens produced by nltk.word_tokenize, each converted to lowercase.
    """

    # Tokenize first, then lowercase each token while building the result.
    return [token.lower() for token in nltk.word_tokenize(doc)]


def wordnet_pos(tag):
    """
    Map a POS tag to a WordNet POS constant. This is for lemmatization.

    Input: A POS tag string (e.g. the Penn Treebank tags that nltk.pos_tag pairs with each word).
    Output: wordnet.NOUN, wordnet.VERB, wordnet.ADV, or wordnet.ADJ, chosen by the first
            letter of the tag. Any other tag, including an empty one, maps to wordnet.NOUN.
    """
    
    table = {"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}
    
    # Slice instead of indexing so an empty tag falls through to the noun default
    # rather than raising IndexError.
    return table.get(tag[:1], wordnet.NOUN)


def lemmatize_text(words):
    """
    Input: A list of tokenized words.
    Output: A list of the same length where each word is replaced by its WordNet lemma,
            using the part of speech assigned by nltk.pos_tag.
    """
    
    lemmatizer = nltk.WordNetLemmatizer()
    
    lemmas = []
    for token, tag in nltk.pos_tag(words):
        # The tagger's tag is translated to a WordNet POS before lemmatizing.
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos(tag)))
    
    return lemmas


def remove_stopwords(words):
    """
    Input: A list of tokenized words.
    Output: A list of tokenized words that have the NLTK English stopwords removed.
            The order of the remaining words is preserved.
    """
    
    # Build a set once so each membership test is O(1) instead of scanning the whole list.
    stopwords = set(nltk.corpus.stopwords.words("english"))
    words = [w for w in words if w not in stopwords]
    
    return words

def clean_text(doc):
    """
    Input: A string of words.
    Output: A string of space-separated words that has been lowercased and lemmatized, has the
            stopwords removed, and has the punctuation removed.

    Simple HTML-style tags (e.g. <br>, </p>, <br />) and newlines are replaced by a space
    before tokenizing. Only tokens that are entirely alphanumeric are kept, so punctuation
    and tokens containing punctuation (such as "n't" or hyphenated words) are dropped.
    """
    
    # Substitute a space rather than an empty string so the words on either side of a tag
    # or line break are not fused together ("great\nfit" must not become "greatfit").
    # The pattern also accepts self-closing and upper-case tags such as <br /> and <BR>.
    words = re.sub(r"<\s*/?\s*[a-z]+\s*/?\s*>|\n", " ", doc, flags=re.IGNORECASE)
    words = tokenize_text(words)
    # Lemmatize before removing stopwords so inflected forms are reduced first.
    words = lemmatize_text(words)
    words = remove_stopwords(words)
    doc = [w for w in words if w.isalnum()]
    doc = ' '.join(doc)
    
    return doc

def clean_df(df):
    """
    Input: A dataframe with a column of reviews called 'reviewText'.
    Output: A copy of the input dataframe with an extra column called 'text' which has the 
            cleaned 'reviewText'. The input dataframe is not modified.

    Missing reviews are treated as empty text, so their 'text' value is an empty string.
    """
    
    # Without this, str(NaN) would turn every missing review into the literal word "nan".
    text = df['reviewText'].fillna('')
    df_clean = df.copy()
    # str() guards against non-string values (e.g. numbers) in the column.
    df_clean['text'] = [clean_text(str(review)) for review in text]

    return df_clean

In [9]:
print(df['reviewText'][10])
print(clean_text(df['reviewText'][10]))

Relieved my Plantar Fascitis for 3 Days. Then the unbearable pain returned in full force. These were recommended by my Podiatrist.
relieve plantar fascitis 3 day unbearable pain return full force recommend podiatrist


In [10]:
print(df['reviewText'][300])
print(clean_text(df['reviewText'][300]))

Love these sneakers. Light weight and comfortable even without socks.
love sneaker light weight comfortable even without sock


In [11]:
# Test the function
clean_df(df_nodup)[['reviewText', 'text']].head()

Unnamed: 0,reviewText,text
0,Great product and price!,great product price
5,Waaay too small. Will use for futur children!,waaay small use futur child
6,Stays vibrant after many washes,stay vibrant many wash
8,My son really likes the pink. Ones which I was...,son really like pink one nervous
9,Waaay too small. Will use for future child.,waaay small use future child


In [12]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

# tmp = df['text']

# vectorizer.fit(tmp)

# tmp_tfidf = vectorizer.transform(tmp)

#### Clean the amazon data.

In [13]:
# Load the Amazon reviews CSV; the text itself is cleaned in a later cell with clean_df.
amazon = pd.read_csv('~/Downloads/GroupProject/AMAZON.csv')

In [15]:
amazon.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,Category
0,1,True,"05 11, 2016",A2E4NSDWJDXHYL,B00ZWWMHTI,,M. O.,Did not fit properly no matter what I did. Eve...,Not for me.,1462924800,,,Video_Games_5.json
1,2,True,"08 23, 2016",A1RZ7GEW807WFX,B01B298Q0S,{'Platform:': ' Xbox One'},Valerie Reed,Meh. Underwhelming,Two Stars,1471910400,,,Video_Games_5.json
2,5,True,"06 29, 2014",A1NWDBD77LQ3MB,B00104KJ4C,{'Format:': ' Video Game'},X,I love it and world recommend this game to my ...,Arrived Fast & Got What I Order~!,1404000000,,,Video_Games_5.json
3,5,True,"02 15, 2014",A1487V05K2FBK4,B000X9FV5M,{'Format:': ' Video Game'},Roger Huston,Far Cry 2 is a very enjoyable FPS. The immens...,"Fun, fun, fun game",1392422400,,,Video_Games_5.json
4,5,True,"03 4, 2014",A2N8ZEA5I6TOPA,B00GV4V8XC,"{'Edition:': ' Standard', 'Platform:': ' Ninte...",Daniel G.,"There is much to like about the game, even if ...",Awesome game,1393891200,,,Video_Games_5.json


In [21]:
# Add the cleaned 'text' column to a copy of the Amazon reviews.
clean_data = clean_df(amazon)

In [23]:
# Save the cleaned reviews to CSV (the DataFrame index is written as the first column by default).
clean_data.to_csv(r'~/Documents/ECS171/reviewClassifier/clean_data.csv')

In [24]:
clean_data

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,Category,text
0,1,True,"05 11, 2016",A2E4NSDWJDXHYL,B00ZWWMHTI,,M. O.,Did not fit properly no matter what I did. Eve...,Not for me.,1462924800,,,Video_Games_5.json,fit properly matter even could get work right ...
1,2,True,"08 23, 2016",A1RZ7GEW807WFX,B01B298Q0S,{'Platform:': ' Xbox One'},Valerie Reed,Meh. Underwhelming,Two Stars,1471910400,,,Video_Games_5.json,meh underwhelming
2,5,True,"06 29, 2014",A1NWDBD77LQ3MB,B00104KJ4C,{'Format:': ' Video Game'},X,I love it and world recommend this game to my ...,Arrived Fast & Got What I Order~!,1404000000,,,Video_Games_5.json,love world recommend game friend complaint als...
3,5,True,"02 15, 2014",A1487V05K2FBK4,B000X9FV5M,{'Format:': ' Video Game'},Roger Huston,Far Cry 2 is a very enjoyable FPS. The immens...,"Fun, fun, fun game",1392422400,,,Video_Games_5.json,far cry 2 enjoyable fps immense open gameplay ...
4,5,True,"03 4, 2014",A2N8ZEA5I6TOPA,B00GV4V8XC,"{'Edition:': ' Standard', 'Platform:': ' Ninte...",Daniel G.,"There is much to like about the game, even if ...",Awesome game,1393891200,,,Video_Games_5.json,much like game even fault scatter throughout e...
5,5,True,"01 13, 2017",A1Z3NP8LJB8PQO,B003YMMGEE,{'Platform:': ' PC'},Jarhead,Bought as gift?,Great Game.,1484265600,,,Video_Games_5.json,buy gift
6,5,True,"07 14, 2014",A1Z7TAQGXOC4QL,B00109KMOO,{'Platform:': ' Xbox 360'},Khalil Rahi,"Great service, arrived sooner then excepted. A++",Five Stars,1405296000,,,Video_Games_5.json,great service arrive sooner except
7,5,True,"03 23, 2015",A2RODTBQJX7QM7,B00BAWXCP2,{'Style:': ' Standard Edition'},Luis Soto,Great game. Lengthy and enjoyable. New game+ a...,Great game. Lengthy and enjoyable,1427068800,,,Video_Games_5.json,great game lengthy enjoyable new vita get game...
8,4,False,"08 24, 2006",A1K31NF81TS0CO,B0002KMHD6,,Dubyac99,"I have to say, first of all, I made a killing ...","Fun, but hard to control at times...",1156377600,4,,Video_Games_5.json,say first make killing game buy go business sa...
9,5,False,"07 4, 2002",A3ER878Z9WVNRO,B0000631WO,,Kopaka Nuva,"This is a very good game to play, but it didn'...",A very good dragonball z game,1025740800,5,,Video_Games_5.json,good game play follow storyline good first lea...
