In [10]:
import pandas as pd
import numpy as np
import json
import gzip

In [11]:
###Helper function to extract the contents of the zip file
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)

In [12]:
#The below function returns the data in a pandas dataframe. It uses the function
#defined above to extract contents of the zip file.

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [13]:
true = True
false = False

In [14]:
#The snippet below is creating an instance of the getDF() function and passing 
#the path of the zip file as an argument to the getDF() function
df = getDF("Toys_and_Games_5.json.gz")

In [15]:
len(df)

1828971

In [16]:
df["vote"].isna().sum()

1620926

In [17]:
df["vote"] = df["vote"].dropna()

In [18]:
df1 = df.dropna()

In [19]:
len(df1)

6763

In [20]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
import nltk
#nltk.download('stopwords')  
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
df1["summary_clean"] = df1["summary"].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [21]:
df1["summary_clean"]

758        great tool illustrate teach financial manageme...
1754                                                creative
1757                                             damage book
1810                                       he's always watch
1823          george elf claus intelligence agency operative
                                 ...                        
1825841                  great experience first time builder
1826921                               seriously disappointed
1826983                questionable....first impression good
1827970                                  plush soft/huggable
1828147                     schedule pick return immediately
Name: summary_clean, Length: 6763, dtype: object

In [22]:
# add sentiment anaylsis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
df1["sentiments"] = df1["summary_clean"].apply(lambda x: sid.polarity_scores(x))
df1 = pd.concat([df1.drop(['sentiments'], axis=1), df1['sentiments'].apply(pd.Series)], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [23]:
df1

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image,summary_clean,neg,neu,pos,compound
758,5.0,17,True,"02 26, 2016",A2374KS5MTBWKI,0972428216,{'Format:': ' Toy'},QT,Great tool to teach and I llustrate financial ...,Great tool to illustrate and teach financial m...,1456444800,[https://images-na.ssl-images-amazon.com/image...,great tool illustrate teach financial manageme...,0.000,0.631,0.369,0.6249
1754,5.0,2,True,"11 12, 2016",A2O9S0T5KQDSRL,0976990709,{'Format:': ' Hardcover'},Holly,It's not even Christmas and he has been so bus...,How creative you can be!,1478908800,[https://images-na.ssl-images-amazon.com/image...,creative,0.000,0.000,1.000,0.4404
1757,1.0,2,True,"10 21, 2016",A1FO5XIXNQWVBY,0976990709,{'Format:': ' Hardcover'},CGL,Book came damaged! Tha back part was unattache...,Damaged book,1477008000,[https://images-na.ssl-images-amazon.com/image...,damage book,0.762,0.238,0.000,-0.4939
1810,5.0,5,True,"01 28, 2016",A34QXH38Z7PV0V,0976990709,{'Format:': ' Hardcover'},M.W.,I LOVE this product. I could not be anymore sa...,He's Always Watching!,1453939200,[https://images-na.ssl-images-amazon.com/image...,he's always watch,0.000,1.000,0.000,0.0000
1823,5.0,12,True,"12 29, 2015",A1PMVKG3NE98YF,0976990709,{'Format:': ' Hardcover'},ReservoirHogs,George the Elf arrived promptly at the end of ...,"George the Elf, Claus Intelligence Agency Oper...",1451347200,[https://images-na.ssl-images-amazon.com/image...,george elf claus intelligence agency operative,0.000,0.617,0.383,0.4767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1825841,5.0,2,True,"08 21, 2017",AR9S5T80237ZI,B01FKE83MW,{'Color:': ' Silver'},M,Great kit. Helped my 6 year old nephew put it...,Great experience for first time builder,1503273600,[https://images-na.ssl-images-amazon.com/image...,great experience first time builder,0.000,0.494,0.506,0.6249
1826921,1.0,2,True,"07 14, 2018",A2F338S8LAKAG,B01G3UI30Y,{'Color:': ' Blue'},Sebastiaan Sligting,The wood used for this ladder is average to po...,Seriously disappointed...,1531526400,[https://images-na.ssl-images-amazon.com/image...,seriously disappointed,1.000,0.000,0.000,-0.5859
1826983,4.0,5,True,"06 2, 2016",A2P216HQVIR2YX,B01G5669KW,{'Color:': ' Green'},Jerry Pham,Just got my xsoul hexblades.....not looking so...,Questionable....First impression not good,1464825600,[https://images-na.ssl-images-amazon.com/image...,questionable....first impression good,0.000,0.172,0.828,0.5859
1827970,1.0,3,True,"12 8, 2016",A6CQ1AWMD7J2Y,B01GSYAMWC,{'Style:': ' Bacon'},Elyssabeth,WHAT!? This is not soft and huggable AT ALL. I...,NOT PLUSH OR SOFT/HUGGABLE,1481155200,[https://images-na.ssl-images-amazon.com/image...,plush soft/huggable,0.000,1.000,0.000,0.0000


In [25]:
df1.to_csv('summary_updated_1000.csv', index = False)