# Preprocessing and Parsing Raw Data

This notebook is concerned with parsing and then preprocessing the raw data from the `data/raw` folder, and then storing the new format (csv's) in the `data/processed` folder.

In [1]:
# Importing the relevant libraries
import pandas as pd
import os
import json
import gzip

 ** Amazon Prime Pantry Data **

In [2]:
data = []
with gzip.open('../../data/raw/amazon_prime_pantry.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print("Number of reviews:", len(data))

# first row of the list
print("Example of the data (first row):", data[0])

Number of reviews: 137788
Example of the data (first row): {'overall': 4.0, 'verified': True, 'reviewTime': '09 24, 2015', 'reviewerID': 'A31Y9ELLA1JUB0', 'asin': 'B0000DIWNI', 'reviewerName': 'Her Royal Peepness Princess HoneyBunny Blayze', 'reviewText': "I purchased this Saran premium plastic wrap after trying Reynolds press and seal wrap which I would never use again.. There is less static cling to this wrap than I remember. To me this is a good thing because it doesn't stick to its self .\n\nThis is my typical complaint with all plastic wraps. When trying to cut them they ball all up and are useless. However they have improved this. Now Saran clings to the bowl or plate you wish to cover.\n\nNow if only they could improve the cutters on the boxes so  that the cutters actually cut and scissors weren't required would be better..", 'summary': 'Pretty Good For plastic Wrap', 'unixReviewTime': 1443052800}


In [3]:
df = pd.DataFrame.from_dict(data)  # Parsing the data as a pandas dataframe 
df.head(5)  # Displayin the first 5 rows of the dataframe below

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,B0000DIWNI,,4.0,I purchased this Saran premium plastic wrap af...,"09 24, 2015",A31Y9ELLA1JUB0,Her Royal Peepness Princess HoneyBunny Blayze,,Pretty Good For plastic Wrap,1443052800,True,
1,B0000DIWNI,,5.0,I am an avid cook and baker. Saran Premium Pl...,"06 23, 2015",A2FYW9VZ0AMXKY,Mary,,"The Best Plastic Wrap for your Cooking, Baking...",1435017600,True,
2,B0000DIWNI,,5.0,"Good wrap, keeping it in the fridge makes it e...","06 13, 2015",A1NE43T0OM6NNX,Tulay C,,Good and strong.,1434153600,True,
3,B0000DIWNI,,4.0,I prefer Saran wrap over other brands. It does...,"06 3, 2015",AHTCPGK2CNPKU,OmaShops,,Doesn't cling as well to dishes as other brand...,1433289600,True,
4,B0000DIWNI,,5.0,Thanks,"04 20, 2015",A25SIBTMVXLB59,Nitemanslim,,Five Stars,1429488000,True,


In [4]:
# Saving the data to the processed folder
df.to_csv('../../data/processed/amazon_prime_pantry.csv')

 ** Amazon Gourmet Food and Groceries Data **

In [5]:
data = []
with gzip.open('../../data/raw/amazon_grocery_gourmet.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print("Number of reviews:", len(data))

# first row of the list
print("Example of the data (first row):", data[0])

Number of reviews: 1143860
Example of the data (first row): {'overall': 5.0, 'verified': True, 'reviewTime': '11 19, 2014', 'reviewerID': 'A1QVBUH9E1V6I8', 'asin': '4639725183', 'reviewerName': 'Jamshed Mathur', 'reviewText': 'No adverse comment.', 'summary': 'Five Stars', 'unixReviewTime': 1416355200}


In [6]:
df = pd.DataFrame.from_dict(data)  # Parsing the data as a pandas dataframe 
df.head(5)  # Displaying the first 5 rows of the dataframe below

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,4639725183,,5.0,No adverse comment.,"11 19, 2014",A1QVBUH9E1V6I8,Jamshed Mathur,,Five Stars,1416355200,True,
1,4639725183,,5.0,Gift for college student.,"10 13, 2016",A3GEOILWLK86XM,itsjustme,,Great product.,1476316800,True,
2,4639725183,,5.0,"If you like strong tea, this is for you. It mi...","11 21, 2015",A32RD6L701BIGP,Krystal Clifton,,Strong,1448064000,True,
3,4639725183,,5.0,Love the tea. The flavor is way better than th...,"08 12, 2015",A2UY1O1FBGKIE6,U. Kane,,Great tea,1439337600,True,
4,4639725183,,5.0,I have searched everywhere until I browsed Ama...,"05 28, 2015",A3QHVBQYDV7Z6U,The Nana,,This is the tea I remembered!,1432771200,True,


In [7]:
# Saving the data to the processed folder
df.to_csv('../../data/processed/amazon_grocery_gourmet.csv')

 ** Yelp Data **

In [8]:
yelp_file = open('../../data/raw/yelp_all', 'r')  # Opening the file

In [9]:
lines = yelp_file.readlines()  # Storing each line as an element in a list
print(lines[0])  # printing the first line

5044	0	2014-11-16	Drinks were bad, the hot chocolate was watered down and the latte had a burnt taste to it. The food was also poor quality, but the service was the worst part, their cashier was very rude.



In [10]:
# Creating a list of dictionaries (each element is a row)
rows = []
for line in lines:
    current_row_list = line.split("\t")
    current_row_dict = {
        "productID": current_row_list[0],
        "fakeLabel": current_row_list[1],
        "date": current_row_list[2],
        "reviewText": current_row_list[3]
    }
    rows.append(current_row_dict)
print(rows[0])  # Printing the first row

{'productID': '5044', 'fakeLabel': '0', 'date': '2014-11-16', 'reviewText': 'Drinks were bad, the hot chocolate was watered down and the latte had a burnt taste to it. The food was also poor quality, but the service was the worst part, their cashier was very rude.\n'}


In [11]:
df = pd.DataFrame.from_dict(rows)  # Parsing the data as a pandas dataframe 
df.head(5)  # Displaying the first 5 rows of the dataframe below

Unnamed: 0,date,fakeLabel,productID,reviewText
0,2014-11-16,0,5044,"Drinks were bad, the hot chocolate was watered..."
1,2014-09-08,0,5045,This was the worst experience I've ever had a ...
2,2013-10-06,0,5046,This is located on the site of the old Spruce ...
3,2014-11-30,0,5047,I enjoyed coffee and breakfast twice at Toast ...
4,2014-08-28,0,5048,I love Toast! The food choices are fantastic -...


In [12]:
# Saving the data to the processed folder
df.to_csv('../../data/processed/yelp_all.csv')