In [1]:
# Shell out to pip to install gdown, then import the libraries used below.
# NOTE(review): the install is unpinned, so the gdown version may differ between runs.
! pip install gdown
import pandas as pd
import gdown
import os


In [2]:
%sh
# Download the Goodreads file-id listing into the current working directory.
# -f: exit non-zero on an HTTP error instead of saving the error page as gdrive_id.csv
# -L: follow redirects
curl -fL -O 'https://raw.githubusercontent.com/MengtingWan/goodreads/master/gdrive_id.csv'

In [3]:
## list the driver's local directory to confirm gdrive_id.csv was saved there
%fs ls "file:/databricks/driver"

In [4]:
## read the downloaded file-id listing into a Spark DataFrame
path = 'file:/databricks/driver/gdrive_id.csv'
# Read through the SparkSession (`spark`), the same entry point this notebook
# uses for createDataFrame, rather than the legacy SQLContext handle.
file_ids = (
    spark.read.format("csv")
    .option("header", "true")       # first line holds the column names
    .option("inferSchema", "true")  # let Spark infer the column types
    .load(path)
)

# display in table format
display(file_ids)

id,name
1TLmSvzHvTLLLMjMoQdkx6pBWon-4bli7,goodreads_book_works.json.gz
19cdwyXwfXx_HDIgxXaHzH0mrx8nMyLvC,goodreads_book_authors.json.gz
1op8D4e5BaxU2JcPUgxM3ZqrodajryFBb,goodreads_book_series.json.gz
1LXpK1UfqtP89H1tYy0pBGHjYk8IhigUK,goodreads_books.json.gz
1ah0_KpUterVi-AHxJ03iKD6O0NfbK0md,goodreads_book_genres_initial.json.gz
1R3wJPgyzEX9w6EI8_LmqLbpY4cIC9gw4,goodreads_books_children.json.gz
1ICk5x0HXvXDp5Zt54CKPh5qz1HyUIn9m,goodreads_books_comics_graphic.json.gz
1x8IudloezYEg6qDTPxuBkqGuQ3xIBKrt,goodreads_books_fantasy_paranormal.json.gz
1roQnVtWxVE1tbiXyabrotdZyUY7FA82W,goodreads_books_history_biography.json.gz
1ACGrQS0sX4-26D358G2i5pja1Y6CsGtz,goodreads_books_mystery_thriller_crime.json.gz


## Data Description
- In this notebook I will use the **Amazon product data**, specifically the Video Games review subset.
- This dataset contains product reviews and metadata from Amazon, including 142.8 million reviews spanning May 1996 - July 2014.

- This dataset includes reviews (ratings, text, helpfulness votes), product metadata (descriptions, category information, price, brand, and image features), and links (also viewed/also bought graphs).

## Citation

- **Ups and downs: Modeling the visual evolution of fashion trends with one-class collaborative filtering** 
  - R. He, J. McAuley. *WWW*, 2016
  - [pdf](http://cseweb.ucsd.edu/~jmcauley/pdfs/www16a.pdf)

- **Image-based recommendations on styles and substitutes**
  - J. McAuley, C. Targett, J. Shi, A. van den Hengel. *SIGIR*, 2015
  - [pdf](http://cseweb.ucsd.edu/~jmcauley/pdfs/sigir15.pdf)

In [7]:
# Gzipped JSON-lines file of Video Games reviews, read straight from the URL.
fn = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Video_Games.json.gz'

# pandas is already imported as `pd` in the notebook's first cell.
# No os.path.isfile() check here: fn is a URL and isfile() only inspects the
# local filesystem, so it would always report False.
df = pd.read_json(fn, lines=True, compression='gzip')
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0078764343,"[1, 1]",5,I haven't gotten around to playing the campaig...,"07 7, 2013",AB9S9279OZ3QO,Alan,Good game and Beta access!!,1373155200
1,0078764343,"[0, 0]",5,I want to start off by saying I have never pla...,"08 24, 2013",A24SSUT5CSW8BH,Kindle Customer,Love the game,1377302400
2,0078764343,"[0, 0]",4,this will be my second medal of honor I love h...,"07 4, 2013",AK3V0HEBJMQ7J,"Miss Kris ""Krissy""",MOH nice,1372896000
3,043933702X,"[0, 0]",5,"great game when it first came out, and still a...","07 10, 2014",A10BECPH7W8HM7,"GMC ""Old Time Modeler""",Five Stars,1404950400
4,043933702X,"[0, 0]",5,this is the first need for speed I bought year...,"12 4, 2013",A2PRV9OULX1TWP,grimi,memory lane,1386115200


In [8]:
# Drop the two review-time columns; the Spark schema defined later has no field for them.
# errors='ignore' keeps the cell safe to re-run: df is reassigned here, so on a
# second run the columns are already gone and a plain drop() would raise KeyError.
# (`axis` is not needed when `columns=` is given.)
df = df.drop(columns=['reviewTime', 'unixReviewTime'], errors='ignore')
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewerID,reviewerName,summary
0,0078764343,"[1, 1]",5,I haven't gotten around to playing the campaig...,AB9S9279OZ3QO,Alan,Good game and Beta access!!
1,0078764343,"[0, 0]",5,I want to start off by saying I have never pla...,A24SSUT5CSW8BH,Kindle Customer,Love the game
2,0078764343,"[0, 0]",4,this will be my second medal of honor I love h...,AK3V0HEBJMQ7J,"Miss Kris ""Krissy""",MOH nice
3,043933702X,"[0, 0]",5,"great game when it first came out, and still a...",A10BECPH7W8HM7,"GMC ""Old Time Modeler""",Five Stars
4,043933702X,"[0, 0]",5,this is the first need for speed I bought year...,A2PRV9OULX1TWP,grimi,memory lane


In [9]:
from pyspark.sql.types import *

# Spark schema for the review table: one nullable field per column,
# listed as (column name, Spark type) pairs.
my_schema = StructType([
    StructField(column_name, spark_type, True)
    for column_name, spark_type in (
        ('reviewerID', StringType()),
        ('asin', StringType()),
        ('overall', IntegerType()),
        ('reviewerName', StringType()),
        ('reviewText', StringType()),
        ('summary', StringType()),
        ('helpful', StringType()),
    )
])


In [10]:
# createDataFrame applies a StructType to a pandas frame by position, not by
# column name, so put the pandas columns in the schema's field order first.
# A column missing from df now fails loudly (KeyError) instead of being mislabelled.
# NOTE(review): `helpful` is declared StringType; if pandas parsed it as a list,
# it may need casting to str before this call — confirm.
schema_columns = [field.name for field in my_schema.fields]
spark_df = spark.createDataFrame(df[schema_columns], schema=my_schema)
type(spark_df)

* reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
* asin - ID of the product, e.g. 0000013714
* reviewerName - name of the reviewer
* helpful - helpfulness rating of the review, e.g. 2/3
* reviewText - text of the review
* overall - rating of the product
* summary - summary of the review

## Data exploration

In [13]:
# show() prints the rows itself and returns None, so wrapping it in display()
# only hands None to display(); call it directly.
spark_df.show(10)

In [14]:
# Summary statistics for the DataFrame's columns, printed as a text table.
summary_stats = spark_df.describe()
summary_stats.show()

In [15]:
from pyspark.sql.functions import col, skewness, kurtosis

# Skewness and kurtosis of the `overall` rating column.
rating_shape = spark_df.select(
    skewness('overall'),
    kurtosis('overall'),
)
rating_shape.show()

In [16]:
# (row count, column count) of the Spark DataFrame
n_rows = spark_df.count()
n_cols = len(spark_df.columns)
print((n_rows, n_cols))

- We have **1,324,753** observations and 7 columns.

In [18]:
# Count the nulls in every column.
# Import Spark's aggregate `sum` under an alias: a bare `sum` here is Python's
# builtin unless a later wildcard import of pyspark.sql.functions has already
# run, and the builtin cannot add up a Column (TypeError on a fresh Run All).
from pyspark.sql.functions import col, sum as spark_sum

null_counts = [
    spark_sum(col(c).isNull().cast("int")).alias(c)  # isNull -> 0/1, summed per column
    for c in spark_df.columns
]
spark_df.select(*null_counts).show()


In [19]:
# NOTE(review): this wildcard import rebinds Python builtins such as sum/min/max
# to Spark column functions for every cell executed after it.
from pyspark.sql.functions import *

# Number of reviews for each value of the `overall` rating.
reviews_by_rating = spark_df.groupBy('overall')
review_distribution_df = reviews_by_rating.count()
review_distribution_df.show()