# Setup - This notebook runs on Google Colab

## Google Drive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os

# 1. Authenticate and create the PyDrive client.
# The Colab user is authenticated first; the application-default credentials
# are then attached to the PyDrive auth object before the client is built.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the later cells use to upload (CreateFile/Upload)
# and download (ListFile/GetContentFile) the pickle files.
drive = GoogleDrive(gauth)  

## Chrome Driver

In [None]:
# install chromium, its driver, and selenium
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# set options to be headless, ..

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(chrome_options=chrome_options)

## Import Packages

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import pickle
import time
import pandas as pd
import numpy as np

# Web scraping for review

## Getting product IDs and image links

In [None]:
# First and last listing page to scrape (both inclusive)
start = 1
end = 301

# Accumulates product id -> image link across all pages
id_link = {}

for i in range(start, end + 1):
  # progress marker every 20 pages
  if i % 20 == 0:
    print(i)

  # listing URL for page i
  url = "https://www.renttherunway.com/products?sort=recommended&filters%5Bzip_code%5D=10001&_=1574453268488&page=" + str(i)

  # load the page in the headless browser, wait two seconds, then parse the HTML
  driver.get(url)
  time.sleep(2)
  response_content = driver.page_source
  results_page = BeautifulSoup(response_content, 'lxml')

  # product ids are read from the data-style-name attribute of the heart buttons
  button_tags = results_page.find_all('button', {"class": "heart__button heart__button--minimal"})
  style_name = [button.get('data-style-name') for button in button_tags]

  # image links are the src of the <img> inside each first-image card div
  div_tags = results_page.find_all('div', {"class": "grid-product-card-image cycle-image cycle-image-0"})
  img_tags = [div.find('img') for div in div_tags]
  item_links = [img.get('src') for img in img_tags]

  # ids and links are paired by position; zip stops at the shorter list
  id_link.update(zip(style_name, item_links))

In [None]:
# Write the product id -> image link mapping to a local pickle file
with open('id_link.pkl', 'wb') as f:
    pickle.dump(id_link, f)

# save to drive
link = 'https://drive.google.com/open?id=11mLhhzlzjCeArB4oE0F957IgaxHdG5et'
# The folder id is everything after "=". It is stored as `folder_id` so the
# builtin id() and IPython's `_` are not overwritten for the rest of the session.
folder_id = link.split("=", 1)[1]

# create the Drive file inside that folder and upload the pickle
file = drive.CreateFile({'parents':[{u'id': folder_id}]})
file.SetContentFile('id_link.pkl')
file.Upload()

## Get all Product attributes and Review data

### Define function for one Product URL

In [None]:
def get_metadata(url):
  """Fetch a product-review page and return its embedded product and review data.

  The page is searched for a <script> element whose text starts with
  'ReactReduxInitializer'. The JSON object inside that script is parsed and
  its 'product' and 'reviews' entries are returned.

  Args:
    url: address of the product-review page to fetch.

  Returns:
    Tuple (product_data, review_data) taken from the embedded JSON.

  Raises:
    ValueError: if no matching script is found, the script contains no '{',
      or the extracted text is not valid JSON (json.JSONDecodeError is a
      ValueError subclass).
    KeyError: if the JSON has no 'product' or 'reviews' entry.
    requests.exceptions.RequestException: if the request fails or times out.
  """
  # A timeout keeps one stalled connection from hanging the whole scrape.
  response = requests.get(url, timeout=30)
  results_page = BeautifulSoup(response.content, 'lxml')

  # Find the script beginning with ReactReduxInitializer. As before, the last
  # matching script wins if there is more than one.
  script_string = None
  for item in results_page.find_all('script'):
    text = item.get_text().strip()
    if text.startswith('ReactReduxInitializer'):
      script_string = text

  # Fail with a clear message instead of an UnboundLocalError further down.
  if script_string is None:
    raise ValueError('No ReactReduxInitializer script found at ' + str(url))

  # Convert to JSON: keep everything from the first '{' and drop the final
  # two characters (presumably the closing ");" of the initializer call).
  pos = script_string.find('{')
  if pos == -1:
    raise ValueError('ReactReduxInitializer script has no JSON object at ' + str(url))
  json_string = script_string[pos:-2]
  metadata = json.loads(json_string)

  product_data = metadata['product']
  review_data = metadata['reviews']
  return product_data, review_data

### Run over all Product URLs

In [None]:
# One entry per product, in the iteration order of id_link
product_data_list, review_data_list = [], []

# Count from 1 so the printed progress number matches the product's position
for i, id_ in enumerate(id_link, start=1):
  print(i)
  url = 'https://www.renttherunway.com/product_reviews/' + id_
  product_data, review_data = get_metadata(url)
  product_data_list.append(product_data)
  review_data_list.append(review_data)

In [None]:
# make pickle file names; the scraped page range is encoded in each name
pkl_product = 'product_data_list' + str(start) + '_' + str(end) + '.pkl'
pkl_review = 'review_data_list' + str(start) + '_' + str(end) + '.pkl'

# save to drive
link = 'https://drive.google.com/open?id=1tQpenCBv0HU9ekuEvrA1gdqg99mxnUL8'
# The folder id is everything after "=". It is stored as `folder_id` so the
# builtin id() and IPython's `_` are not overwritten for the rest of the session.
folder_id = link.split("=", 1)[1]

# Dump each list locally, then upload it into the Drive folder: product data
# first, review data second.
for pkl_name, pkl_payload in ((pkl_product, product_data_list),
                              (pkl_review, review_data_list)):
  with open(pkl_name, 'wb') as f:
    pickle.dump(pkl_payload, f)

  file = drive.CreateFile({'parents':[{u'id': folder_id}]})
  file.SetContentFile(pkl_name)
  file.Upload()


### Download pickle files to Colab Cloud

In [None]:
# choose a local (colab) directory to store the data.
local_download_path = os.path.expanduser('~/data/pickle/')
# exist_ok=True tolerates an already-existing directory, while real failures
# (e.g. permissions) surface here instead of being swallowed by a bare except.
os.makedirs(local_download_path, exist_ok=True)

# 2. Auto-iterate using the query syntax
#    https://developers.google.com/drive/v2/web/search-parameters
file_list = drive.ListFile(
    {'q': "'11mLhhzlzjCeArB4oE0F957IgaxHdG5et' in parents"}).GetList()

for f in file_list:
  # 3. Create & download by id; the Drive title is reused as the local file name.
  print('title: %s, id: %s' % (f['title'], f['id']))
  fname = os.path.join(local_download_path, f['title'])
  print('downloading to {}'.format(fname))
  f_ = drive.CreateFile({'id': f['id']})
  f_.GetContentFile(fname)

In [None]:
# retrieve data from pickle files
# NOTE(review): these 'all_*' file names differ from the
# 'product_data_list<start>_<end>.pkl' / 'review_data_list<start>_<end>.pkl'
# names written by the scraping cells; presumably merged files were placed in
# the Drive folder separately - confirm.
# NOTE(review): the absolute /root prefix assumes '~' expands to /root, which
# is where the download cell writes on Colab - confirm when running elsewhere.
pkl_product_file = '/root/data/pickle/all_product_data_list.pkl'
pkl_review_file = '/root/data/pickle/all_review_data_list.pkl'

# product_data_list
# pickle.load executes whatever the file encodes; only load files you produced.
with open(pkl_product_file, 'rb') as f:
  product_data_list = pickle.load(f)

# review_data_list
with open(pkl_review_file, 'rb') as f:
  review_data_list = pickle.load(f)

# print(len(product_data_list), len(review_data_list))

In [None]:
# retrieve id_link from pickle file
# The scraping cell writes id_link.pkl to the working directory, but on a
# fresh runtime the file only exists where the Drive download cell put it
# (~/data/pickle/). Prefer the working-directory copy and fall back to the
# downloaded one.
id_link_path = 'id_link.pkl'
if not os.path.exists(id_link_path):
  id_link_path = os.path.join(os.path.expanduser('~/data/pickle/'), 'id_link.pkl')

# pickle.load executes whatever the file encodes; only load files you produced.
with open(id_link_path, 'rb') as f:
  id_link = pickle.load(f)

## Get Product attributes based on Product ID

### Define function for individual product

In [None]:
def get_product_details(product_data):
  """Extract the product id and a flat dict of selected attributes.

  Only the attributes listed in `cols_to_keep` are copied. Two of them are
  flattened on the way:
    * 'designer' becomes 'designer_name' (its 'displayName' entry)
    * 'price' becomes 'price_base' and 'price_adjusted' (its 'base' and
      'adjusted' entries)
  If 'designer' or 'price' is present but is not a mapping (for example a
  JSON null), the flattened values are None instead of raising.

  Args:
    product_data: dict of raw product attributes; must contain 'id'.

  Returns:
    Tuple (product_id, inner_dict). inner_dict keeps the key order of
    product_data.

  Raises:
    KeyError: if product_data has no 'id' entry.
  """
  # get product id
  k = product_data['id']

  # attributes to keep
  cols_to_keep = {'ageRanges', 'bodyTypes', 'colors', 'created', 'designer',
                  'displayName', 'embellishments', 'formality', 'formalityScore',
                  'length', 'neckline', 'occasions', 'price', 'productDetails',
                  'retailPrice', 'season', 'sleeve', 'stylistNotes'}

  inner_dict = {}

  # iterate in product_data order so the output key order is unchanged
  for key, value in product_data.items():
    if key not in cols_to_keep:
      continue
    if key == 'designer':
      # tolerate a missing/null designer record
      designer = value if hasattr(value, 'get') else {}
      inner_dict['designer_name'] = designer.get('displayName')
    elif key == 'price':
      # tolerate a missing/null price record
      price = value if hasattr(value, 'get') else {}
      inner_dict['price_base'] = price.get('base')
      inner_dict['price_adjusted'] = price.get('adjusted')
    else:
      inner_dict[key] = value
  return k, inner_dict

### Run over all products

In [None]:
# Map every product id to its flattened attribute dict. get_product_details
# returns (id, attributes) pairs, which dict() consumes directly; a repeated
# id keeps the attributes from its last occurrence.
product_dict = dict(get_product_details(item) for item in product_data_list)

## Get user reviews for products

### Define function for getting reviews for one product from all users

In [None]:
def get_product_review(review_data):
  """Restructure the raw review payload of one product.

  Args:
    review_data: dict with a 'data' list of review entries (each holding a
      'moment' dict and a 'user' dict) plus the optional summary fields
      'count', 'currentCount' and 'averageRating'.

  Returns:
    (item_id, inner_dict) where item_id is the 'styleName' of the first
    review's moment and inner_dict has two entries:
      'summary': the three summary fields
      'reviews': user id -> {'userData': {...}, 'userReview': {...}}
    Returns (None, None) when the 'data' list is empty.
  """
  entries = review_data['data']

  # no reviews: nothing to extract
  if len(entries) == 0:
    return None, None

  # product id is read from the first review
  item_id = entries[0].get('moment')['styleName']

  # field names, in the order they appear in the output dicts
  summary_fields = ('count', 'currentCount', 'averageRating')
  leading_user_fields = ('age', 'birthday', 'bodyType', 'bustSize')
  trailing_user_fields = ('height', 'heightInches', 'joined', 'nickName',
                          'standardSize', 'usStandardSize', 'weight', 'weightPounds')
  review_fields = ('caption', 'content', 'fit', 'rating', 'reviewId', 'uploadedAt')

  inner_dict = {
      'summary': {name: review_data.get(name) for name in summary_fields},
      'reviews': {},
  }

  for entry in entries:
    moment = entry.get('moment')
    user_id = moment['userId']
    user = entry['user']

    # user attributes; numReviewsByUser comes from the moment, not the user record
    user_dict = {name: user.get(name) for name in leading_user_fields}
    user_dict['numReviewsByUser'] = moment.get('numReviewsByUser')
    user_dict.update((name, user.get(name)) for name in trailing_user_fields)

    # the review itself
    user_review_dict = {name: moment.get(name) for name in review_fields}

    # keyed by user id, so a later review by the same user replaces an earlier one
    inner_dict['reviews'][user_id] = {
        'userData': user_dict,
        'userReview': user_review_dict,
    }

  return item_id, inner_dict

### Getting reviews for all products from all users

In [None]:
# product id -> restructured reviews, for products with at least one review
all_review_dict = {}

for item in review_data_list:
  item_id, review_data = get_product_review(item)

  # get_product_review returns (None, None) for products without reviews
  if review_data is None:
    continue
  all_review_dict[item_id] = review_data

## Convert scraped data to Dataframe

### Product data

In [None]:
# Convert product_dict to dataframe.
# product_dict maps product id -> attribute dict, so the DataFrame constructor
# places the ids in the columns; .T transposes the frame so each product
# becomes one row indexed by its id.
df_products = pd.DataFrame(product_dict).T

### Review data

In [None]:
# This is a multi level dictionary, therefore, we need to restructure it to convert to dataframe

# One flat record per (product, user) pair
df_dict = {}

# iterate over all products
for item_id, inner_dict in all_review_dict.items():
  # product level data for reviews (summary)
  summary = inner_dict['summary']
  averageRating = summary.get('averageRating')
  count = summary.get('count')
  currentCount = summary.get('currentCount')

  # iterate over all user reviews
  for user, details in inner_dict['reviews'].items():
    # key is the combination (item_id, averageRating, count, currentCount, user)
    key = (item_id, averageRating, count, currentCount, user)

    # Merge user attributes and review fields into a NEW dict. Updating
    # details['userData'] in place would silently change the records stored
    # in all_review_dict.
    df_dict[key] = {**details['userData'], **details['userReview']}


In [None]:
# Names for the five parts of the tuple key once reset_index() has turned
# them into columns level_0 .. level_4
index_level_names = {
    "level_0": "item_id",
    "level_1": "averageRating",
    "level_2": "countRatings",
    "level_3": "currentCount",
    "level_4": "user_id",
}

# One row per (product, user) review: transpose so the tuple keys become the
# row index, flatten that index into columns, then give them readable names
df_review = (
    pd.DataFrame(df_dict)
    .T
    .reset_index()
    .rename(columns=index_level_names)
)

## Fixing columns

### Function to get unique items in columns with entries as list

In [None]:
def unique_cols(df, col):
  """Return the distinct items found in the list-valued cells of df[col].

  Cells that are not plain lists (for example NaN or a placeholder string)
  are ignored. Items are returned in order of first appearance.
  """
  uniq_list = []
  # only cells that hold a list contribute items
  list_cells = (cell for cell in df[col].values if type(cell) == list)
  for cell in list_cells:
    for entry in cell:
      if entry in uniq_list:
        continue
      uniq_list.append(entry)
  return uniq_list

### Function to convert column with list to dummy variables

In [None]:
def list_to_cols(df, col):
  """Add one 0/1 indicator column per distinct item of a list-valued column.

  Missing values in df[col] are first replaced by the string 'NA'. For every
  distinct item found in the list cells (via unique_cols) a column named
  '<col>_<item>' is added, holding 1 where the row's list contains the item
  and 0 otherwise. Rows whose cell is not a list get 0 everywhere.

  The frame is modified in place and also returned, as before.

  Args:
    df: DataFrame to extend; any index is supported.
    col: name of the column whose cells are lists of items.

  Returns:
    The same DataFrame with the indicator columns added.
  """
  # replace nans
  df[col] = df[col].fillna('NA')

  # get unique items
  uniq_items = unique_cols(df, col)

  # Build each indicator column in one pass over the cell values and assign
  # it as a whole column. Writing cell by cell through df[name][i] is a
  # chained assignment (it may write to a temporary copy) and it looks rows
  # up by integer position through a label index; whole-column assignment
  # avoids both. `and` short-circuits, so membership is only tested on lists.
  cells = df[col].values
  for uniq_item in uniq_items:
    df[f'{col}_{uniq_item}'] = [
        1 if (isinstance(cell, list) and uniq_item in cell) else 0
        for cell in cells
    ]
  return df

### Convert list items to dummy variables

In [None]:
# Expand each list-valued attribute into 0/1 indicator columns, then drop the
# original list column. Columns are processed in this order.
for list_col in ['ageRanges', 'bodyTypes', 'colors', 'formality', 'occasions', 'embellishments']:
  df_products = list_to_cols(df_products, list_col)
  df_products = df_products.drop(columns = [list_col])

### Join image links to dataframes

In [None]:
# Build a one-column frame of image links, indexed by the keys of id_link
# (the product ids collected while scraping the listing pages).
df_id_link = pd.DataFrame.from_dict(id_link, orient='index')
df_id_link.columns = ['product_img_link']

# products: joined index-to-index; df_products is indexed by product id
# NOTE(review): presumably re-running this cell after a successful run raises
# on the already-present product_img_link column - confirm.
df_products = df_products.join(df_id_link)

# reviews: the item_id column is matched against df_id_link's index
df_review = df_review.join(df_id_link, on='item_id')

# Export processed data to pickle file

## Products

In [None]:
# Write the processed product frame to a local pickle file
with open('products_processed.pkl', 'wb') as f:
    pickle.dump(df_products, f)

# save to drive
link = 'https://drive.google.com/open?id=11mLhhzlzjCeArB4oE0F957IgaxHdG5et'
# The folder id is everything after "=". It is stored as `folder_id` so the
# builtin id() and IPython's `_` are not overwritten for the rest of the session.
folder_id = link.split("=", 1)[1]

# create the Drive file inside that folder and upload the pickle
file = drive.CreateFile({'parents':[{u'id': folder_id}]})
file.SetContentFile('products_processed.pkl')
file.Upload()

## Reviews

In [None]:
# Write the processed review frame to a local pickle file
with open('reviews_processed.pkl', 'wb') as f:
    pickle.dump(df_review, f)

# save to drive
link = 'https://drive.google.com/open?id=11mLhhzlzjCeArB4oE0F957IgaxHdG5et'
# The folder id is everything after "=". It is stored as `folder_id` so the
# builtin id() and IPython's `_` are not overwritten for the rest of the session.
folder_id = link.split("=", 1)[1]

# create the Drive file inside that folder and upload the pickle
file = drive.CreateFile({'parents':[{u'id': folder_id}]})
file.SetContentFile('reviews_processed.pkl')
file.Upload()