In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, NavigableString, Tag

In [2]:
# Browser-like User-Agent header passed as `headers=` to the scraping requests.
user_agent = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/84.0.4147.105 Safari/537.36"
    ),
}

## Amazon

In [3]:
# Scrape Amazon review pages 1-159 (sorted by most recent) and collect the raw
# star-rating text and review-body text of every review card.
amazon_review_dict = {'rating':[], 'review':[]}

for num in range(1,160):
    url = 'https://www.amazon.com/Oculus-Quest-All-Gaming-System-PC/product-reviews/B07HNW68ZC/ref=cm_cr_arp_d_viewopt_srt?ie=UTF8&reviewerType=all_reviews&sortBy=recent&pageNumber={}'.format(num)
    # Without a timeout, a single stalled connection can hang the cell forever.
    response = requests.get(url, headers=user_agent, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')
    for reviews in soup.find_all('div', {'data-hook':'review'}):
        rating_tag = reviews.find('i', attrs={"data-hook": "review-star-rating"})
        body_tag = reviews.find('span',{'data-hook':'review-body'})
        # find() returns None when a card lacks the element. Look up both
        # pieces first and append them together, so a card missing either one
        # is skipped instead of raising AttributeError and leaving the two
        # lists with different lengths.
        if rating_tag is None or body_tag is None:
            continue
        amazon_review_dict['rating'].append(rating_tag.text)
        amazon_review_dict['review'].append(body_tag.text)

In [4]:
# Turn the scraped Amazon lists into a two-column frame (rating, review).
amazon_df = pd.DataFrame.from_dict(amazon_review_dict)

In [5]:
# Tag every Amazon row with its source so vendors stay distinguishable later.
amazon_df = amazon_df.assign(vendor='Amazon')

## BestBuy

In [6]:
# Scrape BestBuy review pages 1-99 and collect the raw rating text and review
# body text. Ratings and bodies are located by two independent selectors, so
# each page is validated before its data is kept.
bb_review_dict = {'rating':[], 'review':[]}
for num in range(1,100):
    url = 'https://www.bestbuy.com/site/reviews/oculus-quest-all-in-one-vr-gaming-headset-64gb-black/6342914?variant=A&page={}'.format(num)
    # Without a timeout, a single stalled connection can hang the cell forever.
    response = requests.get(url, headers=user_agent, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')
    rating_tags = [
        box.find('p', class_='sr-only')
        for box in soup.find_all('div', class_='c-ratings-reviews-v2 v-small')
    ]
    body_tags = [
        box.find('p', class_='pre-white-space')
        for box in soup.find_all('div', class_='ugc-review-body body-copy-lg')
    ]
    # find() returns None for a container without the expected <p>. If any tag
    # is missing, or the two selectors matched a different number of elements,
    # the i-th rating can no longer be assumed to belong to the i-th body.
    # Skip the whole page rather than store misaligned pairs or lists of
    # unequal length (which pd.DataFrame would reject).
    page_is_consistent = (
        len(rating_tags) == len(body_tags)
        and all(tag is not None for tag in rating_tags)
        and all(tag is not None for tag in body_tags)
    )
    if not page_is_consistent:
        print('Skipping BestBuy page {}: {} rating elements vs {} review bodies'.format(
            num, len(rating_tags), len(body_tags)))
        continue
    bb_review_dict['rating'].extend(tag.text for tag in rating_tags)
    bb_review_dict['review'].extend(tag.text for tag in body_tags)

In [7]:
# Turn the scraped BestBuy lists into a two-column frame (rating, review).
bestbuy_df = pd.DataFrame(data=bb_review_dict)

In [8]:
# Tag every BestBuy row with its source.
bestbuy_df = bestbuy_df.assign(vendor='BestBuy')

## Walmart

In [9]:
# Scrape Walmart review pages 1-16 (newest submissions first) and collect the
# raw rating text and review body text into two parallel lists.
# NOTE(review): ratings and bodies are gathered by two independent selectors,
# so nothing here guarantees that rating[i] belongs to review[i] or that both
# lists end up the same length -- confirm against the page markup.
rating = []
review = []
for num in range(1,17):
    url = 'https://www.walmart.com/reviews/product/472031416?sort=submission-desc&page={}'.format(num)
    # Without a timeout, a single stalled connection can hang the cell forever.
    response = requests.get(url, headers=user_agent, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')
    # A missing inner element makes find() return None and raises
    # AttributeError below. That is left as a hard failure on purpose:
    # silently dropping items would shift the two lists against each other.
    for reviews in soup.find_all('span', class_='average-rating'):
        rating.append(reviews.find('span', class_='visuallyhidden seo-avg-rating').text)
    for reviews in soup.find_all('div', class_='review-text'):
        review.append(reviews.find('p').text)

In [17]:
# Drop surplus ratings so `rating` is no longer than `review`; the DataFrame
# built from them needs equal-length columns. The cut-off is derived from the
# scraped data instead of a hard-coded count, so it stays correct when the
# number of reviews on the site changes.
# NOTE(review): this only equalises lengths -- it assumes the extra ratings
# are at the tail of the list; verify that the remaining pairs line up.
del rating[len(review):]

In [19]:
# Bundle the two Walmart lists under the same column names the other vendors use.
wm = dict(rating=rating, review=review)

In [20]:
# Turn the Walmart lists into a two-column frame (rating, review).
wm_df = pd.DataFrame(data=wm)

In [21]:
# Tag every Walmart row with its source.
wm_df = wm_df.assign(vendor='Walmart')

### Data Wrangling

In [22]:
# Keep only the first character of each scraped Amazon rating string.
amazon_df['rating'] = amazon_df['rating'].map(lambda raw_rating: raw_rating[0])

In [23]:
# Keep the second whitespace-separated token of each scraped BestBuy rating string.
bestbuy_df['rating'] = bestbuy_df['rating'].apply(lambda raw_rating: raw_rating.split()[1])

In [24]:
# Keep only the first character of each scraped Walmart rating string.
wm_df['rating'] = wm_df['rating'].map(lambda raw_rating: raw_rating[0])

In [25]:
# Remove leading/trailing whitespace from each Amazon review body.
amazon_df['review'] = amazon_df['review'].map(lambda body: body.strip())

In [26]:
# Remove leading/trailing whitespace from each BestBuy review body.
bestbuy_df['review'] = bestbuy_df['review'].map(lambda body: body.strip())

In [27]:
# Remove leading/trailing whitespace from each Walmart review body.
wm_df['review'] = wm_df['review'].map(lambda body: body.strip())

In [28]:
# Stack the three vendor frames into one table with a fresh 0..n-1 index.
vendor_frames = [amazon_df, bestbuy_df, wm_df]
ocquest_reviews = pd.concat(vendor_frames, ignore_index=True)

References:

- https://github.com/amueller/word_cloud
- https://www.datacamp.com/community/tutorials/wordcloud-python
- https://towardsdatascience.com/web-scraping-metacritic-reviews-using-beautifulsoup-63801bbe200e

In [29]:
# Display the combined Amazon + BestBuy + Walmart reviews table.
ocquest_reviews

Unnamed: 0,rating,review,vendor
0,5,"The product is suitable for description, excel...",Amazon
1,5,Like the games different types for different p...,Amazon
2,4,This is a great headset. Running tethered to a...,Amazon
3,3,"Pros:Game play, when the game (HL Alyx) actual...",Amazon
4,5,Came 2 weeks early! And works so smooth!Cant t...,Amazon
...,...,...,...
3760,5,The hype is real! This is THE console to have ...,Walmart
3761,5,Awesome when it works. After setting it up an...,Walmart
3762,5,"This is, hands down, the best virtual reality ...",Walmart
3763,5,Its a great device if you have never tried VR ...,Walmart


In [38]:
# Ratings were scraped as text; convert the column to integers.
ocquest_reviews['rating'] = ocquest_reviews['rating'].astype(int)

In [41]:
# Count how many reviews fall on each star rating.
ocquest_reviews['rating'].value_counts()

5    3043
4     425
1     118
3     101
2      78
Name: rating, dtype: int64

In [42]:
# Save the combined reviews to disk without writing the index column.
ocquest_reviews.to_csv(path_or_buf='quest_reviews.csv', index=False)