In [None]:
# -----------------------------------------------------------------------------------------------------------------------
# Documentation 
# -----------------------------------------------------------------------------------------------------------------------

#  ---------------- PROGRAM INFORMATION ---------------------------------------------------------------------------------
# Name: Product Review Webscraping Program
# Author: Sufiyan Syed
# Date: 09/22/2024

#  ---------------- PURPOSE ---------------------------------------------------------------------------------------------
# This program is designed to scrape product reviews from an Amazon product page using a specified URL input.
# It extracts details such as the reviewer name, star rating, review title, review date, purchase verification,
# and the review text. The extracted reviews are stored in a list and saved as a CSV file for further analysis.

#  ---------------- SELECTS ---------------------------------------------------------------------------------------------
# The input to this program is the URL of the product reviews webpage.

#  ---------------- NOTES -----------------------------------------------------------------------------------------------
# The program only extracts the reviews present on the URL's webpage. Pagination of the reviews webpage must be 
# done manually with separate URL inputs. 

#  ---------------- DEPENDENCIES --------------------------------------------------------------------------------
# Load Packages 
import os # get current working directory 

import time # pause between request retries

from datetime import datetime # get current date 

from urllib.request import Request, urlopen # Request and urlopen are used to send a request to the URL and open the 
                                            # webpage for scraping. 

from bs4 import BeautifulSoup # BeautifulSoup is used to parse the HTML content of the webpage and extract specific 
                              # data elements (e.g., reviews, ratings, titles).

import pandas as pd # pandas is used to create and manipulate a DataFrame, which holds the extracted review data,
                    # and to save it as a CSV file.

#  ---------------- DATA ---------------------------------------------------------------------------------------
# Output directory: the current working directory at the time this cell runs
savdir1 = os.getcwd()

# Output file inside the output directory
# NOTE(review): the file name looks like a placeholder template (product / category / star / page #) -- confirm
savfil1 = os.path.join(savdir1, "productname_category_star_#.csv")

#  ---------------- ENVIRONMENT ----------------------------------------------------------------------------------
# Program metadata
# NOTE(review): filpath is the working directory, so prgname is that directory's last path component
# (minus any extension-like suffix), not the notebook's file name.
filpath = os.getcwd()
prgname = os.path.splitext(os.path.split(filpath)[1])[0]
prgdate = format(datetime.now(), '%y%m%d')  # current date as two-digit year, month, day



In [None]:

# -----------------------------------------------------------------------------------------------------------------------
# Prepare function to extract reviews
# -----------------------------------------------------------------------------------------------------------------------

# Request headers: a browser-like User-Agent string sent with every page request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
}

# Define function to get webpage results into Beautifulsoup object 
def get_page(url, headers, timeout=30):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the webpage to download.
    headers : dict or None
        HTTP request headers (e.g. a browser-like User-Agent). ``str`` values
        are UTF-8 encoded before being sent; any other value is passed through
        unchanged. ``None`` is treated as "no extra headers".
    timeout : float, optional
        Seconds to wait on the connection before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or ``None`` when the request or the parsing fails.
        Failures are printed (message plus traceback) rather than raised, so
        callers must check for ``None`` before using the result.
    """
    try:
        encoded_headers = {
            key: value.encode('utf-8') if isinstance(value, str) else value
            for key, value in (headers or {}).items()
        }
        req = Request(url, headers=encoded_headers)
        # The context manager closes the HTTP response once the body is parsed.
        with urlopen(req, timeout=timeout) as page:
            return BeautifulSoup(page, "html.parser")
    except Exception as e:
        print(f"Error occurred: {e}")
        import traceback
        traceback.print_exc()
        return None


# -----------------------------------------------------------------------------------------------------------------------
# Gather HTML from URL
# -----------------------------------------------------------------------------------------------------------------------
# Replace REVIEW_URL for each page of reviews. 
REVIEW_URL = "https://www.amazon.com/Thrustmaster-Yoke-PACK-Boeing-Xbox-x/dp/B09DPYHM55/ref=sr_1_5?crid=29KZ42OY0C859&dib=eyJ2IjoiMSJ9.gQXGQTXbcMi54OeLHcBcCdy8bUzAiP_khKF2HF5sDWteRLmjwsCoEm-T8ES3RJDk_pwvrXj5zel4nKbkyOA8Kc_d9eOe7P4LdnBf0ENU7pdjnW7xTK1Qdyl53s_jPhNO8xJh635sqWj92SiLQhRkqB6v3zgWbPwxbaXYM6A05uYAoi3s7d1u3lfAfrqrdOXxUbA2DWaManb3dv3M-e02zwsWrCouonZwu86t9YX1RCs.cKc6vAHkdCKax4iiYjtUPIMLhnIJikbZP9z_UlLYEaM&dib_tag=se&keywords=thrustmaster&qid=1733025366&sprefix=thrustmas%2Caps%2C203&sr=8-5&th=1"

MAX_ATTEMPTS = 5         # give up after this many unsuccessful fetches instead of looping forever
RETRY_DELAY_SECONDS = 2  # pause between attempts so the site is not hit in a tight loop

# Retry until the fetched page contains an <h1>, up to MAX_ATTEMPTS times.
# get_page() yields None when the request fails, so that case counts as a failed attempt too.
soup = None
header1 = None
for attempt in range(1, MAX_ATTEMPTS + 1):
    # Get html
    soup = get_page(REVIEW_URL, headers=headers)

    # Get h1 (only when a page was actually returned)
    header1 = soup.find('h1') if soup is not None else None

    # Exit the loop as soon as an <h1> has been found
    if header1 is not None:
        break

    # Wait before the next attempt, but not after the last one
    if attempt < MAX_ATTEMPTS:
        time.sleep(RETRY_DELAY_SECONDS)
else:
    # The loop ran out of attempts without a break: stop here rather than continue with an unusable page
    raise RuntimeError(f"No page with an <h1> could be retrieved after {MAX_ATTEMPTS} attempts: {REVIEW_URL}")
    

# -----------------------------------------------------------------------------------------------------------------------
# Run Checks If Needed 
# -----------------------------------------------------------------------------------------------------------------------

#print(soup)
#print("--------------")
#print(header1)
#print(header1.attrs)
#product_title = header1.get_text().strip()
#print(product_title)
#all_headers = soup.find_all('h1')
#print(all_headers)
#type(all_headers)


# -----------------------------------------------------------------------------------------------------------------------
# Extract Review Data 
# -----------------------------------------------------------------------------------------------------------------------

def _tag_text(tag, default=""):
    """Return the stripped text of ``tag``, or ``default`` when ``tag`` is None."""
    return tag.get_text().strip() if tag is not None else default


def parse_review(review):
    """Extract the fields of one review element into a dictionary.

    Parameters
    ----------
    review : bs4.element.Tag
        A ``<div data-hook="review">`` element.

    Returns
    -------
    dict
        Keys: 'Reviewer', 'Star Rating', 'Review Title', 'Review Date',
        'Purchase Verification', 'Review Text'. A field whose element is
        missing gets a placeholder value instead of raising, so one
        incomplete review cannot abort the whole extraction.
    """
    reviewer_tag = review.find('span', {'class': 'a-profile-name'})
    star_rating_tag = review.find('i', {'data-hook': ['review-star-rating', 'cmps-review-star-rating']})

    # The review title is looked up in an <a> tag first, then in a <span> tag
    review_title_tag = review.find('a', {'data-hook': 'review-title'})
    if review_title_tag is None:
        review_title_tag = review.find('span', {'data-hook': 'review-title'})

    review_date_tag = review.find('span', {'data-hook': 'review-date'})
    purchase_verification_tag = review.find('span', {'data-hook': 'avp-badge'})

    # A review body may be spread over several spans; join their texts with spaces
    review_text_spans = review.find_all('span', {'data-hook': 'review-body'})
    review_text = ' '.join(span.get_text().strip() for span in review_text_spans)

    return {'Reviewer': _tag_text(reviewer_tag, "Reviewer not found"),
            'Star Rating': _tag_text(star_rating_tag, "No Rating"),
            'Review Title': _tag_text(review_title_tag, "Title not found"),
            'Review Date': _tag_text(review_date_tag, "Date not found"),
            'Purchase Verification': _tag_text(purchase_verification_tag, ""),
            'Review Text': review_text}


# Locate every review element on the page
review_content = soup.find_all('div', {'data-hook': 'review'})

# Build the list of dictionaries, one per review
reviews_list = [parse_review(review) for review in review_content]


# -----------------------------------------------------------------------------------------------------------------------
# Output
# -----------------------------------------------------------------------------------------------------------------------

# Fixed column set and order for the output file. Passing it explicitly keeps the header row in the CSV
# even when no reviews were extracted (a DataFrame built from an empty list has no columns at all, and
# the resulting header-less file cannot be read back with pd.read_csv).
REVIEW_COLUMNS = ['Reviewer',
                  'Star Rating',
                  'Review Title',
                  'Review Date',
                  'Purchase Verification',
                  'Review Text']

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(reviews_list, columns=REVIEW_COLUMNS)

# Make an empty result visible instead of silently writing an empty file
if df.empty:
    print("Warning: no reviews were extracted; the saved CSV contains only the header row.")

# Save
df.to_csv(savfil1, index=False)

