In [77]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests 
import csv
import re

# Scraping Refrigerator Information

In [78]:
def get_fridge_details():
    """Scrape the "Popular Fridge Parts" listing on PartSelect into fridge_parts.csv.

    Downloads the fridge parts landing page, locates the "Popular Fridge Parts"
    heading, and walks the image / detail / price / stock elements that follow
    it in lockstep, printing each image title as it goes. A part is kept only
    when the part number embedded in the image's alt text equals the
    manufacturer part number shown in the part's detail block.

    The kept rows are written to the hard-coded CSV path with the columns
    Part_Number, Manufacturer_Part_Number, Manufacturer, Title, Description,
    Price, Stock, Product_link.

    Returns:
        None. The CSV file is the only result.

    Raises:
        requests.HTTPError: the listing page answered with an error status.
        ValueError: the "Popular Fridge Parts" heading is not on the page.
    """
    fridge_url = "https://www.partselect.com/Fridge-Parts.htm"
    # A timeout keeps the cell from hanging forever on a stalled connection, and
    # raise_for_status stops an error page from being parsed as if it were the listing.
    page = requests.get(fridge_url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    popular_parts_section = soup.find('h2', class_='section-title', string='Popular Fridge Parts')
    if popular_parts_section is None:
        raise ValueError(f"'Popular Fridge Parts' heading not found on {fridge_url}")

    image_titles = popular_parts_section.find_all_next('img', alt=True)
    parts = popular_parts_section.find_all_next('div', class_='nf__part__detail')
    price_parts_div = popular_parts_section.find_all_next('div', class_='nf__part__left-col__basic-info__price')
    stock_parts_div = popular_parts_section.find_all_next('div', class_='nf__part__left-col__basic-info__stock')

    # Check if the part number in the title matches the manufacturer number
    pattern = r'Part Number: (\w+)'
    data = []

    # The four result lists are paired purely by position, so the title/manufacturer
    # number comparison below is what rejects pairs that do not belong together.
    for i, (img, part, prices, stocks) in enumerate(zip(image_titles, parts, price_parts_div, stock_parts_div), start=1):
        print(i, ". Image Title:", img['alt'])
        match = re.search(pattern, img['alt'])
        if not match:
            continue

        number_divs = part.find_all('div', class_='nf__part__detail__part-number')
        title_link = part.find('a', class_='nf__part__detail__title')
        markup_complete = (
            len(number_divs) >= 2
            and number_divs[0].strong is not None
            and number_divs[1].strong is not None
            and title_link is not None
            and title_link.get('href')
        )
        if not markup_complete:
            # Skip a part whose detail block lacks the expected elements instead of crashing.
            continue

        part_number = number_divs[0].strong.text
        manufacturer_number = number_divs[1].strong.text
        if match.group(1) != manufacturer_number:
            continue

        detail_title = title_link.text.strip()
        url = title_link['href']

        # The description is the text node sitting directly before the symptoms div.
        explanation_div = part.find('div', class_='nf__part__detail__symptoms')
        preceding_node = explanation_div.previous_sibling if explanation_div is not None else None
        if isinstance(preceding_node, str):
            explanation = preceding_node.strip()
        else:
            explanation = "No explanation available"

        price_div = prices.find('div', class_='mt-sm-2 price')
        price = price_div.get_text(strip=True) if price_div is not None else "Price not available"
        stock_div = stocks.find('div', class_='mb-1 mb-sm-2 js-tooltip')
        stock = stock_div.get_text(strip=True) if stock_div is not None else "Stock not available"

        # The second hyphen-separated token of the product link fills the Manufacturer column.
        url_tokens = url.split('-')
        manufacturer = url_tokens[1] if len(url_tokens) > 1 else "Unknown"

        data.append([part_number, manufacturer_number, manufacturer, detail_title, explanation, price, stock, url])

    with open('/Users/sarah_prakriti_peters/Documents/Instalily/data/fridge_parts.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Part_Number', 'Manufacturer_Part_Number', 'Manufacturer', 'Title', 'Description', 'Price', 'Stock', 'Product_link'])
        writer.writerows(data)

In [79]:
# Run the refrigerator listing scrape: prints every image title it visits and
# (re)writes fridge_parts.csv, which the cells below read back.
get_fridge_details()

1 . Image Title: Refrigerator Ice and Water Filter – Part Number: EDR1RXD1
2 . Image Title: Refrigerator Door Shelf Bin – Part Number: 242126602
3 . Image Title: BELT DRIVE – Part Number: WE03X29897
4 . Image Title: Refrigerator Door Shelf Bin – Part Number: WPW10321304
5 . Image Title: Oven Bake Element – Part Number: 316075103
6 . Image Title: Door Shelf Retainer Bar – Part Number: 240534901
7 . Image Title: Refrigerator Water Filter – Part Number: EDR4RXD1
8 . Image Title: Refrigerator Crisper Drawer with Humidity Control – Part Number: WP2188656
9 . Image Title: Drum Bearing Slide - White (Sold individually) – Part Number: WE1M504
10 . Image Title: Drum Bearing Slide - Green (Sold individually) – Part Number: WE1M1067


In [80]:
def pull_product_specs(link, mpn):
    """Fetch one product page and extract its troubleshooting and rating details.

    Args:
        link: Site-relative product path; it is appended to
            "https://www.partselect.com" to form the page URL.
        mpn: Manufacturer part number, used to locate the
            "Part# <mpn> replaces these:" heading.

    Returns:
        A 6-tuple ``(symptoms_fixed, replacements_list, product_info,
        average_product_rating, num_reviews, average_repair_rating)``.
        ``replacements_list`` is a list of strings; the other items are
        strings. A field whose markup is missing from the page is returned as
        a "... not found" placeholder instead of raising.

    Raises:
        requests.HTTPError: the product page answered with an error status.
    """
    product_url = f"https://www.partselect.com{link}"
    # Bound the wait and refuse to parse an error page as product data.
    product_page = requests.get(product_url, timeout=30)
    product_page.raise_for_status()
    soup = BeautifulSoup(product_page.content, "html.parser")

    # Defaults are assigned up front so both names are always bound, even when
    # the page has no Troubleshooting section at all.
    symptoms_fixed = "Symptoms information not found"
    replacements_list = ["Not found"]

    # Troubleshooting Section
    # Contains information about what kinds of problems this part solves, which products it is compatible with and what it can replace
    troubleshooting_section = soup.find("div", {"id": "Troubleshooting"})

    if troubleshooting_section is not None:
        symptoms_fixed_element = troubleshooting_section.find_next(
            "div", class_="bold", string="This part fixes the following symptoms:"
        )
        # The symptom text is the bare text node right after the bold heading.
        symptoms_node = symptoms_fixed_element.next_sibling if symptoms_fixed_element is not None else None
        if isinstance(symptoms_node, str):
            symptoms_fixed = symptoms_node.strip()

        replacements_header = soup.find("div", class_="bold", string=f"Part# {mpn} replaces these:")
        # Look for the sibling only once the heading itself is known to exist.
        replacements = replacements_header.find_next_sibling("div") if replacements_header is not None else None
        if replacements is not None:
            replacements_list = [r.strip() for r in replacements.get_text(strip=True).split(',')]

    product_info = "Compatible products not found"
    product_info_div = soup.find("div", class_="bold mb-1", string="This part works with the following products:")
    product_info_body = product_info_div.find_next("div") if product_info_div is not None else None
    if product_info_body is not None:
        product_info = product_info_body.text.strip().replace("This part works with the following products:", "").strip()

    # Customer Reviews Section
    # What is the average product rating?
    # How easy is it to install?
    rating_element = soup.find('div', class_='pd__cust-review__header__rating__chart--border')
    if rating_element is not None:
        average_product_rating = rating_element.get_text(strip=True)
    else:
        average_product_rating = "Average product rating not found"

    # num_reviews is part of the return tuple, so it must always be assigned here.
    # NOTE(review): the 'rating__count' selector comes from a lookup that had been
    # commented out; confirm it still matches the live page markup.
    review_count_element = soup.find('span', class_='rating__count')
    if review_count_element is not None:
        num_reviews = review_count_element.get_text().strip()
    else:
        num_reviews = "Number of reviews not found"

    repair_rating_element = soup.find('div', class_='pd__repair-story__raiting mt-2 mb-4')
    if repair_rating_element is not None:
        average_customer_rating = repair_rating_element.get_text(strip=True).replace("What's this?", "").strip()
        average_repair_rating = average_customer_rating.replace("Average Repair Rating:", "").strip()
    else:
        average_repair_rating = "Average repair rating not found"

    return symptoms_fixed, replacements_list, product_info, average_product_rating, num_reviews, average_repair_rating


In [81]:
# One accumulator per output column; each gains one entry per fridge part.
mpns, symptoms_fixed_list, replacements_list_list = [], [], []
product_info_list, average_product_rating_list = [], []
num_reviews_list, average_customer_rating_list = [], []

df = pd.read_csv("/Users/sarah_prakriti_peters/Documents/Instalily/data/fridge_parts.csv")

# Visit the product page behind every listing row and file each returned
# field under its column.
for link, mpn in zip(df['Product_link'], df['Manufacturer_Part_Number']):
    (
        symptoms_fixed,
        replacements_list,
        product_info,
        average_product_rating,
        num_reviews,
        average_customer_rating,
    ) = pull_product_specs(link, mpn)

    mpns.append(mpn)
    symptoms_fixed_list.append(symptoms_fixed)
    replacements_list_list.append(replacements_list)
    product_info_list.append(product_info)
    average_product_rating_list.append(average_product_rating)
    # num_reviews is unpacked but not stored: num_reviews_list stays empty and
    # the frame below has no review-count column.
    average_customer_rating_list.append(average_customer_rating)

# Column label -> collected values, in the column order of the resulting frame.
data = {
    'Manufacturer_Part_Number': mpns,
    'Symptoms Fixed': symptoms_fixed_list,
    'Replacements List': replacements_list_list,
    'Product Info': product_info_list,
    'Average Product Rating': average_product_rating_list,
    'Average Repair Rating': average_customer_rating_list,
}

fridge_result_df = pd.DataFrame(data)

In [82]:
def qna(link):
    """Scrape the questions-and-answers entries from one product page.

    Args:
        link: Site-relative product path; it is appended to
            "https://www.partselect.com" to form the page URL.

    Returns:
        A 3-tuple ``(questions, answers, model_nos)`` of equal-length lists of
        strings, one entry per Q&A item. Each question has the text
        "This is for model number <model>" appended to it. All three lists are
        empty when the page has no Q&A container, so callers can always unpack
        the result.

    Raises:
        requests.HTTPError: the product page answered with an error status.
    """
    product_url = f"https://www.partselect.com{link}"
    # Bound the wait and refuse to parse an error page as product data.
    product_page = requests.get(product_url, timeout=30)
    product_page.raise_for_status()
    soup = BeautifulSoup(product_page.content, "html.parser")
    questions, answers, model_nos = [], [], []

    questions_and_answers = soup.find('div', id='QuestionsAndAnswersContent')
    if questions_and_answers is None:
        # Return the empty lists rather than falling off the end with None,
        # which would break the caller's three-way unpacking.
        return questions, answers, model_nos

    for question_element in questions_and_answers.find_all('div', class_='qna__question'):
        question_text_element = question_element.find(class_='js-searchKeys')
        if question_text_element is None:
            # Without question text there is nothing useful to record for this entry.
            continue
        question = question_text_element.text.strip()

        model_number_element = question_element.find(class_='bold mt-3 mb-3')
        if model_number_element is not None:
            model_number = model_number_element.text.strip()
            model_number = model_number.replace("For model number", "").strip()
        else:
            model_number = "Model number not found"

        answer_element = question_element.find(class_='qna__ps-answer__msg')
        answer = answer_element.text.strip() if answer_element is not None else "Answer not found"

        # NOTE(review): no separator is inserted between the question and the
        # appended sentence, so the two run together; confirm whether that is intended.
        questions.append(question + "This is for model number " + model_number)
        answers.append(answer)
        model_nos.append(model_number)

    return questions, answers, model_nos

In [83]:
# Gather the Q&A scraped from each fridge product page, one list entry per part.
# `df` is not loaded here; this cell relies on the fridge listing frame read by
# an earlier cell.
mpns, questions_list, answers_list, model_numbers = [], [], [], []

# Call qna() once per listing row and keep its three result lists side by side
# with the part's manufacturer number.
for link, mpn in zip(df['Product_link'], df['Manufacturer_Part_Number']):
    questions, answers, model_nos = qna(link)

    mpns.append(mpn)
    questions_list.append(questions)
    answers_list.append(answers)
    model_numbers.append(model_nos)

# Column label -> collected values for the Q&A frame.
data = {
    'Manufacturer_Part_Number': mpns,
    'Queries': questions_list,
    'Answers': answers_list,
    'Corresponding_model_numbers': model_numbers,
}

fridge_qanda_df = pd.DataFrame(data)

In [84]:
# Re-read the listing CSV, then attach the product-page specs and the Q&A
# columns, joining each time on the manufacturer part number.
fridge_df = pd.read_csv("/Users/sarah_prakriti_peters/Documents/Instalily/data/fridge_parts.csv")
fridge_data = pd.merge(fridge_df, fridge_result_df, on='Manufacturer_Part_Number')
fridge_data = pd.merge(fridge_data, fridge_qanda_df, on='Manufacturer_Part_Number')
fridge_data.shape

(10, 16)

In [85]:
# Persist the merged fridge table. to_csv is called without index=False, so with
# pandas' default settings the row index is written as an extra leading column.
fridge_data.to_csv("/Users/sarah_prakriti_peters/Documents/Instalily/data/fridge_data.csv")

In [86]:
# Flatten the merged fridge CSV into plain text: one output line per CSV row,
# with the row's fields joined by single spaces.
csv_file = "/Users/sarah_prakriti_peters/Documents/Instalily/data/fridge_data.csv"
txt_file = "/Users/sarah_prakriti_peters/Documents/Instalily/data/fridge_data.txt"
# The input is opened first so a missing CSV fails before the text file is truncated.
# newline='' lets the csv module handle line endings inside quoted fields itself, and
# explicit UTF-8 keeps non-ASCII characters intact whatever the platform's default encoding is.
with open(csv_file, "r", newline='', encoding='utf-8') as my_input_file:
    with open(txt_file, "w", encoding='utf-8') as my_output_file:
        # Both files are closed by their `with` blocks; no explicit close() is needed.
        for row in csv.reader(my_input_file):
            my_output_file.write(" ".join(row) + '\n')

# Scraping Dishwasher Information

In [87]:
def get_dishwasher_details():
    """Scrape the "Popular Dishwasher Parts" listing on PartSelect into dishwasher_parts.csv.

    Downloads the dishwasher parts landing page, locates the "Popular
    Dishwasher Parts" heading, and walks the image / detail / price / stock
    elements that follow it in lockstep, printing each image title as it goes.
    A part is kept only when the part number embedded in the image's alt text
    equals the manufacturer part number shown in the part's detail block.

    The kept rows are written to the hard-coded CSV path with the columns
    Part_Number, Manufacturer_Part_Number, Manufacturer, Title, Description,
    Price, Stock, Product_link.

    Returns:
        None. The CSV file is the only result.

    Raises:
        requests.HTTPError: the listing page answered with an error status.
        ValueError: the "Popular Dishwasher Parts" heading is not on the page.
    """
    dishwasher_url = "https://www.partselect.com/Dishwasher-Parts.htm"
    # A timeout keeps the cell from hanging forever on a stalled connection, and
    # raise_for_status stops an error page from being parsed as if it were the listing.
    page = requests.get(dishwasher_url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    popular_parts_section = soup.find('h2', class_='section-title', string='Popular Dishwasher Parts')
    if popular_parts_section is None:
        raise ValueError(f"'Popular Dishwasher Parts' heading not found on {dishwasher_url}")

    image_titles = popular_parts_section.find_all_next('img', alt=True)
    parts = popular_parts_section.find_all_next('div', class_='nf__part__detail')
    price_parts_div = popular_parts_section.find_all_next('div', class_='nf__part__left-col__basic-info__price')
    stock_parts_div = popular_parts_section.find_all_next('div', class_='nf__part__left-col__basic-info__stock')

    # Check if the part number in the title matches the manufacturer number
    pattern = r'Part Number: (\w+)'
    data = []

    # The four result lists are paired purely by position, so the title/manufacturer
    # number comparison below is what rejects pairs that do not belong together.
    for i, (img, part, prices, stocks) in enumerate(zip(image_titles, parts, price_parts_div, stock_parts_div), start=1):
        print(i, ". Image Title:", img['alt'])
        match = re.search(pattern, img['alt'])
        if not match:
            continue

        number_divs = part.find_all('div', class_='nf__part__detail__part-number')
        title_link = part.find('a', class_='nf__part__detail__title')
        markup_complete = (
            len(number_divs) >= 2
            and number_divs[0].strong is not None
            and number_divs[1].strong is not None
            and title_link is not None
            and title_link.get('href')
        )
        if not markup_complete:
            # Skip a part whose detail block lacks the expected elements instead of crashing.
            continue

        part_number = number_divs[0].strong.text
        manufacturer_number = number_divs[1].strong.text
        if match.group(1) != manufacturer_number:
            continue

        detail_title = title_link.text.strip()
        url = title_link['href']

        # The description is the text node sitting directly before the symptoms div.
        explanation_div = part.find('div', class_='nf__part__detail__symptoms')
        preceding_node = explanation_div.previous_sibling if explanation_div is not None else None
        if isinstance(preceding_node, str):
            explanation = preceding_node.strip()
        else:
            explanation = "No explanation available"

        price_div = prices.find('div', class_='mt-sm-2 price')
        price = price_div.get_text(strip=True) if price_div is not None else "Price not available"
        stock_div = stocks.find('div', class_='mb-1 mb-sm-2 js-tooltip')
        stock = stock_div.get_text(strip=True) if stock_div is not None else "Stock not available"

        # The second hyphen-separated token of the product link fills the Manufacturer column.
        url_tokens = url.split('-')
        manufacturer = url_tokens[1] if len(url_tokens) > 1 else "Unknown"

        data.append([part_number, manufacturer_number, manufacturer, detail_title, explanation, price, stock, url])

    with open('/Users/sarah_prakriti_peters/Documents/Instalily/data/dishwasher_parts.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Part_Number', 'Manufacturer_Part_Number', 'Manufacturer', 'Title', 'Description', 'Price', 'Stock','Product_link'])
        writer.writerows(data)

In [88]:
# Run the dishwasher listing scrape: prints every image title it visits and
# (re)writes dishwasher_parts.csv, which the cells below read back.
get_dishwasher_details()

1 . Image Title: Upper Rack Adjuster Kit - White Wheels, Left and Right Sides – Part Number: W10712395
2 . Image Title: Rack Track Stop – Part Number: WP8565925
3 . Image Title: Lower Dishrack Wheel – Part Number: W10195416
4 . Image Title: BELT DRIVE – Part Number: WE03X29897
5 . Image Title: Dishwasher Upper Rack Adjuster – Part Number: WPW10546503
6 . Image Title: Lower Spray Arm – Part Number: 5304517203
7 . Image Title: LOWER RACK ROLLER – Part Number: WD12X26146
8 . Image Title: Refrigerator Water Filter – Part Number: RPWFE
9 . Image Title: LOWER DISHRACK WHEEL ASSEMBLY – Part Number: WPW10195417
10 . Image Title: Refrigerator Ice and Water Filter – Part Number: MWFP


In [89]:
# Load the dishwasher listing CSV; the following cells iterate over its
# Product_link and Manufacturer_Part_Number columns and merge onto this frame.
dishwasher_df = pd.read_csv("/Users/sarah_prakriti_peters/Documents/Instalily/data/dishwasher_parts.csv")

In [90]:
# One accumulator per output column; each gains one entry per dishwasher part.
mpns, symptoms_fixed_list, replacements_list_list = [], [], []
product_info_list, average_product_rating_list = [], []
average_customer_rating_list = []

# Visit the product page behind every listing row and file each returned
# field under its column.
for link, mpn in zip(dishwasher_df['Product_link'], dishwasher_df['Manufacturer_Part_Number']):
    (
        symptoms_fixed,
        replacements_list,
        product_info,
        average_product_rating,
        num_reviews,
        average_customer_rating,
    ) = pull_product_specs(link, mpn)

    mpns.append(mpn)
    symptoms_fixed_list.append(symptoms_fixed)
    replacements_list_list.append(replacements_list)
    product_info_list.append(product_info)
    average_product_rating_list.append(average_product_rating)
    # num_reviews is unpacked but not stored; the frame below has no
    # review-count column.
    average_customer_rating_list.append(average_customer_rating)

# Column label -> collected values, in the column order of the resulting frame.
data = {
    'Manufacturer_Part_Number': mpns,
    'Symptoms Fixed': symptoms_fixed_list,
    'Replacements List': replacements_list_list,
    'Product Info': product_info_list,
    'Average Product Rating': average_product_rating_list,
    'Average Repair Rating': average_customer_rating_list,
}

dishwasher_result_df = pd.DataFrame(data)

In [91]:
# Gather the Q&A scraped from each dishwasher product page, one list entry per part.
mpns, questions_list, answers_list, model_numbers = [], [], [], []

# Call qna() once per listing row and keep its three result lists side by side
# with the part's manufacturer number.
for link, mpn in zip(dishwasher_df['Product_link'], dishwasher_df['Manufacturer_Part_Number']):
    questions, answers, model_nos = qna(link)

    mpns.append(mpn)
    questions_list.append(questions)
    answers_list.append(answers)
    model_numbers.append(model_nos)

# Column label -> collected values for the Q&A frame.
data = {
    'Manufacturer_Part_Number': mpns,
    'Queries': questions_list,
    'Answers': answers_list,
    'Corresponding_model_numbers': model_numbers,
}

qanda_dishwasher_df = pd.DataFrame(data)

In [92]:
# Attach the product-page specs and then the Q&A columns to the dishwasher
# listing, joining each time on the manufacturer part number.
dishwasher_data = pd.merge(dishwasher_df, dishwasher_result_df, on='Manufacturer_Part_Number')
dishwasher_data = pd.merge(dishwasher_data, qanda_dishwasher_df, on='Manufacturer_Part_Number')
dishwasher_data.shape

(10, 16)

In [93]:
# Persist the merged dishwasher table. to_csv is called without index=False, so with
# pandas' default settings the row index is written as an extra leading column.
dishwasher_data.to_csv("/Users/sarah_prakriti_peters/Documents/Instalily/data/dishwasher_data.csv")

In [94]:
# Flatten the merged dishwasher CSV into plain text: one output line per CSV
# row, with the row's fields joined by single spaces.
csv_file = "/Users/sarah_prakriti_peters/Documents/Instalily/data/dishwasher_data.csv"
# NOTE(review): the text file is named dishwasher_parts.txt although it is built
# from dishwasher_data.csv; confirm the name is intended.
txt_file = "/Users/sarah_prakriti_peters/Documents/Instalily/data/dishwasher_parts.txt"
# The input is opened first so a missing CSV fails before the text file is truncated.
# newline='' lets the csv module handle line endings inside quoted fields itself, and
# explicit UTF-8 keeps non-ASCII characters intact whatever the platform's default encoding is.
with open(csv_file, "r", newline='', encoding='utf-8') as my_input_file:
    with open(txt_file, "w", encoding='utf-8') as my_output_file:
        # Both files are closed by their `with` blocks; no explicit close() is needed.
        for row in csv.reader(my_input_file):
            my_output_file.write(" ".join(row) + '\n')