In [25]:
from bs4 import BeautifulSoup
from datetime import datetime
from sqlalchemy import create_engine
import requests
import psycopg2
import numpy as np
import pandas as pd


#Objective: Friend wants to break into gaming mouse market and wants to feature their line of products onto Newegg. 
#Wants to know the details before making decision
#Wants to have a steady stream of data into database to have firsthand information
#Load onto PostgreSQL database

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
#Requesting url

# URL for page 1 (without the "/Page-{}" suffix)
base_url_page1 = "https://www.newegg.com/global/sg-en/Gaming-Mice/SubCategory/ID-3527"

# Base URL for pages 2 to 6
base_url_other_pages = "https://www.newegg.com/global/sg-en/Gaming-Mice/SubCategory/ID-3527/Page-{}"

#Lists for storing data
name_list = []
price_list = []
rating_list = []
brand_list = []
dpi_list = []
model_number_list = []
hand_orientation_list = []

# Loop through pages 1 to 6
for page_num in range(1, 7):
    if page_num == 1:
        url = base_url_page1
    else:
        url = base_url_other_pages.format(page_num)
    
    response = requests.get(url)
    
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
        
    # Extract product names
    product_names = soup.find_all('a', class_='item-title')
    for name in product_names:
        name_list.append(name.text)

    # Extract prices
    price_containers = soup.find_all('li', class_='price-current')
    for price_container in price_containers:
        main_price = price_container.find('strong').text
        cents = price_container.find('sup').text
        full_price = main_price + cents
        price_list.append(full_price)

    # Extract product ratings. Each rating is in the second position (1st index) in the aria label.
    rating_containers = soup.find_all('a', class_='item-rating')
    for rating_container in rating_containers:
        i_class_tag = rating_container.find('i')
        aria_label = i_class_tag.get('aria-label')
        rating = aria_label.split(' ')[1].strip()
        rating_list.append(rating)

    # Extract brand
    brand_containers = soup.find_all('a', class_='item-brand')
    for brand_container in brand_containers:
        img_src_tag = brand_container.find('img')
        brand = img_src_tag.get('title')
        brand_list.append(brand)

    # Extract maximum dpi
    item_containers = soup.find_all('ul', class_='item-features')
    for item_container in item_containers:
        list_items = item_container.find_all('li') #There are multiple li items in item_features
        for item in list_items:
            if 'Maximum dpi:' in item.text:
                maximum_dpi_text = item.text
                maximum_dpi_number = maximum_dpi_text.split(':')[-1].strip() #Split text by colon :, then take the last part which is the max dpi
                dpi_list.append(maximum_dpi_number)

    # Extract Model #
    for item_container in item_containers:
        list_items = item_container.find_all('li')
        for item in list_items:
            if 'Model #:' in item.text:
                model_number_text = item.text
                model_number = model_number_text.split(':')[-1].strip()
                model_number_list.append(model_number)
            
    # Extract Hand Orientation
    for item_container in item_containers:
        list_items = item_container.find_all('li')
        for item in list_items:
            if 'Hand Orientation:' in item.text:
                hand_orientation_text = item.text
                hand_orientation = hand_orientation_text.split(':')[-1].strip()
                hand_orientation_list.append(hand_orientation)

In [3]:
#rating_list, brand_list, dpi_list, hand_orientation are shorter 
#Some products do not have their respective elements shown, so we will need to assign the missing elements as NA
print(len(name_list))
print(len(price_list))
print(len(rating_list))
print(len(brand_list))
print(len(dpi_list))
print(len(model_number_list))
print(len(hand_orientation_list))

216
216
108
94
130
216
84


In [4]:
#Create empty dicts
rating_dict = {}
brand_dict = {}
dpi_dict = {}
model_number_dict = {}
hand_orientation_dict = {}

#Create tuples made up of (name, rating) / (name, brand) / (name, dpi) / (name, model_number_list) / (name, hand_orientation)
rating_tuple = zip(name_list, rating_list)
brand_tuple = zip(name_list, brand_list)
dpi_tuple = zip(name_list, dpi_list)
model_number_tuple = zip(name_list, model_number_list)
hand_orientation_tuple = zip(name_list, hand_orientation_list)

#Create dicts to match rating/brand/dpi/hand_orientation to the respective product
for name, rating in rating_tuple:
    rating_dict[name] = rating
for name, brand in brand_tuple:
    brand_dict[name] = brand
for name, dpi in dpi_tuple:
    dpi_dict[name] = dpi
for name, model in model_number_tuple:
    model_number_dict[name] = model
for name, hand_orientation in hand_orientation_tuple:
    hand_orientation_dict[name] = hand_orientation

#Nest these dicts in lists again, this time with the respective NAs if the product does not have rating/brand/dpi/hand_orientation
products = []

#Using product names as the unique identifier (i.e. index), loop over each feature for every index in name_list
for i in range(len(name_list)):
    product_dict = {
        'name': name_list[i],
        'price': price_list[i],
        'rating': rating_dict.get(name_list[i], np.nan),  # Get rating or NaN if not found
        'brand': brand_dict.get(name_list[i], np.nan),    # Get brand or NaN if not found
        'dpi': dpi_dict.get(name_list[i], np.nan),
        'model': model_number_list[i],
        'hand_orientation': hand_orientation_dict.get(name_list[i], np.nan)
    }

    #Append the product_dict into an empty product list for loading into pandas later
    products.append(product_dict)


In [5]:
df = pd.DataFrame(products)

#Create a scrape_datetime column. down to seconds only
df['scrape_datetime'] = datetime.now()
df['scrape_datetime'] = pd.to_datetime(df['scrape_datetime']).dt.floor('s')

In [6]:
#For brands with NA, extract from name

#Function: If 'brand' is 'NA', replace it with the first word from 'name' column. Otherwise, return the original brand
def update_brand(row):
    if row['brand'] == 'NA':
       return row['name'].split()[0]
    else:
        return row['brand']
        
#Apply to each row of DataFrame
df['brand'] = df.apply(update_brand, axis=1)

In [7]:
#Checking if there are other keyboards in 'name' column
#Drop ID 187, 188 & 189 since they are keyboards
df[df['name'].str.contains('keyboard|Keyboard|board')]

Unnamed: 0,name,price,rating,brand,dpi,model,hand_orientation,scrape_datetime
187,"ROCCAT Kone AIMO Remastered PC Gaming Mouse, Optical, RGB Backlit Lighting, 23 Programmable Keys, Onboard Memory, Palm Grip, Owl Eye Sensor, Ergonomic, LED Illumination, 16,000 DPI, Black",183.14,,,,ROC-11-820-BK,,2024-09-16 20:44:17


In [8]:
#Rename similar brands to the original spelling
df['brand'] = df['brand'].replace({
    'COOLER': 'Cooler Master',
    'cooler': 'Cooler Master',
    'Cooler': 'Cooler Master',
    'Dell': 'DELL',
    'Roccat': 'ROCCAT',
    'roccat': 'ROCCAT',
    'Asus': 'ASUS',
    'Mad' : 'Mad Catz',
    'MAD' : 'Mad Catz',
    'The' : 'Mad Catz'
})

In [9]:
#Check for null rows
df.isnull().sum()

name                  0
price                 0
rating              108
brand               122
dpi                  86
model                 0
hand_orientation    132
scrape_datetime       0
dtype: int64

In [10]:
#Check for duplicate rows, based on 'name' column only
df[df.duplicated(subset='name')]

Unnamed: 0,name,price,rating,brand,dpi,model,hand_orientation,scrape_datetime
197,"ROCCAT Kone XP PC Gaming Mouse with 3D AIMO RGB Lighting, 19K DPI Optical Sensor, 4D Krystal Scroll Wheel, Multi-Button Design, Wired Computer Mouse, Black, (ROC-11-420-01)",135.92,,,,41802720575664,,2024-09-16 20:44:17


In [11]:
#Checking for duplicates
#id 93 and 134 - exactly the same product. Drop id 134
df[df['name'] == 'swiftpoint penpoint ergonomic mouse & office health software | scientific vertical pen grip | reduce muscle strain, injury, carpal tunnel, tendonitis & tennis elbow | wireless, blu']
#id 144 and 145 - exactly the same product. Drop id 145
df[df['name'] == '3Dconnexion CadMouse Compact - Mouse - ergonomic - optical - 7 buttons - wireless, wired - USB, Bluetooth, 2.4 GHz']
df.drop([134,145])

Unnamed: 0,name,price,rating,brand,dpi,model,hand_orientation,scrape_datetime
0,"Logitech G203 Wired Gaming Mouse, 8,000 DPI, Rainbow Optical Effect LIGHTSYNC RGB, 6 Programmable Buttons, On-Board Memory, Screen Mapping, PC/Mac Computer and Laptop Compatible - Black",48.59,4.3,Logitech,8000 dpi,910-005790,Right Hand,2024-09-16 20:44:17
1,"Logitech G PRO X SUPERLIGHT Wireless Gaming Mouse, Ultra-Lightweight, HERO 25K Sensor, 25,600 DPI, 5 Programmable Buttons, Long Battery Life, Compatible with PC / Mac - Black",180.35,4.4,Logitech,25600 dpi,910-005878,Right Hand,2024-09-16 20:44:17
2,"EVGA X17 Gaming Mouse, Wired, Black, Customizable, 16,000 DPI, 5 Profiles, 10 Buttons, Ergonomic 903-W1-17BK-KR",28.07,3.9,ASUS,16000 dpi,903-W1-17BK-KR,Both Hands,2024-09-16 20:44:17
3,"ASUS ROG Keris II WL Ace (54-gram ergonomic mouse, 42000-dpi, ROG Micro Switch, SpeedNova wireless technology, ROG Polling Rate Booster, 4000 Hz in wireless mode, up to 8000 Hz in wired mode) - Black",179.27,4.2,Corsair,42000 dpi,90MP03N0-BMUA00,Right Hand,2024-09-16 20:44:17
4,"CORSAIR KATAR PRO XT Ultra-Light Gaming Mouse, CH-930C111-NA",52.91,4.3,Logitech,18000 dpi,CH-930C111-NA,Right Hand,2024-09-16 20:44:17
5,"Logitech G903 LIGHTSPEED Wireless Gaming Mouse W/ Hero 25K Sensor, PowerPlay Compatible, 140+ Hour with Rechargeable Battery and Lightsync RGB, Ambidextrous, 107G+10G optional, 25,600 DPI, Black",179.27,3.8,ASUS,12000 dpi,910-005670,Right Hand,2024-09-16 20:44:17
6,"ASUS ROG Keris II WL Ace (54-gram ergonomic mouse, 42000-dpi, ROG Micro Switch, SpeedNova wireless technology, ROG Polling Rate Booster, 4000 Hz in wireless mode, up to 8000 Hz in wired mode) - White",236.51,4.2,Logitech,42000 dpi,90MP03N0-BMUA10,Right Hand,2024-09-16 20:44:17
7,"Logitech G502 Lightspeed Wireless Gaming Mouse with Hero 25K Sensor, PowerPlay Compatible, Tunable Weights and Lightsync RGB - Black",150.11,4.2,Corsair,25600 dpi,910-005565,Right Hand,2024-09-16 20:44:17
8,"Corsair SCIMITAR RGB ELITE CH-9304211-NA Black 17 Buttons 1 x Wheel USB 2.0 Type-A Wired Optical MOBA/MMO Gaming Mouse, Backlit RGB LED",90.71,3.8,ASUS,18000 dpi,CH-9304211-NA,Right Hand,2024-09-16 20:44:17
9,"ASUS ROG Gladius III Wired Gaming Mouse | Tuned 19,000 DPI Sensor, Hot Swappable Push-Fit II Switches, Ergo Shape, ROG Omni Mouse Feet, ROG Paracord and Aura Sync RGB Lighting",74.51,4.0,Logitech,19000 dpi,90MP0270-BMUA00,Both Hands,2024-09-16 20:44:17


In [12]:
#id 167 and 168 - prices, model number are different. However, rename 167 and 168
df[df['name'] == 'ROCCAT Kone XP PC Gaming Mouse with 3D AIMO RGB Lighting, 19K DPI Optical Sensor, 4D Krystal Scroll Wheel, Multi-Button Design, Wired Computer Mouse, Black, (ROC-11-420-01)']
df.loc[167, 'name'] = df.loc[167, 'name'].replace('ROCCAT Kone XP PC Gaming Mouse with 3D AIMO RGB Lighting, 19K DPI Optical Sensor, 4D Krystal Scroll Wheel, Multi-Button Design, Wired Computer Mouse, Black, (ROC-11-420-01)', 'ROCCAT Kone XP PC Gaming Mouse with 3D AIMO RGB Lighting, 19K DPI Optical Sensor, 4D Krystal Scroll Wheel, Multi-Button Design, Wired Computer Mouse, Black, (ROC-11-420-01) (Product 1)')
df.loc[168, 'name'] = df.loc[168, 'name'].replace('ROCCAT Kone XP PC Gaming Mouse with 3D AIMO RGB Lighting, 19K DPI Optical Sensor, 4D Krystal Scroll Wheel, Multi-Button Design, Wired Computer Mouse, Black, (ROC-11-420-01)', 'ROCCAT Kone XP PC Gaming Mouse with 3D AIMO RGB Lighting, 19K DPI Optical Sensor, 4D Krystal Scroll Wheel, Multi-Button Design, Wired Computer Mouse, Black, (ROC-11-420-01) (Product 2)')

In [13]:
#Reset index and remove the old index
df.reset_index(drop=True, inplace=True)

#Create a new id column
df['id'] = range(1, len(df)+1)
df.set_index('id', inplace=True)

In [14]:
#Change all df columns to snake_case format (except datetime column) for easier loading into database

df_columns_to_adjust = df.drop(columns=['scrape_datetime','price','rating','model'])
    
df_columns_to_adjust = df_columns_to_adjust.apply(lambda x: x.str.lower().str.replace(r'[\s\W]+', '_', regex=True))

#Concat the columns back to df, along column axis
df = pd.concat([df_columns_to_adjust, df[['scrape_datetime', 'price', 'rating', 'model']]], axis=1)


In [15]:
#Shift column positions
new_column_order = ['name', 'brand', 'price', 'rating', 'dpi', 'hand_orientation', 'model', 'scrape_datetime']
df_new = df[new_column_order]

In [31]:
#Final DataFrame
df_new

Unnamed: 0_level_0,name,brand,price,rating,dpi,hand_orientation,model,scrape_datetime
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,logitech_g203_wired_gaming_mouse_8_000_dpi_rainbow_optical_effect_lightsync_rgb_6_programmable_buttons_on_board_memory_screen_mapping_pc_mac_computer_and_laptop_compatible_black,logitech,48.59,4.3,8000_dpi,right_hand,910-005790,2024-09-16 20:44:17
2,logitech_g_pro_x_superlight_wireless_gaming_mouse_ultra_lightweight_hero_25k_sensor_25_600_dpi_5_programmable_buttons_long_battery_life_compatible_with_pc_mac_black,logitech,180.35,4.4,25600_dpi,right_hand,910-005878,2024-09-16 20:44:17
3,evga_x17_gaming_mouse_wired_black_customizable_16_000_dpi_5_profiles_10_buttons_ergonomic_903_w1_17bk_kr,asus,28.07,3.9,16000_dpi,both_hands,903-W1-17BK-KR,2024-09-16 20:44:17
4,asus_rog_keris_ii_wl_ace_54_gram_ergonomic_mouse_42000_dpi_rog_micro_switch_speednova_wireless_technology_rog_polling_rate_booster_4000_hz_in_wireless_mode_up_to_8000_hz_in_wired_mode_black,corsair,179.27,4.2,42000_dpi,right_hand,90MP03N0-BMUA00,2024-09-16 20:44:17
5,corsair_katar_pro_xt_ultra_light_gaming_mouse_ch_930c111_na,logitech,52.91,4.3,18000_dpi,right_hand,CH-930C111-NA,2024-09-16 20:44:17
6,logitech_g903_lightspeed_wireless_gaming_mouse_w_hero_25k_sensor_powerplay_compatible_140_hour_with_rechargeable_battery_and_lightsync_rgb_ambidextrous_107g_10g_optional_25_600_dpi_black,asus,179.27,3.8,12000_dpi,right_hand,910-005670,2024-09-16 20:44:17
7,asus_rog_keris_ii_wl_ace_54_gram_ergonomic_mouse_42000_dpi_rog_micro_switch_speednova_wireless_technology_rog_polling_rate_booster_4000_hz_in_wireless_mode_up_to_8000_hz_in_wired_mode_white,logitech,236.51,4.2,42000_dpi,right_hand,90MP03N0-BMUA10,2024-09-16 20:44:17
8,logitech_g502_lightspeed_wireless_gaming_mouse_with_hero_25k_sensor_powerplay_compatible_tunable_weights_and_lightsync_rgb_black,corsair,150.11,4.2,25600_dpi,right_hand,910-005565,2024-09-16 20:44:17
9,corsair_scimitar_rgb_elite_ch_9304211_na_black_17_buttons_1_x_wheel_usb_2_0_type_a_wired_optical_moba_mmo_gaming_mouse_backlit_rgb_led,asus,90.71,3.8,18000_dpi,right_hand,CH-9304211-NA,2024-09-16 20:44:17
10,asus_rog_gladius_iii_wired_gaming_mouse_tuned_19_000_dpi_sensor_hot_swappable_push_fit_ii_switches_ergo_shape_rog_omni_mouse_feet_rog_paracord_and_aura_sync_rgb_lighting,logitech,74.51,4.0,19000_dpi,both_hands,90MP0270-BMUA00,2024-09-16 20:44:17


In [17]:
#Basic info of DataFrame. Do note the scrape_datetime is in datetime64 format
print(df_new.dtypes)

name                        object
brand                       object
price                       object
rating                      object
dpi                         object
hand_orientation            object
model                       object
scrape_datetime     datetime64[us]
dtype: object


In [18]:
#Final table definition using markdown

In [36]:
# Connect to the default `postgres` database
conn = psycopg2.connect(
    dbname='postgres',
    user='postgres',
    password='tiankaik95',
    host='localhost',
    port=5432
)
conn.autocommit = True
cursor = conn.cursor()

# Create the `etl` database first
cursor.execute("CREATE DATABASE etl;")
conn.close()

In [34]:
# PostgreSQL connection string for `etl` database
connection_string = 'postgresql://postgres:tiankaik95@localhost:5432/etl'
db = create_engine(connection_string)

# Connect to the `etl` database using psycopg2
conn1 = psycopg2.connect(
    database="etl",
    user='postgres',
    password='tiankaik95',
    host='localhost',
    port=5432
)
conn1.autocommit = True
cursor = conn1.cursor()

# Table query with constraints added
query_etl = """
    CREATE TABLE IF NOT EXISTS etl (
        id SERIAL PRIMARY KEY,
        name TEXT NOT NULL,
        brand TEXT,
        price REAL CHECK (price >= 0),
        rating REAL CHECK (rating BETWEEN 0 and 5),
        dpi TEXT,
        hand_orientation TEXT,
        model TEXT,
        scrape_datetime TIMESTAMP
    )
"""
cursor.execute(query_etl)
conn1.commit()
conn1.close()

# Append df_new onto the etl table
df_new.to_sql('etl', db, if_exists='append', index=False)

216