**IMPORTING NECESSARY LIBRARIES AND MODULES**

In [1]:
import os
import smtplib
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup

**FUNCTIONS TO PERFORM WEB-SCRAPING FROM THE AMAZON WEBPAGE**

In [2]:
#GETTING PRODUCT TITLES
def get_title(soup):
    """Return the product title found in a parsed product page.

    Parameters
    ----------
    soup : object
        Parsed page exposing a BeautifulSoup-style ``find(name, attrs=...)``
        method.

    Returns
    -------
    str
        Stripped text of the ``<span id="productTitle">`` element, or
        ``"Unknown Product"`` when the page has no such element.
    """
    title_node = soup.find("span", attrs={"id": "productTitle"})
    # Test for the missing element explicitly instead of using a bare
    # ``except:``, which would also swallow KeyboardInterrupt and genuine bugs.
    if title_node is None:
        return "Unknown Product"
    return title_node.text.strip()


In [3]:
#GETTING PRODUCT PRICES
def get_price(soup):
    """Return the product price from a parsed product page.

    The price is read from two spans: ``a-price-whole`` and
    ``a-price-fraction``. The whole part is presumably rendered with
    thousands separators and a trailing decimal point (e.g. ``"1,299."``) and
    the fraction as bare digits (e.g. ``"99"``) -- verify against live pages.

    Parameters
    ----------
    soup : object
        Parsed page exposing a BeautifulSoup-style ``find(name, attrs=...)``
        method.

    Returns
    -------
    float or str
        The price as a float (``19.99`` for ``"19."`` + ``"99"``), or the
        string ``"Unknown Price"`` when either span is missing or its text is
        not made of digits.
    """
    try:
        whole_text = soup.find("span", attrs={"class": "a-price-whole"}).text.strip()
        fraction_text = soup.find("span", attrs={"class": "a-price-fraction"}).text.strip()

        # Drop thousands separators and the trailing decimal point so only
        # the digits of the whole part remain.
        whole_digits = whole_text.replace(",", "").rstrip(".")
        if not (whole_digits.isdigit() and fraction_text.isdigit()):
            raise ValueError("price spans do not contain plain digits")

        # The fraction holds the digits *after* the decimal point, so it must
        # be joined to the whole part, not added to it (19 + 99 != 19.99).
        price = float(whole_digits + "." + fraction_text)
    except (AttributeError, ValueError):
        # AttributeError: a span is missing (find returned None).
        # ValueError: the text could not be interpreted as a number.
        price = "Unknown Price"

    return price

In [4]:
#GETTING PRODUCT RATINGS
def get_rating(soup):
    """Return the product rating from a parsed product page.

    The rating is taken as the first whitespace-separated token of the first
    ``<span class="a-icon-alt">`` element (e.g. ``"4.5"`` from
    ``"4.5 out of 5 stars"``).

    Parameters
    ----------
    soup : object
        Parsed page exposing a BeautifulSoup-style ``find(name, attrs=...)``
        method.

    Returns
    -------
    float or str
        The rating as a float, or the string ``"Unknown Rating"`` when the
        element is missing, empty, or does not start with a number.
    """
    try:
        rating_text = soup.find("span", attrs={"class": "a-icon-alt"}).text.strip()
        rating = float(rating_text.split()[0])
    except (AttributeError, IndexError, ValueError, TypeError):
        # AttributeError: element missing; IndexError: empty text;
        # ValueError/TypeError: first token is not numeric.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        rating = "Unknown Rating"

    return rating

In [5]:
#PERFORMING WEB SCRAPING WITH THE REQUESTS LIBRARY

# Search-results page whose product links are followed below.
URL = 'https://www.amazon.com/s?k=t-shirt&crid=1TZ2ZOV4HJN09&sprefix=%2Caps%2C186&ref=nb_sb_ss_recent_3_0_recent'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}
# requests applies no timeout by default, so a stalled connection would hang
# the notebook forever; bound every request explicitly (seconds).
REQUEST_TIMEOUT = 30

page = requests.get(URL, headers=headers, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(page.content, "html.parser")
links = soup.find_all("a", attrs={'class':'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})

# Anchors without an href yield None, which cannot be appended to the base
# URL below, so keep only real hrefs.
links_list = [href for href in (link.get('href') for link in links) if href]

# Column-oriented results: one entry per successfully fetched product page.
data = {"title":[], "price":[], "rating":[]}

for link in links_list:
    product_links = "https://www.amazon.com" + link
    try:
        product_page = requests.get(product_links, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Best effort: one unreachable product page must not discard the
        # results already collected for the others.
        continue
    product_soup = BeautifulSoup(product_page.content, "html.parser")
    data["title"].append(get_title(product_soup))
    data["price"].append(get_price(product_soup))
    data["rating"].append(get_rating(product_soup))

**LOADING DATA INTO A PANDAS DATAFRAME, REMOVING UNKNOWN PRODUCTS**

In [6]:
# Build the frame from the scraped columns, turn the "Unknown ..." placeholders
# into NaN, drop incomplete products and rank the rest by rating.
# pd.to_numeric(errors="coerce") converts every non-numeric entry to NaN in one
# step, replacing the replace()-then-astype() pair on object columns.
df = (
    pd.DataFrame.from_dict(data)
    .assign(
        rating=lambda frame: pd.to_numeric(frame["rating"], errors="coerce").astype(float),
        price=lambda frame: pd.to_numeric(frame["price"], errors="coerce").astype(float),
    )
    .dropna()
    .sort_values(by=["rating"], ascending=False)
    .reset_index(drop=True)
)
df

Unnamed: 0,title,price,rating


**LOADING DATA INTO A LOCAL POSTGRESQL DATABASE**

In [7]:
#CREATING CONNECTION AND CURSOR TO POSTGRESQL DATABASE
def connect():
    """Open a PostgreSQL connection and return ``(cursor, connection)``.

    Connection settings are read from the standard libpq environment
    variables so that no credentials live in the notebook:

    - ``PGHOST``     (default ``"localhost"``)
    - ``PGDATABASE`` (default ``"database"``)
    - ``PGUSER``     (default ``"postgres"``)
    - ``PGPASSWORD`` (no default)

    Returns
    -------
    tuple
        ``(cur, conn)``: a new cursor and the connection it belongs to. The
        caller is responsible for closing both.

    Raises
    ------
    psycopg2.Error
        Whatever ``psycopg2.connect`` raises when the connection fails.
    """
    conn = psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        database=os.environ.get("PGDATABASE", "database"),
        user=os.environ.get("PGUSER", "postgres"),
        # Never hardcode the password in source. When PGPASSWORD is unset this
        # is None, so no password is passed explicitly and libpq's own lookup
        # (e.g. ~/.pgpass) applies -- confirm for your psycopg2 version.
        password=os.environ.get("PGPASSWORD"),
    )
    cur = conn.cursor()
    return cur, conn


# Module-level cursor/connection for interactive use in later cells.
cur, conn = connect()

In [None]:
#LOADING DATA TO POSTGRESQL DATABASE
def _to_db_float(value):
    """Convert a scraped value to ``float``, or ``None`` (SQL NULL) if it is not numeric."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # NaN is the only float that differs from itself; store it as NULL.
    return None if number != number else number


def load(data):
    """Recreate the ``tshirts`` table and fill it with the scraped products.

    Parameters
    ----------
    data : mapping of str to sequence
        Column-oriented data with ``"title"``, ``"price"`` and ``"rating"``
        entries of equal length; both a dict of lists and a DataFrame work,
        since only ``data[column]`` iteration is used.

    Notes
    -----
    Non-numeric prices or ratings (such as the ``"Unknown ..."``
    placeholders) are stored as NULL. The table is dropped and rebuilt inside
    one transaction, which is rolled back if any statement fails. The cursor
    and connection opened here are always closed.
    """
    # Pair the columns up row by row. Iterating over ``data`` itself would
    # only yield the column names.
    rows = [
        (title, _to_db_float(price), _to_db_float(rating))
        for title, price, rating in zip(data["title"], data["price"], data["rating"])
    ]

    cur, conn = connect()
    try:
        cur.execute("DROP TABLE IF EXISTS tshirts")
        cur.execute("CREATE TABLE tshirts (title TEXT, price FLOAT, rating FLOAT)")
        cur.executemany(
            "INSERT INTO tshirts (title, price, rating) VALUES (%s, %s, %s)",
            rows,
        )
        conn.commit()
    except Exception:
        # Leave the database as it was if anything went wrong, then re-raise.
        conn.rollback()
        raise
    finally:
        cur.close()
        conn.close()
    print("Data loaded successfully")


# Load the cleaned frame: the raw ``data`` dict still contains the
# "Unknown ..." placeholder rows that the cleaning cell removes.
database = load(df)