In [2]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time




In [5]:
# Scrape up to five pages of Amazon UK search results for "swimwear" and save
# one row per product (title, URL, price, rating, review count) to CSV.
driver = webdriver.Safari()
product_list = []

# All browser work sits inside try/finally so the Safari session is closed
# exactly once, even if a page fails to load or parse part-way through.
try:
    driver.get('https://www.amazon.co.uk/')

    try:
        search_box = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'twotabsearchtextbox'))
        )
        search_box.send_keys('swimwear')
        search_box.send_keys(Keys.RETURN)
        search_submitted = True
    except Exception as e:
        # Without a submitted search there is nothing to scrape; skip the page
        # loop instead of reading from a browser that has already been closed.
        print("An error occurred: ", e)
        search_submitted = False

    last_page = 5 if search_submitted else 0
    for page in range(1, last_page + 1):
        time.sleep(2)  # fixed pause to give the results page time to render
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        products = soup.find_all('div', {'data-component-type': 's-search-result'})

        for product in products:
            heading = product.h2
            link = heading.a if heading is not None else None
            title = heading.text if heading is not None else 'N/A'
            # .get() avoids a KeyError on an anchor that has no href attribute.
            href = link.get('href') if link is not None else None
            url = 'https://www.amazon.co.uk' + href if href else 'N/A'

            price_tag = product.find('span', 'a-offscreen')
            price = price_tag.text if price_tag is not None else 'N/A'

            rating_tag = product.find('span', 'a-icon-alt')
            rating = rating_tag.text if rating_tag is not None else 'N/A'

            # NOTE(review): this takes the FIRST span carrying the generic
            # 'a-size-base' class; confirm it is always the review count.
            review_tag = product.find('span', {'class': 'a-size-base'})
            review_count = review_tag.text if review_tag is not None else 'N/A'

            product_list.append([title, url, price, rating, review_count])

        if page == last_page:
            break  # nothing left to scrape, so do not navigate any further

        try:
            # Match on the class token rather than the exact class attribute
            # string, so additional classes on the link do not break the lookup.
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.s-pagination-next'))
            )
            next_button.click()
        except Exception as e:
            print("An error occurred: ", e)
            break
finally:
    driver.quit()

df = pd.DataFrame(product_list, columns=['Title', 'URL', 'Price', 'Rating', 'ReviewCount'])
if product_list:
    df.to_csv('amazon_products.csv', index=False)
else:
    # Keep any previous scrape on disk instead of replacing it with an empty file.
    print("No products scraped; amazon_products.csv was not written.")

df.head()


An error occurred:  Message: 



Unnamed: 0,Title,URL,Price,Rating,ReviewCount
0,Zoggs,,£38.98,4.5 out of 5 stars,1040
1,Harry Bear,,£12.99,5.0 out of 5 stars,1
2,Bsrpolry,,£19.99,4.1 out of 5 stars,28
3,iCKER,,£19.99,4.4 out of 5 stars,196
4,DOULAFASS,,£21.99,4.2 out of 5 stars,1148


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load Data
df = pd.read_csv('amazon_products.csv')

# Clean Data
# Raw strings keep the regex backslashes literal instead of relying on Python
# passing unknown escapes (\$, \d) through, which newer interpreters warn about.
df['Price'] = df['Price'].replace(r'[£,\$]', '', regex=True).astype(float)
# expand=False returns a Series, so a single column is assigned to 'Rating'.
df['Rating'] = df['Rating'].str.extract(r'(\d+\.\d+)', expand=False).astype(float)

# Keep only the digits of the review count; anything that still fails to parse
# becomes 0.
# NOTE(review): stripping every non-digit also removes decimal points, so a
# value such as '4.5' would become 45 - confirm the column only holds counts.
df['ReviewCount'] = df['ReviewCount'].replace('[^0-9]', '', regex=True)
df['ReviewCount'] = pd.to_numeric(df['ReviewCount'], errors='coerce').fillna(0).astype(int)

# Placeholder target: seeded random integers, unrelated to the features. An R²
# around or below zero on the test split is therefore the expected outcome.
np.random.seed(42)
df['EstimatedSales'] = np.random.randint(1, 1000, size=len(df))

# Select Features and Target Variable
features = ['Price', 'Rating', 'ReviewCount']
# Fill missing values once, before splitting, so the model is trained on the
# same representation it is later asked to predict on.
X = df[features].fillna(0)
y = df['EstimatedSales']

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out 20%
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R²: {r2}')

# Predictions for every row (training rows included), then persist the model
df['SalesPrediction'] = model.predict(X)
joblib.dump(model, 'sales_estimation_model.pkl')


Mean Squared Error: 144590.01006508333
R²: -0.24420340922117711


['sales_estimation_model.pkl']

In [13]:
# Preview the cleaned data next to the random target and the model's
# predictions; bounded with head() so the cell never dumps the whole frame.
df.head(10)

Unnamed: 0,Title,URL,Price,Rating,ReviewCount,EstimatedSales,SalesPrediction
0,Zoggs,,38.98,4.5,1040,103,672.33
1,Harry Bear,,12.99,5.0,1,436,429.83
2,Bsrpolry,,19.99,4.1,28,861,724.94
3,iCKER,,19.99,4.4,196,271,387.91
4,DOULAFASS,,21.99,4.2,1148,107,195.85
5,CUPSHE,,29.99,4.5,30704,72,373.141
6,GRACE KARIN,,26.99,4.7,82,701,532.83
7,CUPSHE,,29.99,4.3,10145,21,168.06
8,CUPSHE,,29.99,4.4,14332,615,464.987
9,Joweechy,,23.99,4.6,605,122,323.09


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load Data
df = pd.read_csv('amazon_products.csv')

# Clean Data
# Raw strings keep the regex backslashes literal instead of relying on Python
# passing unknown escapes (\$, \d) through, which newer interpreters warn about.
df['Price'] = df['Price'].replace(r'[£,\$]', '', regex=True).astype(float)
# expand=False returns a Series, so a single column is assigned to 'Rating'.
df['Rating'] = df['Rating'].str.extract(r'(\d+\.\d+)', expand=False).astype(float)

# Clean 'ReviewCount' by keeping only the digits; entries that still cannot be
# parsed as a number are set to 0.
df['ReviewCount'] = df['ReviewCount'].replace('[^0-9]', '', regex=True)
df['ReviewCount'] = pd.to_numeric(df['ReviewCount'], errors='coerce').fillna(0).astype(int)

# Generate a dummy 'Estimated Sales' column for demonstration. The values are
# seeded random integers unrelated to the features, so an R² around or below
# zero is the expected outcome regardless of tuning.
np.random.seed(42)
df['EstimatedSales'] = np.random.randint(1, 1000, size=len(df))

# Select Features and Target Variable
features = ['Price', 'Rating', 'ReviewCount']
X = df[features].fillna(0)
y = df['EstimatedSales']

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter Tuning (3 * 4 * 3 * 3 = 108 candidates, 5-fold CV each)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# With the default refit=True, GridSearchCV has already refitted the best
# candidate on all of X_train, so no additional fit() call is needed here.
best_model = grid_search.best_estimator_

# Evaluate on the held-out 20%
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R²: {r2}')

joblib.dump(best_model, 'sales_estimation_model.pkl')


Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Mean Squared Error: 120236.94765624336
R²: -0.03464423386448434


['sales_estimation_model.pkl']

In [16]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from bs4 import BeautifulSoup

# Scrape up to five pages of Amazon UK search results for "swimwear" and save
# one row per product (title, URL, price, rating, review count, units sold)
# to CSV.
driver = webdriver.Safari()
product_list = []

# All browser work sits inside try/finally so the Safari session is closed
# exactly once, even if a page fails to load or parse part-way through.
try:
    driver.get('https://www.amazon.co.uk/')

    try:
        search_box = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'twotabsearchtextbox'))
        )
        search_box.send_keys('swimwear')
        search_box.send_keys(Keys.RETURN)
        search_submitted = True
    except Exception as e:
        # Without a submitted search there is nothing to scrape; skip the page
        # loop instead of reading from a browser that has already been closed.
        print("An error occurred: ", e)
        search_submitted = False

    # Loop through the first 5 result pages
    last_page = 5 if search_submitted else 0
    for page in range(1, last_page + 1):
        time.sleep(2)  # fixed pause to give the results page time to render

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        products = soup.find_all('div', {'data-component-type': 's-search-result'})

        for product in products:
            heading = product.h2
            link = heading.a if heading is not None else None
            title = heading.text if heading is not None else 'N/A'
            # .get() avoids a KeyError on an anchor that has no href attribute.
            href = link.get('href') if link is not None else None
            url = 'https://www.amazon.co.uk' + href if href else 'N/A'

            price_tag = product.find('span', 'a-offscreen')
            price = price_tag.text if price_tag is not None else 'N/A'

            rating_tag = product.find('span', 'a-icon-alt')
            rating = rating_tag.text if rating_tag is not None else 'N/A'

            # NOTE(review): this takes the FIRST span carrying the generic
            # 'a-size-base' class; confirm it is always the review count.
            review_tag = product.find('span', {'class': 'a-size-base'})
            review_count = review_tag.text if review_tag is not None else 'N/A'

            # Check if 'Units Sold' information is available: look for a span
            # whose text mentions "bought in past month" (case-insensitive).
            units_sold_elem = product.find('span', string=lambda text: 'bought in past month' in str(text).lower())
            units_sold = units_sold_elem.text.strip() if units_sold_elem is not None else 'N/A'

            product_list.append([title, url, price, rating, review_count, units_sold])

        if page == last_page:
            break  # nothing left to scrape, so do not navigate any further

        # Run to the next page
        try:
            # Match on the class token rather than the exact class attribute
            # string, so additional classes on the link do not break the lookup.
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.s-pagination-next'))
            )
            next_button.click()
        except Exception as e:
            print("An error occurred: ", e)
            break  # No more pages to navigate
finally:
    driver.quit()

df = pd.DataFrame(product_list, columns=['Title', 'URL', 'Price', 'Rating', 'ReviewCount', 'UnitsSold'])
if product_list:
    df.to_csv('amazon_products.csv', index=False)
else:
    # Keep any previous scrape on disk instead of replacing it with an empty file.
    print("No products scraped; amazon_products.csv was not written.")
df.head()


An error occurred:  Message: 



Unnamed: 0,Title,URL,Price,Rating,ReviewCount,UnitsSold
0,AMAGGIGO,,£22.09,4.3 out of 5 stars,1119,
1,Sam Caan,,£12.99,4.2 out of 5 stars,18,
2,Sixyotie,,£26.98,4.3 out of 5 stars,35,
3,OAMENXI,,£22.94,4.3 out of 5 stars,910,
4,DOULAFASS,,£21.99,4.2 out of 5 stars,1148,
