In [None]:
#
# # Required Libraries
#

# E
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# T
import pandas as pd
import re

# R
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

In [None]:
#############################################################################################
# Version of this ETL function-definition file; echoed on load so a notebook
# run shows which revision of the helpers is in use.
currentFileVersion = '0.0.1'
print(f"{'*' * 30} ETL functions def file version v{currentFileVersion} {'*' * 30}\n")
#############################################################################################

In [None]:

def extract_phone_details_selenium(url, main_content_selector, brand_selector, model_family_selector, star_rating_selector, rating_value_selector, num_pages):
    """
    Extracts phone details from paginated listing pages using a Selenium-driven Chrome browser.

    For every page number from 1 to ``num_pages`` the page URL is built as
    ``f"{url}&page={page_num}"``, loaded in the browser, and each element matching
    ``main_content_selector`` is turned into one result record. A record is skipped
    when any of its four fields cannot be located, or when the star-rating element
    has no ``aria-label`` attribute.

    Args:
    url (str): The base URL; ``&page=<n>`` is appended for each page.
    main_content_selector (str): The CSS selector for the main content-holding element on the web page.
    brand_selector (str): The CSS selector for the element containing the phone brand name.
    model_family_selector (str): The CSS selector for the element containing the phone model family.
    star_rating_selector (str): The CSS selector for the element whose ``aria-label`` holds the star rating.
    rating_value_selector (str): The CSS selector for the element containing the phone rating value.
    num_pages (int): The number of pages to scrape.

    Returns:
    list: A list of dictionaries (keys "Brand", "Model Family", "Star Rating",
    "Rating Value") collected across all pages. Empty when ``num_pages`` is 0.
    """
    # Imported locally so the function brings its own dependency into scope;
    # find_element signals a missing element with this exception, not KeyError.
    from selenium.common.exceptions import NoSuchElementException

    # Accumulate across ALL pages: initialising inside the page loop would keep
    # only the last page and leave the name unbound when num_pages is 0.
    phone_details_list = []

    # Create a new Chrome browser instance
    driver = webdriver.Chrome()
    try:
        # Let element lookups poll for up to 10 seconds; set once rather than
        # once per page.
        driver.implicitly_wait(10)

        for page_num in range(1, num_pages + 1):
            # construct the URL for the current page
            page_url = f"{url}&page={page_num}"
            print(page_url)

            # Navigate to the page that was just constructed (not the base URL)
            driver.get(page_url)

            # Find all main content-holding elements on the page
            main_contents = driver.find_elements(By.CSS_SELECTOR, main_content_selector)

            # Extract the phone details from each main content-holding element
            for main_content in main_contents:
                try:
                    brand = main_content.find_element(By.CSS_SELECTOR, brand_selector).text
                    model_family = main_content.find_element(By.CSS_SELECTOR, model_family_selector).text
                    star_rating = main_content.find_element(By.CSS_SELECTOR, star_rating_selector).get_attribute("aria-label")
                    rating_value = main_content.find_element(By.CSS_SELECTOR, rating_value_selector).text
                except NoSuchElementException:
                    # Incomplete card: skip it instead of aborting the whole scrape.
                    continue

                # get_attribute yields None when the attribute is absent
                if star_rating is None:
                    continue

                # TODO: upfront / monthly cost extraction is not implemented yet.
                phone_details_list.append({
                    "Brand": brand,
                    "Model Family": model_family,
                    "Star Rating": star_rating,
                    "Rating Value": rating_value
                })
    finally:
        # Always close the browser, even if navigation or extraction raised.
        driver.quit()

    # Return the extracted data as a list of dictionaries
    return phone_details_list



def extract_phone_details(url, main_content_selector, brand_selector, model_family_selector, star_rating_selector, rating_value_selector, num_pages, timeout=30):
    """
    Extracts phone details from paginated listing pages using ``requests`` and BeautifulSoup.

    For every page number from 1 to ``num_pages`` the page URL is built as
    ``f"{url}&page={page_num}"``, fetched, and each element matching
    ``main_content_selector`` is turned into one result record.

    Args:
    url (str): The base URL; ``&page=<n>`` is appended for each page.
    main_content_selector (str): The CSS selector for the main content-holding element on the web page.
    brand_selector (str): The CSS selector for the element containing the phone brand name.
    model_family_selector (str): The CSS selector for the element containing the phone model family.
    star_rating_selector (str): The CSS selector for the element whose ``aria-label`` holds the star rating.
    rating_value_selector (str): The CSS selector for the element containing the phone rating value.
    num_pages (int): The number of pages to scrape.
    timeout (float): Seconds to wait for each HTTP response before giving up (defaults to 30).

    Returns:
    list: A list of dictionaries (keys "Brand", "Model Family", "Star Rating",
    "Rating Value"). "Star Rating" is None when the star-rating element is missing;
    entries lacking a brand, model family or rating value element are skipped.
    """
    phone_details_list = []
    for page_num in range(1, num_pages + 1):
        # construct the URL for the current page
        page_url = f"{url}&page={page_num}"

        # make the HTTP request; without a timeout a stalled server would
        # block the pipeline indefinitely
        response = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # extract the phone details from each main content-holding element
        for main_content in soup.select(main_content_selector):
            brand_elem = main_content.select_one(brand_selector)
            model_family_elem = main_content.select_one(model_family_selector)
            rating_value_elem = main_content.select_one(rating_value_selector)

            # select_one returns None when nothing matches; skip incomplete
            # cards instead of crashing on `.text`
            if brand_elem is None or model_family_elem is None or rating_value_elem is None:
                continue

            # the star rating is optional: keep the record and store None
            star_rating_elem = main_content.select_one(star_rating_selector)
            star_rating = star_rating_elem.get('aria-label') if star_rating_elem is not None else None

            # add the extracted data to the phone details list
            phone_details_list.append({
                "Brand": brand_elem.text.strip(),
                "Model Family": model_family_elem.text.strip(),
                "Star Rating": star_rating,
                "Rating Value": rating_value_elem.text.strip()
            })

    return phone_details_list




https://www.o2.co.uk/shop/phones#sort=content.sorting.featured&page=&page=1
https://www.o2.co.uk/shop/phones#sort=content.sorting.featured&page=&page=2
https://www.o2.co.uk/shop/phones#sort=content.sorting.featured&page=&page=3


NameError: name 'cnx' is not defined

#### Pipeline reporting

In [30]:

def identify_outliers(phone_details_list, max_rating_value=5, max_model_family_length=50, special_chars=None):
    """
    Identifies outliers in the phone details list based on various criteria.

    A record is an outlier when at least one of the following holds:
    its rating value exceeds ``max_rating_value``; its model family text is
    longer than ``max_model_family_length``; its model family text contains any
    character from ``special_chars``. Each outlier is reported once, in input
    order, even when it violates several criteria.

    Args:
    phone_details_list (list): A list of dictionaries containing phone details
        (keys 'Rating Value' and 'Model Family' are read).
    max_rating_value (float): The maximum allowable rating value. Default is 5.
    max_model_family_length (int): The maximum allowable model family length. Default is 50.
    special_chars (list): Characters to check for in the model family text.
        Default is None, meaning no special-character check.

    Returns:
    list: A list of dictionaries containing the outlier phone details.

    Raises:
    ValueError / TypeError: If a 'Rating Value' cannot be converted to float.
    KeyError: If a record lacks 'Rating Value' or 'Model Family'.
    """
    # Build the character class once. The characters are escaped so that
    # regex metacharacters such as '^', '-', ']' or '\' are matched literally
    # instead of changing the meaning of the class (e.g. "[^-]").
    special_char_pattern = None
    if special_chars:
        special_char_pattern = re.compile(f"[{re.escape(''.join(special_chars))}]")

    outlier_phone_details_list = []

    for phone_details in phone_details_list:
        rating_value = float(phone_details['Rating Value'])
        model_family = phone_details['Model Family']

        is_outlier = (
            rating_value > max_rating_value
            or len(model_family) > max_model_family_length
            or (special_char_pattern is not None
                and special_char_pattern.search(model_family) is not None)
        )

        # Append once per record, however many criteria it violates.
        if is_outlier:
            outlier_phone_details_list.append(phone_details)

    return outlier_phone_details_list



def time_extract_phone_details():
    """
    Runs extract_phone_details against the O2 phone shop listing with fixed settings.

    Takes no arguments so it can be handed directly to a timing or benchmarking
    helper. Scrapes 3 pages using the selectors defined below.

    Returns:
    list: The list of phone-detail dictionaries returned by extract_phone_details.
    """
    urlBase = "https://www.o2.co.uk/shop/phones"
    # extract_phone_details appends "&page=<n>" itself, so the base must not
    # already end in "&page=" (that produced "...&page=&page=1").
    # NOTE(review): the paging parameters live in the URL fragment ("#..."),
    # which HTTP clients do not send to the server -- confirm that the
    # requests-based extractor really receives different pages.
    urlSub = "#sort=content.sorting.featured"

    main_content_selector = ".device-info-content"
    brand_selector = '[data-qa-device-title] [data-qa-device-brand]'
    model_family_selector = '[data-qa-device-title] [data-qa-device-modelfamily]'
    star_rating_selector = ".star-rating__stars"
    rating_value_selector = ".device-rating__text [itemprop='ratingValue']"
    num_pages = 3

    return extract_phone_details(
        urlBase + urlSub,
        main_content_selector,
        brand_selector,
        model_family_selector,
        star_rating_selector,
        rating_value_selector,
        num_pages
    )

[{'Brand': 'Apple', 'Model Family': 'iPhone SE 3rd Generation', 'Star Rating': None, 'Rating Value': '4.5357'}, {'Brand': 'Apple', 'Model Family': 'iPhone SE 3rd Generation', 'Star Rating': None, 'Rating Value': '4.5357'}, {'Brand': 'Apple', 'Model Family': 'iPhone SE 3rd Generation', 'Star Rating': None, 'Rating Value': '4.5357'}]


### Represent

In [6]:

def save_data_to_file(filename, data, file_type='csv', file_path='./'):
    """
    Saves the data to a file in the specified file type and file path.

    The target is ``<file_path>/<filename>.<file_type>``. An unsupported
    ``file_type`` prints a message and writes nothing.

    Args:
    filename (str): The name of the file to save, without extension.
    data (list): A list of dictionaries containing the data to save.
    file_type (str): The file type to save the data in: 'csv' or 'json' (defaults to csv).
    file_path (str): The directory to save the file in (defaults to current directory).
    """
    # Local imports: the file's import section does not provide `json`, which
    # made the JSON branch fail with NameError.
    import json
    import os

    # Validate first so "Exporting..." is never printed for a file that will
    # not be written.
    if file_type not in ('csv', 'json'):
        print("Invalid file type. Only CSV and JSON are supported.")
        return

    # os.path.join avoids the doubled separator of "./" + "/" and, for an
    # empty file_path, keeps the target relative instead of "/<filename>".
    target_path = os.path.join(file_path, f"{filename}.{file_type}")

    # Report the path that is actually written.
    print("Exporting data to file on path : {}".format(target_path))

    if file_type == 'csv':
        pd.DataFrame(data).to_csv(target_path, index=False)
    else:
        with open(target_path, 'w', encoding='utf-8') as f:
            json.dump(data, f)




def plot_average_rating_by_brand(phone_details_list):
    """
    Draws and shows a bar chart of rating value per brand.

    The records are loaded into a DataFrame, 'Rating Value' is converted to
    numeric, and seaborn's barplot aggregates the ratings per 'Brand'
    (the chart is titled as an average). Every bar gets its own "Set2"
    colour and a two-decimal value label above it.

    Args:
    phone_details_list (list): A list of dictionaries with at least the keys
        'Brand' and 'Rating Value'.
    """
    # tabular view of the records, with ratings as numbers
    ratings = pd.DataFrame(phone_details_list)
    ratings['Rating Value'] = pd.to_numeric(ratings['Rating Value'])

    # one palette colour per distinct brand, in order of first appearance
    brand_names = ratings['Brand'].unique()
    brand_colors = dict(zip(brand_names, sns.color_palette("Set2", len(brand_names))))

    # bar plot on a wide, short canvas
    figure, axes = plt.subplots(figsize=(10, 3))
    sns.barplot(
        x="Brand",
        y="Rating Value",
        data=ratings,
        errcolor="grey",
        capsize=0.05,
        palette=brand_colors,
        ax=axes,
    )

    # title and axis captions
    axes.set_title("Average Rating Value by Brand")
    axes.set_xlabel("Brand")
    axes.set_ylabel("Rating Value")

    # tilt the brand names so long ones do not overlap
    axes.set_xticklabels(axes.get_xticklabels(), rotation=45, ha="right")

    # write each bar's height just above its top centre
    for bar in axes.patches:
        bar_height = bar.get_height()
        top_centre = (bar.get_x() + bar.get_width() / 2., bar_height)
        axes.annotate(f"{bar_height:.2f}", top_centre,
                      ha='center', va='center', xytext=(0, 10), textcoords='offset points')

    # extra vertical headroom so the value labels stay inside the plot area
    plt.margins(y=0.1)

    # render the figure
    plt.show()



def create_phone_details_table(conn):
    """
    Creates the phone_details table on the given database connection and commits.

    The table has an auto-incrementing integer ``id`` primary key and four TEXT
    columns: ``brand``, ``model_family``, ``star_rating`` and ``rating_value``.

    Args:
    conn: Database connection providing ``cursor()`` and ``commit()``
        (presumably a sqlite3.Connection, e.g. an in-memory database).
    """
    create_table_sql = '''CREATE TABLE phone_details
                 (id INTEGER PRIMARY KEY AUTOINCREMENT,
                  brand TEXT,
                  model_family TEXT,
                  star_rating TEXT,
                  rating_value TEXT)'''
    cursor = conn.cursor()
    cursor.execute(create_table_sql)
    conn.commit()



def insert_phone_details(conn, phone_details_list):
    """
    Writes every phone-detail record into the phone_details table and commits.

    Args:
    conn (sqlite3.Connection): SQLite database connection.
    phone_details_list (list): A list of dictionaries with the keys 'Brand',
        'Model Family', 'Star Rating' and 'Rating Value'.
    """
    insert_sql = "INSERT INTO phone_details (brand, model_family, star_rating, rating_value) VALUES (?, ?, ?, ?)"

    # one parameter tuple per record, in the column order of the statement
    parameter_rows = (
        (entry['Brand'], entry['Model Family'], entry['Star Rating'], entry['Rating Value'])
        for entry in phone_details_list
    )

    cursor = conn.cursor()
    cursor.executemany(insert_sql, parameter_rows)
    conn.commit()