# Titre du projet : Récupération des données des ordinateurs sur Amazon
## Objectif : L'objectif principal est de charger les données des ordinateurs sur Amazon et d'en faire un fichier CSV ou Excel pour une analyse ultérieure des ventes.

## Première étape : Nous avons utilisé bs4 et Selenium WebDriver pour extraire les données des ordinateurs sur Amazon.
## Deuxième étape : Stocker les données extraites des ordinateurs dans un fichier CSV.

* Nous avons créé une classe `AmazonProduct` et des méthodes à l'intérieur de cette classe pour faire le travail.
* Pour ne pas trop solliciter le navigateur, nous avons stocké 8 liens de produits dont nous pouvons extraire les données.
* Il faut cependant retenir que cette liste de liens peut être générée dynamiquement selon nos souhaits, avec le nombre de liens que nous voulons.



#### Merci pour cette opportunité #####

In [17]:
import time
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pprint
import csv


class AmazonProduct:
    """Scrape laptop product pages from Amazon and store the data as CSV.

    The class offers three groups of methods:

    * ``get_urls`` drives a Selenium browser to collect product links from
      the Amazon search results.
    * ``get_product_*`` / ``get_number_*`` / ``get_hard_size`` / ``get_pcu``
      each pull one field out of a parsed product page (a BeautifulSoup
      object). They all return ``None`` when the field cannot be found or
      parsed, so one missing field never aborts a whole scrape.
    * ``get_product_data``, ``save_to_csv`` and ``scrap_url`` download the
      pages, assemble the fields and optionally persist them.
    """

    # Column order shared by the CSV header and by every appended row.
    FIELDNAMES = ("product_url", "product_title", "price", "product_rating",
                  "nb_reviews", "nb_bought", "hard_size", "cpu_model")

    # Seconds to sleep after each browser navigation so the page can load.
    PAGE_LOAD_WAIT = 30

    # ``class`` attribute of the <div> that holds the product overview table.
    _OVERVIEW_CLASS = 'a-section a-spacing-small a-spacing-top-small'

    def __init__(self, output_file="products.csv"):
        """Remember the output path and make sure the CSV file exists.

        Args:
            output_file: path of the CSV file rows are appended to.
        """
        self.output_file = output_file
        self.initialize_csv()

    def initialize_csv(self):
        """Create the CSV file with its header row if it does not exist yet."""
        try:
            # Mode 'x' fails when the file already exists, which keeps an
            # existing file (and its rows) untouched.
            with open(self.output_file, mode='x', newline='', encoding='utf-8') as file:
                csv.writer(file).writerow(self.FIELDNAMES)
        except FileExistsError:
            pass

    def get_urls(self, driver=None, nb_products=100, search_term="ordinateur portable"):
        """Collect product links from the Amazon search results.

        Args:
            driver: a Selenium WebDriver instance; it is always closed with
                ``quit()`` before this method returns or raises.
            nb_products: maximum number of links to collect (default 100).
            search_term: text typed into the Amazon search box.

        Returns:
            list: up to ``nb_products`` distinct link URLs, in the order they
            were found. Fewer are returned when the results run out.

        Raises:
            ValueError: if no driver is supplied.
        """
        if driver is None:
            raise ValueError("get_urls requires a Selenium WebDriver instance")

        product_links = []
        seen = set()  # O(1) duplicate check; the list keeps discovery order
        try:
            driver.get("https://www.amazon.fr")

            search_box = driver.find_element(By.ID, "twotabsearchtextbox")
            search_box.send_keys(search_term)
            search_box.send_keys(Keys.RETURN)
            time.sleep(self.PAGE_LOAD_WAIT)

            while len(product_links) < nb_products:
                for element in driver.find_elements(By.CLASS_NAME, "a-link-normal"):
                    link = element.get_attribute("href")
                    # Anchors without an href yield None; skip them.
                    if not link or link in seen:
                        continue
                    seen.add(link)
                    product_links.append(link)
                    if len(product_links) >= nb_products:
                        break

                if len(product_links) >= nb_products:
                    break

                # By.CLASS_NAME takes a single class name, so the
                # space-separated class list cannot be used there; a CSS
                # selector is used instead.
                # NOTE(review): assumes the enabled "next" control is an <a>
                # carrying class "s-pagination-next" - confirm on the live page.
                next_buttons = driver.find_elements(
                    By.CSS_SELECTOR, "a.s-pagination-next")
                if not next_buttons:
                    # No further page: return what was collected instead of
                    # raising or looping forever.
                    break
                next_buttons[0].click()
                time.sleep(self.PAGE_LOAD_WAIT)
        finally:
            # Close the browser even when a lookup or click fails.
            driver.quit()

        return product_links

    @staticmethod
    def _digits_only(text):
        """Return *text* reduced to its decimal digits (separators dropped)."""
        return "".join(ch for ch in text if ch.isdecimal())

    def _overview_row_text(self, soup, row_class):
        """Return the stripped text of one overview-table row, or None."""
        try:
            container = soup.find('div', {'class': self._OVERVIEW_CLASS})
            if container is None:
                return None
            row = container.find('tr', {'class': row_class})
            return row.get_text().strip()
        except AttributeError:
            # The row is absent (``row`` is None) or soup is not searchable.
            return None

    def get_product_title(self, soup):
        """Return the stripped product title, or None if it is missing."""
        try:
            return soup.find(
                "span", attrs={"id": "productTitle"}).get_text().strip()
        except AttributeError:
            return None

    def get_product_price(self, soup):
        """Return the whole-number part of the price as a float, or None.

        Only the ``a-price-whole`` element is read. Every non-digit character
        (thousands separator, trailing decimal mark, spaces) is dropped
        before conversion so that values such as "1,299." parse correctly.
        """
        try:
            whole = soup.find(
                "span", {"class": "a-price-whole"}).get_text().strip()
            return float(self._digits_only(whole))
        except (AttributeError, ValueError):
            return None

    def get_product_rating(self, soup):
        """Return the average rating as a float, or None if unavailable.

        The rating is the first token of the ``a-icon-alt`` text inside the
        ``averageCustomerReviews`` block; a decimal comma is accepted.
        """
        try:
            div_avg = soup.find("div", {"id": "averageCustomerReviews"})
            if div_avg is None:
                return None
            label = div_avg.find("span", {"class": "a-icon-alt"}).get_text()
            return float(label.split()[0].replace(",", "."))
        except (AttributeError, ValueError, IndexError):
            return None

    def get_number_reviews(self, soup):
        """Return the number of customer reviews as an int, or None.

        All non-digit characters of the ``acrCustomerReviewText`` element are
        ignored, so "1,234 ratings" yields 1234.
        """
        try:
            text = soup.find(
                "span", {"id": "acrCustomerReviewText"}).get_text()
            return int(self._digits_only(text))
        except (AttributeError, ValueError):
            return None

    def get_number_of_bought_per_month(self, soup):
        """Return the first token of the bold social-proof text, or None.

        The value is returned as a string and is not converted to a number
        (the sample output of this scraper shows values such as '10K+').
        """
        try:
            div_bought = soup.find(
                "div", {"class": "a-section a-spacing-micro social-proofing-faceout"})
            if div_bought is None:
                return None
            return div_bought.find(
                "span", {"class": "a-text-bold"}).get_text().split(" ")[0]
        except AttributeError:
            return None

    def get_hard_size(self, soup):
        """Return the text of the hard-disk-size overview row, or None."""
        return self._overview_row_text(
            soup, 'a-spacing-small po-hard_disk.size')

    def get_pcu(self, soup):
        """Return the text of the CPU-model overview row, or None."""
        return self._overview_row_text(
            soup, 'a-spacing-small po-cpu_model.family')

    def get_product_data(self, product_url, timeout=30):
        """Download one product page and extract all fields.

        Args:
            product_url: URL of the product page.
            timeout: seconds to wait for the HTTP response.

        Returns:
            dict | None: a dict keyed by ``FIELDNAMES``, or None when the
            request fails or the response status is not 200.
        """
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(product_url, timeout=timeout)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None

        soup = bs(response.content, 'html.parser')
        return {
            "product_url": product_url,
            "product_title": self.get_product_title(soup),
            "price": self.get_product_price(soup),
            "product_rating": self.get_product_rating(soup),
            "nb_reviews": self.get_number_reviews(soup),
            "nb_bought": self.get_number_of_bought_per_month(soup),
            "hard_size": self.get_hard_size(soup),
            "cpu_model": self.get_pcu(soup),
        }

    def save_to_csv(self, data):
        """Append one product dict to the CSV file; falsy data is ignored."""
        if not data:
            return
        with open(self.output_file, mode='a', newline='', encoding='utf-8') as file:
            csv.DictWriter(file, fieldnames=self.FIELDNAMES).writerow(data)

    def scrap_url(self, product_urls, save=False):
        """Scrape every URL and pretty-print the extracted data.

        Args:
            product_urls: iterable of product page URLs.
            save: when True, each successfully scraped product is also
                appended to the CSV file. Defaults to False (print only).
        """
        for url in product_urls:
            product_data = self.get_product_data(url)
            pprint.pprint(product_data)
            print("==========================================================")
            if save:
                self.save_to_csv(product_data)


In [18]:
# Open each product link and extract the product information.

bot = AmazonProduct()
# The product URLs are hard-coded below; AmazonProduct.get_urls(driver, nb_products) is the method meant to collect them from the search results instead.

# NOTE: the 4th and 7th entries point to the same product (B0B2D77YB8) through different tracking URLs.
product_urls = ["https://www.amazon.com/HP-Micro-edge-Microsoft-14-dq0040nr-Snowflake/dp/B0947BJ67M/ref=sr_1_3?crid=OFZFV8HT9BTC&dib=eyJ2IjoiMSJ9.fIWDkDG_dBxqFi8DwUMyw81lh9hU4SJEEnDBfIlL6FlovTASoD4YLwn98bbXz-t0tu8rUUMVOKtaNk2QmhcbYe_rXjPWo8jYybNDJKhvNnNr0uEoiTiWMgjkUUVEFYU0Li4D-CVj5YC6xFX2EcRcxN3W0r-60gMglfUPQkRcA5KbgS29L3l9QqZ1BNxqLiU9TmXd5as4cRnw1t4sBvCB6ajFoFSH6nuiJYy73JeyIa4.UNNABZ_HBvFytww9ypSquGUwAavaMfftRcKvgvvW8mc&dib_tag=se&keywords=ordinateur&qid=1740869127&sprefix=ordinat%2Caps%2C542&sr=8-3",
                "https://www.amazon.com/acer-Gateway-Chromebook-CBO311-1H-C1MX-802-11ac/dp/B0D33NC672/ref=sr_1_4?crid=OFZFV8HT9BTC&dib=eyJ2IjoiMSJ9.fIWDkDG_dBxqFi8DwUMyw81lh9hU4SJEEnDBfIlL6FlovTASoD4YLwn98bbXz-t0tu8rUUMVOKtaNk2QmhcbYe_rXjPWo8jYybNDJKhvNnNr0uEoiTiWMgjkUUVEFYU0Li4D-CVj5YC6xFX2EcRcxN3W0r-60gMglfUPQkRcA5KbgS29L3l9QqZ1BNxqLiU9TmXd5as4cRnw1t4sBvCB6ajFoFSH6nuiJYy73JeyIa4.UNNABZ_HBvFytww9ypSquGUwAavaMfftRcKvgvvW8mc&dib_tag=se&keywords=ordinateur&qid=1740869127&sprefix=ordinat%2Caps%2C542&sr=8-4&th=1",
                "https://www.amazon.com/Lenovo-IdeaPad-Chromebook-Anti-GlareTouchscreen-MediaTek/dp/B0DGD1699V/ref=sxin_16_pa_sp_search_thematic_sspa?content-id=amzn1.sym.194cd35a-8547-4b5c-b8fa-823869beb7b0%3Aamzn1.sym.194cd35a-8547-4b5c-b8fa-823869beb7b0&crid=OFZFV8HT9BTC&cv_ct_cx=ordinateur&keywords=ordinateur&pd_rd_i=B0DGD1699V&pd_rd_r=8a2faea9-320e-47b0-98fb-7f4760bb7a64&pd_rd_w=S6r7O&pd_rd_wg=81EmG&pf_rd_p=194cd35a-8547-4b5c-b8fa-823869beb7b0&pf_rd_r=10MQFD504TQSZ4QNQ0JS&qid=1740869127&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=ordinat%2Caps%2C542&sr=1-3-2c727eeb-987f-452f-86bd-c2978cc9d8b9-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9zZWFyY2hfdGhlbWF0aWM&th=1",
                "https://www.amazon.com/HP-Students-Business-Quad-Core-Storage/dp/B0B2D77YB8/ref=sxin_16_pa_sp_search_thematic_sspa?content-id=amzn1.sym.194cd35a-8547-4b5c-b8fa-823869beb7b0%3Aamzn1.sym.194cd35a-8547-4b5c-b8fa-823869beb7b0&crid=OFZFV8HT9BTC&cv_ct_cx=ordinateur&keywords=ordinateur&pd_rd_i=B0B2D77YB8&pd_rd_r=8a2faea9-320e-47b0-98fb-7f4760bb7a64&pd_rd_w=S6r7O&pd_rd_wg=81EmG&pf_rd_p=194cd35a-8547-4b5c-b8fa-823869beb7b0&pf_rd_r=10MQFD504TQSZ4QNQ0JS&qid=1740869127&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=ordinat%2Caps%2C542&sr=1-2-2c727eeb-987f-452f-86bd-c2978cc9d8b9-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9zZWFyY2hfdGhlbWF0aWM&th=1",
                "https://www.amazon.com/Lenovo-Ideapad-Chromebook-Graphics-Keyboard/dp/B0D7VM7CJZ/ref=sxin_16_pa_sp_search_thematic_sspa?content-id=amzn1.sym.194cd35a-8547-4b5c-b8fa-823869beb7b0%3Aamzn1.sym.194cd35a-8547-4b5c-b8fa-823869beb7b0&crid=OFZFV8HT9BTC&cv_ct_cx=ordinateur&keywords=ordinateur&pd_rd_i=B0D7VM7CJZ&pd_rd_r=8a2faea9-320e-47b0-98fb-7f4760bb7a64&pd_rd_w=S6r7O&pd_rd_wg=81EmG&pf_rd_p=194cd35a-8547-4b5c-b8fa-823869beb7b0&pf_rd_r=10MQFD504TQSZ4QNQ0JS&qid=1740869127&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=ordinat%2Caps%2C542&sr=1-4-2c727eeb-987f-452f-86bd-c2978cc9d8b9-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9zZWFyY2hfdGhlbWF0aWM&th=1",
                "https://www.amazon.com/HP-Stream-BrightView-N4120-Graphics/dp/B0CZL2SLCJ/ref=sr_1_6?crid=OFZFV8HT9BTC&dib=eyJ2IjoiMSJ9.fIWDkDG_dBxqFi8DwUMyw81lh9hU4SJEEnDBfIlL6FlovTASoD4YLwn98bbXz-t0tu8rUUMVOKtaNk2QmhcbYe_rXjPWo8jYybNDJKhvNnNr0uEoiTiWMgjkUUVEFYU0Li4D-CVj5YC6xFX2EcRcxN3W0r-60gMglfUPQkRcA5KbgS29L3l9QqZ1BNxqLiU9TmXd5as4cRnw1t4sBvCB6ajFoFSH6nuiJYy73JeyIa4.UNNABZ_HBvFytww9ypSquGUwAavaMfftRcKvgvvW8mc&dib_tag=se&keywords=ordinateur&qid=1740869127&sprefix=ordinat%2Caps%2C542&sr=8-6&th=1",
                "https://www.amazon.com/HP-Students-Business-Quad-Core-Storage/dp/B0B2D77YB8/ref=sr_1_7_sspa?crid=OFZFV8HT9BTC&dib=eyJ2IjoiMSJ9.fIWDkDG_dBxqFi8DwUMyw81lh9hU4SJEEnDBfIlL6FlovTASoD4YLwn98bbXz-t0tu8rUUMVOKtaNk2QmhcbYe_rXjPWo8jYybNDJKhvNnNr0uEoiTiWMgjkUUVEFYU0Li4D-CVj5YC6xFX2EcRcxN3W0r-60gMglfUPQkRcA5KbgS29L3l9QqZ1BNxqLiU9TmXd5as4cRnw1t4sBvCB6ajFoFSH6nuiJYy73JeyIa4.UNNABZ_HBvFytww9ypSquGUwAavaMfftRcKvgvvW8mc&dib_tag=se&keywords=ordinateur&qid=1740869127&sprefix=ordinat%2Caps%2C542&sr=8-7-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9tdGY&th=1",
                "https://www.amazon.com/Lenovo-11-6-300e-Chromebook-Touchscreen/dp/B07RJZMC49/ref=sr_1_8?crid=OFZFV8HT9BTC&dib=eyJ2IjoiMSJ9.fIWDkDG_dBxqFi8DwUMyw81lh9hU4SJEEnDBfIlL6FlovTASoD4YLwn98bbXz-t0tu8rUUMVOKtaNk2QmhcbYe_rXjPWo8jYybNDJKhvNnNr0uEoiTiWMgjkUUVEFYU0Li4D-CVj5YC6xFX2EcRcxN3W0r-60gMglfUPQkRcA5KbgS29L3l9QqZ1BNxqLiU9TmXd5as4cRnw1t4sBvCB6ajFoFSH6nuiJYy73JeyIa4.UNNABZ_HBvFytww9ypSquGUwAavaMfftRcKvgvvW8mc&dib_tag=se&keywords=ordinateur&qid=1740869127&sprefix=ordinat%2Caps%2C542&sr=8-8"
                ]

# Scrape every URL in the list; scrap_url pretty-prints one result dict per product.
bot.scrap_url(product_urls)

{'cpu_model': 'CPU Model   Celeron N4020',
 'hard_size': 'Hard Disk Size   64 GB',
 'nb_bought': '10K+',
 'nb_reviews': None,
 'price': 146.0,
 'product_rating': 4.1,
 'product_title': 'HP 14 Laptop, Intel Celeron N4020, 4 GB RAM, 64 GB Storage, '
                  '14-inch Micro-edge HD Display, Windows 11 Home, Thin & '
                  'Portable, 4K Graphics, One Year of Microsoft 365 '
                  '(14-dq0040nr, Snowflake White)',
 'product_url': 'https://www.amazon.com/HP-Micro-edge-Microsoft-14-dq0040nr-Snowflake/dp/B0947BJ67M/ref=sr_1_3?crid=OFZFV8HT9BTC&dib=eyJ2IjoiMSJ9.fIWDkDG_dBxqFi8DwUMyw81lh9hU4SJEEnDBfIlL6FlovTASoD4YLwn98bbXz-t0tu8rUUMVOKtaNk2QmhcbYe_rXjPWo8jYybNDJKhvNnNr0uEoiTiWMgjkUUVEFYU0Li4D-CVj5YC6xFX2EcRcxN3W0r-60gMglfUPQkRcA5KbgS29L3l9QqZ1BNxqLiU9TmXd5as4cRnw1t4sBvCB6ajFoFSH6nuiJYy73JeyIa4.UNNABZ_HBvFytww9ypSquGUwAavaMfftRcKvgvvW8mc&dib_tag=se&keywords=ordinateur&qid=1740869127&sprefix=ordinat%2Caps%2C542&sr=8-3'}
{'cpu_model': 'CPU Model   Celeron',
 'hard_s