# Explore Scrapping

In [None]:
from bs4 import BeautifulSoup
import requests
import csv
from pathlib import Path

def scrape_trustpilot_ini(company, output_file_suffix='_reviews.csv'):
    output_file = Path(f"./data/{company}{output_file_suffix}")
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(["company","reviewer", "aTimeDateTime", "aTimeText", "date", "rating", "content"])

def scrape_trustpilot_reviews(company, pages_to_scrape=[2], output_file_suffix='_reviews.csv',headers=None):
    base_url = f"https://www.trustpilot.com/review/{company}"
    output_file = Path(f"./data/{company}{output_file_suffix}")
    
    try:
        response = requests.get(base_url, headers=headers)
        print(response)
        #soup = BeautifulSoup(response.content, 'html.parser')
        

        with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            
            for page in pages_to_scrape:
                print(f"Scraping page {base_url}?page={page}")
                page_response = requests.get(f"{base_url}?page={page}", headers=headers)
                if page_response.status_code == 200:            
        
                    page_soup = BeautifulSoup(page_response.content, 'html.parser')
                    reviews = page_soup.find_all('div', attrs={"data-testid": "service-review-card-v2"})
                    print(f"Page {page}: Found {len(reviews)} reviews")
                    
                    for review in reviews:
                        print (review)
                        spans=review.find_all("span")
                        print (len(spans),spans[0])
                        for span in spans:
                            print(span.text)
                        
                        #REVIEWERNAME 
                        reviewer = review.find("span", attrs={"class": lambda L: L and L.startswith('styles_consumerName__')}).text
                        
                        #TIME SECTION
                        aTimeTag=review.find("time")
                        aTimeText = aTimeTag.text
                        aTimeDateTime = aTimeTag['datetime']

                        #RATING SECTION
                        #div class="styles_reviewHeader__DzoAZ" data-service-review-rating
                        rating = review.find("div", attrs={"class": lambda L: L and L.startswith("styles_reviewHeader__")})["data-service-review-rating"]

                        #CONTENT SECTION
                        content_element = review.find("div", attrs={"class": lambda L: L and L.startswith("styles_reviewContent__")})
                        content = content_element.text if content_element else 'None'

                        #another date to check
                        date = review.find("div", attrs={"data-testid":"review-badge-date"}).text
                        
                        #WRITE A REVIEW RECORD TO CSV
                        csv_writer.writerow([company, reviewer, aTimeDateTime, aTimeText, date, rating, content])
        
        
                else:
                    print(f"‚úñÔ∏è Erreur HTTP {response.status_code}")
                    return []
                    
               
        print("Data Extraction Successful!")
    except Exception as e:
        print("An error occurred:", e, e.__traceback__.tb_lineno)




## UnitTesting ?

In [90]:
# UNITEST THE SCRAPPER WITHOUT ANY AUTHENTICATION we can run max 10 pages without authentication, after that we get blocked by trustpilot
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'accept-language': 'en-US,en;q=0.9'
    } 
scrape_trustpilot_ini("nookmart.com")
#header without authentication, we can run max 10 pages without authentication, after that we get blocked by trustpilot
scrape_trustpilot_reviews("nookmart.com", pages_to_scrape=range(9, 13), headers=headers)

An error occurred: Could not find a suitable TLS CA certificate bundle, invalid path: c:\Users\lione\Documents\GitHub\Supply-Chain-Reviews\.venv\Lib\site-packages\certifi\cacert.pem 17


In [20]:
import pandas as pd
def getReviews(company, output_file_suffix='_reviews.csv'):
    filename = f"{company}{output_file_suffix}"
    df = pd.read_csv(filename, encoding='utf-8',sep=',')
    return df

reviews_df = getReviews("nookmart.com")
display(reviews_df.head(), reviews_df.shape, reviews_df.columns, reviews_df.info())


<class 'pandas.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   reviewer       200 non-null    str  
 1   aTimeDateTime  200 non-null    str  
 2   aTimeText      200 non-null    str  
 3   date           200 non-null    str  
 4   rating         200 non-null    int64
 5   content        200 non-null    str  
dtypes: int64(1), str(5)
memory usage: 9.5 KB


Unnamed: 0,reviewer,aTimeDateTime,aTimeText,date,rating,content
0,Kamye Briggeman,2026-02-07T02:29:51.000Z,35 minutes ago,"February 6, 2026",5,"Quick & easyReasonably pricedFebruary 6, 2026U..."
1,JP,2026-02-07T01:59:38.000Z,An hour ago,"February 6, 2026",5,"Wonderful Experience!Great pricing, super fast..."
2,Emma Walker,2026-02-07T00:29:54.000Z,3 hours ago,"February 6, 2026",5,"Amazing Good item good priceFebruary 6, 2026Un..."
3,Shavena Dejesus,2026-02-06T22:34:34.000Z,5 hours ago,"February 5, 2026",5,Amazing!!Amazing!!! They‚Äôre quick and always a...
4,Sabrina Nunez,2026-02-06T21:48:15.000Z,5 hours ago,"February 6, 2026",5,Fast and politeFast and polite. did everything...


(200, 6)

Index(['reviewer', 'aTimeDateTime', 'aTimeText', 'date', 'rating', 'content'], dtype='str')

None

In [21]:
import os
import webbrowser
from dotenv import load_dotenv
import requests
from http.server import HTTPServer, BaseHTTPRequestHandler
import threading
from urllib.parse import urlparse, parse_qs, unquote  # Ajout pour le d√©codage URL

load_dotenv()

# Load OAuth credentials from .env
CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET")
REDIRECT_URI = os.getenv("GOOGLE_REDIRECT_URI", "http://localhost:8000/callback")

# Store auth code globally
auth_code = None

class CallbackHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        global auth_code

        # M√©thode am√©lior√©e avec d√©codage URL
        parsed_url = urlparse(self.path)
        query_params = parse_qs(parsed_url.query)

        if 'code' in query_params:
            auth_code = unquote(query_params['code'][0])
            self.send_response(200)
            self.send_header('Content-type', 'text/html')
            self.end_headers()
            self.wfile.write(b"<h1>Authorization successful! You can close this window.</h1>")
            print(f"‚úîÔ∏è Authorization code received: {auth_code[:10]}...")
        else:
            self.send_response(400)
            self.send_header('Content-type', 'text/html')
            self.end_headers()
            self.wfile.write(b"<h1>Authorization failed! No code received.</h1>")
    
    def log_message(self, format, *args):
        pass  # Suppress log messages

def get_google_access_token():
    """Get Google access token"""
    global auth_code
    
    # Step 1: Generate authorization URL
    auth_url = f"https://accounts.google.com/o/oauth2/v2/auth?client_id={CLIENT_ID}&redirect_uri={REDIRECT_URI}&response_type=code&scope=https://www.googleapis.com/auth/userinfo.email"
    
    print("Opening browser for authorization...")
    print(f"Authorization URL: {auth_url}")
    webbrowser.open(auth_url)
    
    # Step 2: Start local server to receive callback
    server = HTTPServer(('localhost', 8000), CallbackHandler)
    server_thread = threading.Thread(target=server.handle_request)
    server_thread.daemon = True
    server_thread.start()
    server_thread.join(timeout=60)
    server.server_close()
    
    if not auth_code:
        print("‚úñÔ∏è Authorization failed: No auth code received")
        return None
    
    # Step 3: Exchange auth code for access token
    token_url = "https://oauth2.googleapis.com/token"
    token_data = {
        'code': auth_code,
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'redirect_uri': REDIRECT_URI,
        'grant_type': 'authorization_code'
    }
    
    response = requests.post(token_url, data=token_data)
    
    if response.status_code == 200:
        token_info = response.json()
        access_token = token_info['access_token']
        print(f"‚úîÔ∏è Access token obtained! (expires in {token_info.get('expires_in')} seconds)")
        return access_token
    else:
        print(f"‚úñÔ∏è Token exchange failed: {response.text}")
        return None



In [22]:
# UNITEST OAuth
# Get the access token
access_token = get_google_access_token()

if access_token:
    # Use token for authenticated requests
    headers = {
        'Authorization': f'Bearer {access_token}',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    # Verify credentials by getting user info
    user_info_response = requests.get(
        'https://www.googleapis.com/oauth2/v2/userinfo',
        headers=headers
    )
    
    if user_info_response.status_code == 200:
        user_info = user_info_response.json()
        print(f"‚úîÔ∏è Authenticated as: {user_info.get('email')}")
        print("Ready to scrape the website!")
    else:
        print("‚úñÔ∏è Could not verify access token")
else:
    print("‚úñÔ∏è Failed to obtain access token")

Opening browser for authorization...
Authorization URL: https://accounts.google.com/o/oauth2/v2/auth?client_id=662017523321-1mrc6ri6juhoffk07l8d79g3isanqlac.apps.googleusercontent.com&redirect_uri=http://localhost:8000/callback&response_type=code&scope=https://www.googleapis.com/auth/userinfo.email
‚úñÔ∏è Authorization failed: No auth code received
‚úñÔ∏è Failed to obtain access token


In [None]:
# Web Scraper Trustpilot
from bs4 import BeautifulSoup
import time

def scrape_trustpilot_page(url, headers):
    """
    Scrape une page Trustpilot et extrait les avis
    """
    print(f"\nüîç Scraping: {url}")
    
    try:
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Exemple d'extraction - adapter selon la structure r√©elle de Trustpilot
            reviews = []
            
            # Trouver tous les conteneurs d'avis (les s√©lecteurs peuvent varier)
            review_cards = soup.find_all('article', class_='review')  # √Ä ajuster
            
            if not review_cards:
                # Essayer d'autres s√©lecteurs possibles
                review_cards = soup.find_all('div', attrs={'data-service-review-card-paper': True})
            
            print(f"‚úîÔ∏è Trouv√© {len(review_cards)} avis sur cette page")
            
            for card in review_cards:
                review = {}
                
                # Titre de l'avis
                title_elem = card.find('h2', class_='typography_heading-s')
                review['title'] = title_elem.get_text(strip=True) if title_elem else "N/A"
                
                # Note (√©toiles)
                rating_elem = card.find('div', class_='star-rating')
                if rating_elem and rating_elem.find('img'):
                    alt_text = rating_elem.find('img')['alt']
                    review['rating'] = alt_text
                else:
                    review['rating'] = "N/A"
                
                # Texte de l'avis
                text_elem = card.find('p', class_='typography_body-l')
                review['text'] = text_elem.get_text(strip=True) if text_elem else "N/A"
                
                # Auteur
                author_elem = card.find('span', attrs={'data-consumer-name-typography': True})
                review['author'] = author_elem.get_text(strip=True) if author_elem else "Anonymous"
                
                # Date
                date_elem = card.find('time')
                review['date'] = date_elem.get('datetime') if date_elem else "N/A"
                
                reviews.append(review)
            
            return reviews
        else:
            print(f"‚úñÔ∏è Erreur HTTP {response.status_code}")
            return []
            
    except Exception as e:
        print(f"‚úñÔ∏è Erreur lors du scraping: {e}")
        return []

def scrape_multiple_pages(base_url, start_page, end_page, headers):
    """
    Scrape plusieurs pages de Trustpilot
    """
    all_reviews = []
    
    for page_num in range(start_page, end_page + 1):
        url = f"{base_url}?page={page_num}"
        reviews = scrape_trustpilot_page(url, headers)
        all_reviews.extend(reviews)
        
        # Pause pour √©viter de surcharger le serveur
        if page_num < end_page:
            print(f"‚è≥ Pause de 2 secondes avant la page suivante...")
            time.sleep(2)
    
    return all_reviews

def get_headers(access_token):
    return { 'Authorization': f'Bearer {access_token}',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1' }       

In [30]:
# ======== MAIN MONO PAGE ====================
# Get the access token
access_token = get_google_access_token()

if access_token:
    # Headers pour les requ√™tes authentifi√©es
    headers = get_headers(access_token)
    
    # V√©rifier les credentials
    user_info_response = requests.get('https://www.googleapis.com/oauth2/v2/userinfo', headers=headers)
    
    if user_info_response.status_code == 200:
        user_info = user_info_response.json()
        print(f"‚úîÔ∏è Authenticated as: {user_info.get('email')}")
        print("\n" + "="*50)
        print("üöÄ Ready to scrape Trustpilot!")
        print("="*50)
        
        # Configuration du scraping
        company = "nookmart.com"
        base_url = f"https://www.trustpilot.com/review/{company}"
        
        # Exemple 1 : Scraper une seule page
        print("\nüìÑ Exemple 1: Scraping de la page 11")
        reviews_page_11 = scrape_trustpilot_page(
            f"{base_url}?page=11", 
            headers
        )
        
        # Afficher les r√©sultats
        for i, review in enumerate(reviews_page_11, 1):
            print(f"\n--- Avis {i} ---")
            print(f"Titre: {review['title']}")
            print(f"Note: {review['rating']}")
            print(f"Auteur: {review['author']}")
            print(f"Date: {review['date']}")
            print(f"Texte: {review['text'][:100]}...")
                
    else:
        print("‚úñÔ∏è Could not verify access token")
else:
    print("‚úñÔ∏è Failed to obtain access token")

Opening browser for authorization...
Authorization URL: https://accounts.google.com/o/oauth2/v2/auth?client_id=662017523321-1mrc6ri6juhoffk07l8d79g3isanqlac.apps.googleusercontent.com&redirect_uri=http://localhost:8000/callback&response_type=code&scope=https://www.googleapis.com/auth/userinfo.email
‚úîÔ∏è Authorization code received: 4/0ASc3gC2...
‚úîÔ∏è Access token obtained! (expires in 3599 seconds)
‚úîÔ∏è Authenticated as: lionel.gontier@gmail.com

üöÄ Ready to scrape Trustpilot!

üìÑ Exemple 1: Scraping de la page 11

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=11
‚úîÔ∏è Trouv√© 0 avis sur cette page


In [34]:
# ======== MAIN MULTI PAGE ====================
# Get the access token
access_token = get_google_access_token()

if access_token:
    # Headers pour les requ√™tes authentifi√©es
    headers = get_headers(access_token)
    
    # V√©rifier les credentials
    user_info_response = requests.get('https://www.googleapis.com/oauth2/v2/userinfo', headers=headers)
    
    if user_info_response.status_code == 200:
        user_info = user_info_response.json()
        print(f"‚úîÔ∏è Authenticated as: {user_info.get('email')}")
        print("\n" + "="*50)
        print("üöÄ Ready to scrape Trustpilot!")
        print("="*50)
        
        # Configuration du scraping
        base_url = "https://www.trustpilot.com/review/nookmart.com"
                
        # Exemple 2 : Scraper plusieurs pages (11 √† 13)
        print("\n" + "="*50)
        print("üìö Exemple 2: Scraping des pages 11-13")
        print("="*50)
        all_reviews = scrape_multiple_pages(base_url, 11, 13, headers)
        print(f"\n‚úîÔ∏è Total de {len(all_reviews)} avis r√©cup√©r√©s")
        
        # Sauvegarder dans un fichier CSV (optionnel)
        import csv
        with open('trustpilot_reviews.csv', 'w', newline='', encoding='utf-8') as f:
            if all_reviews:
                writer = csv.DictWriter(f, fieldnames=all_reviews[0].keys())
                writer.writeheader()
                writer.writerows(all_reviews)
                print(f"üíæ Avis sauvegard√©s dans 'trustpilot_reviews.csv'")
        
    else:
        print("‚úñÔ∏è Could not verify access token")
else:
    print("‚úñÔ∏è Failed to obtain access token")

Opening browser for authorization...
Authorization URL: https://accounts.google.com/o/oauth2/v2/auth?client_id=662017523321-1mrc6ri6juhoffk07l8d79g3isanqlac.apps.googleusercontent.com&redirect_uri=http://localhost:8000/callback&response_type=code&scope=https://www.googleapis.com/auth/userinfo.email
‚úîÔ∏è Authorization code received: 4/0ASc3gC1...
‚úîÔ∏è Access token obtained! (expires in 3599 seconds)
‚úîÔ∏è Authenticated as: lionel.gontier@gmail.com

üöÄ Ready to scrape Trustpilot!

üìö Exemple 2: Scraping des pages 11-13

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=11
‚úîÔ∏è Trouv√© 0 avis sur cette page
‚è≥ Pause de 2 secondes avant la page suivante...

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=12
‚úîÔ∏è Trouv√© 0 avis sur cette page
‚è≥ Pause de 2 secondes avant la page suivante...

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=13
‚úîÔ∏è Trouv√© 0 avis sur cette page

‚úîÔ∏è Total de 0 avis r√©cup√©r√©s


In [43]:
# ======== MAIN MUTLI PAGE avec scraper v√©rifi√©/valid√© (les bons tags) ====================
# Get the access token
def scrapTP(company,pageRange):
    access_token = get_google_access_token()

    if access_token:
        # Headers pour les requ√™tes authentifi√©es avec le access_token
        headers = get_headers(access_token)
        
        # V√©rifier les credentials
        user_info_response = requests.get('https://www.googleapis.com/oauth2/v2/userinfo', headers=headers)
        
        if user_info_response.status_code == 200:
            user_info = user_info_response.json()
            print(f"‚úîÔ∏è Authenticated as: {user_info.get('email')}")
            print("\n" + "="*50)
            print("üöÄ Ready to scrape Trustpilot!")
            print("="*50)
            
            # Exemple 1 : Scraper une seule page
            print(f"\nüìÑ Exemple 1: Scraping de {pageRange}")
            scrape_trustpilot_reviews(company,pageRange,headers=headers)
                            
        else:
            print("‚úñÔ∏è Could not verify access token")
    else:
        print("‚úñÔ∏è Failed to obtain access token")

In [49]:
# CALL THE SCRAPPER
scrapTP("nookmart.com",range(10,12))

Opening browser for authorization...
Authorization URL: https://accounts.google.com/o/oauth2/v2/auth?client_id=662017523321-1mrc6ri6juhoffk07l8d79g3isanqlac.apps.googleusercontent.com&redirect_uri=http://localhost:8000/callback&response_type=code&scope=https://www.googleapis.com/auth/userinfo.email
‚úîÔ∏è Authorization code received: 4/0ASc3gC3...
‚úîÔ∏è Access token obtained! (expires in 3599 seconds)
‚úîÔ∏è Authenticated as: lionel.gontier@gmail.com

üöÄ Ready to scrape Trustpilot!

üìÑ Exemple 1: Scraping de range(10, 12)
<Response [200]>
Scraping page https://www.trustpilot.com/review/nookmart.com?page=10
Page 10: Found 20 reviews
<div data-testid="service-review-card-v2"><div class="styles_reviewCardInnerHeader__8Xqy8"><aside aria-label="Info for Sophie" class="styles_consumerInfoWrapper__6HN5O"><div class="styles_consumerDetailsWrapper__4eZod"><div class="CDS_Avatar_avatar__dd7fc3 CDS_Avatar_yellow__dd7fc3" data-testid="consumer-avatar" style="width:44px;min-width:44px;height:4

In [53]:
import os
import webbrowser
from dotenv import load_dotenv
import requests
from http.server import HTTPServer, BaseHTTPRequestHandler
import threading
from urllib.parse import urlparse, parse_qs, unquote
from bs4 import BeautifulSoup
import time

load_dotenv()

# Load OAuth credentials from .env
CLIENT_ID = os.getenv("GOOGLE_CLIENT_ID")
CLIENT_SECRET = os.getenv("GOOGLE_CLIENT_SECRET")
REDIRECT_URI = os.getenv("GOOGLE_REDIRECT_URI", "http://localhost:8000/callback")

# Store auth code globally
auth_code = None

class CallbackHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        global auth_code
        
        parsed_url = urlparse(self.path)
        query_params = parse_qs(parsed_url.query)
        
        if 'code' in query_params:
            auth_code = unquote(query_params['code'][0])
            
            self.send_response(200)
            self.send_header('Content-type', 'text/html')
            self.end_headers()
            self.wfile.write(b"<h1>Authorization successful! You can close this window.</h1>")
            print(f"‚úì Authorization code received: {auth_code[:10]}...")
        else:
            self.send_response(400)
            self.send_header('Content-type', 'text/html')
            self.end_headers()
            self.wfile.write(b"<h1>Authorization failed! No code received.</h1>")
    
    def log_message(self, format, *args):
        pass

def get_google_access_token():
    """Get Google access token"""
    global auth_code
    
    auth_url = f"https://accounts.google.com/o/oauth2/v2/auth?client_id={CLIENT_ID}&redirect_uri={REDIRECT_URI}&response_type=code&scope=https://www.googleapis.com/auth/userinfo.email"
    
    print("Opening browser for authorization...")
    webbrowser.open(auth_url)
    
    server = HTTPServer(('localhost', 8000), CallbackHandler)
    server_thread = threading.Thread(target=server.handle_request)
    server_thread.daemon = True
    server_thread.start()
    server_thread.join(timeout=60)
    server.server_close()
    
    if not auth_code:
        print("‚ùå Authorization failed: No auth code received")
        return None
    
    token_url = "https://oauth2.googleapis.com/token"
    token_data = {
        'code': auth_code,
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'redirect_uri': REDIRECT_URI,
        'grant_type': 'authorization_code'
    }
    
    response = requests.post(token_url, data=token_data)
    
    if response.status_code == 200:
        token_info = response.json()
        access_token = token_info['access_token']
        print(f"‚úì Access token obtained! (expires in {token_info.get('expires_in')} seconds)")
        return access_token
    else:
        print(f"‚ùå Token exchange failed: {response.text}")
        return None

def scrape_trustpilot_page(url, headers):
    """
    Scrape une page Trustpilot et extrait les avis
    """
    print(f"\nüîç Scraping: {url}")
    
    try:
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Exemple d'extraction - adapter selon la structure r√©elle de Trustpilot
            reviews = []
            
            # Trouver tous les conteneurs d'avis (les s√©lecteurs peuvent varier)
            review_cards = soup.find_all('article', class_='review')  # √Ä ajuster
            
            if not review_cards:
                # Essayer d'autres s√©lecteurs possibles
                review_cards = soup.find_all('div', attrs={"data-testid": "service-review-card-v2"})
            
            print(f"‚úì Trouv√© {len(review_cards)} avis sur cette page")
            
            for card in review_cards:
                review = {}
                
                # Titre de l'avis
                title_elem = card.find('h2', class_='typography_heading-s')
                review['title'] = title_elem.get_text(strip=True) if title_elem else "N/A"
                
                # Note (√©toiles)
                rating_elem = card.find('div', class_='star-rating')
                if rating_elem and rating_elem.find('img'):
                    alt_text = rating_elem.find('img')['alt']
                    review['rating'] = alt_text
                else:
                    review['rating'] = "N/A"
                
                # Texte de l'avis
                text_elem = card.find('p', class_='typography_body-l')
                review['text'] = text_elem.get_text(strip=True) if text_elem else "N/A"
                
                # Auteur
                author_elem = card.find('span', attrs={'data-consumer-name-typography': True})
                review['author'] = author_elem.get_text(strip=True) if author_elem else "Anonymous"
                
                # Date
                date_elem = card.find('time')
                review['date'] = date_elem.get('datetime') if date_elem else "N/A"
                
                reviews.append(review)
            
            return reviews
        else:
            print(f"‚ùå Erreur HTTP {response.status_code}")
            return []
            
    except Exception as e:
        print(f"‚ùå Erreur lors du scraping: {e}")
        return []

def scrape_multiple_pages(base_url, start_page, end_page, headers):
    """
    Scrape plusieurs pages de Trustpilot
    """
    all_reviews = []
    
    for page_num in range(start_page, end_page + 1):
        url = f"{base_url}?page={page_num}"
        reviews = scrape_trustpilot_page(url, headers)
        all_reviews.extend(reviews)
        
        # Pause pour √©viter de surcharger le serveur
        if page_num < end_page:
            print(f"‚è≥ Pause de 2 secondes avant la page suivante...")
            time.sleep(2)
    
    return all_reviews

# ==================== MAIN ====================

# Get the access token
access_token = get_google_access_token()

if access_token:
    # Headers pour les requ√™tes authentifi√©es
    headers = {
        'Authorization': f'Bearer {access_token}',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
    
    # V√©rifier les credentials
    user_info_response = requests.get(
        'https://www.googleapis.com/oauth2/v2/userinfo',
        headers=headers
    )
    
    if user_info_response.status_code == 200:
        user_info = user_info_response.json()
        print(f"‚úÖ Authenticated as: {user_info.get('email')}")
        print("\n" + "="*50)
        print("üöÄ Ready to scrape Trustpilot!")
        print("="*50)
        
        # Configuration du scraping
        base_url = "https://www.trustpilot.com/review/nookmart.com"
        
        # Exemple 1 : Scraper une seule page
        print("\nüìÑ Exemple 1: Scraping de la page 11")
        reviews_page_11 = scrape_trustpilot_page(
            f"{base_url}?page=11", 
            headers
        )
        
        # Afficher les r√©sultats
        for i, review in enumerate(reviews_page_11, 1):
            print(f"\n--- Avis {i} ---")
            print(f"Titre: {review['title']}")
            print(f"Note: {review['rating']}")
            print(f"Auteur: {review['author']}")
            print(f"Date: {review['date']}")
            print(f"Texte: {review['text'][:100]}...")
        
        # Exemple 2 : Scraper plusieurs pages (11 √† 13)
        print("\n" + "="*50)
        print("üìö Exemple 2: Scraping des pages 11-13")
        print("="*50)
        all_reviews = scrape_multiple_pages(base_url, 9, 13, headers)
        print(f"\n‚úÖ Total de {len(all_reviews)} avis r√©cup√©r√©s")
        
        # Sauvegarder dans un fichier CSV (optionnel)
        import csv
        with open('trustpilot_reviews.csv', 'w', newline='', encoding='utf-8') as f:
            if all_reviews:
                writer = csv.DictWriter(f, fieldnames=all_reviews[0].keys())
                writer.writeheader()
                writer.writerows(all_reviews)
                print(f"üíæ Avis sauvegard√©s dans 'trustpilot_reviews.csv'")
        
    else:
        print("‚ùå Could not verify access token")
else:
    print("‚ùå Failed to obtain access token")

Opening browser for authorization...
‚úì Authorization code received: 4/0ASc3gC3...
‚úì Access token obtained! (expires in 3599 seconds)
‚úÖ Authenticated as: lionel.gontier@gmail.com

üöÄ Ready to scrape Trustpilot!

üìÑ Exemple 1: Scraping de la page 11

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=11
‚úì Trouv√© 0 avis sur cette page

üìö Exemple 2: Scraping des pages 11-13

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=9
‚úì Trouv√© 20 avis sur cette page
‚è≥ Pause de 2 secondes avant la page suivante...

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=10
‚úì Trouv√© 20 avis sur cette page
‚è≥ Pause de 2 secondes avant la page suivante...

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=11
‚úì Trouv√© 0 avis sur cette page
‚è≥ Pause de 2 secondes avant la page suivante...

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=12
‚úì Trouv√© 0 avis sur cette page
‚è≥ Pause de 2 secondes 

In [57]:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import csv

def get_chrome_profile_path():
    """
    Obtenir le chemin du profil Chrome par d√©faut
    """
    # Chemin Windows
    user_profile = os.environ.get('USERPROFILE')
    chrome_profile = os.path.join(user_profile, 'AppData', 'Local', 'Google', 'Chrome', 'User Data')
    return chrome_profile

def scrape_trustpilot_with_existing_session():
    """
    Utiliser une session Chrome d√©j√† connect√©e
    """
    chrome_options = Options()
    
    # OPTION A: Utiliser votre profil Chrome principal
    chrome_profile_path = get_chrome_profile_path()
    chrome_options.add_argument(f"user-data-dir={chrome_profile_path}")
    chrome_options.add_argument("profile-directory=Default")  # Ou "Profile 1", "Profile 2", etc.
    
    # Options suppl√©mentaires
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--start-maximized")
    
    # ‚ö†Ô∏è IMPORTANT: Fermez TOUS les navigateurs Chrome avant d'ex√©cuter ce script
    # Sinon vous aurez une erreur car Chrome est d√©j√† en cours d'utilisation
    
    print("üöÄ D√©marrage de Chrome avec votre profil existant...")
    print("‚ö†Ô∏è Assurez-vous d'avoir ferm√© tous les autres Chrome d'abord!")
    
    driver = webdriver.Chrome(options=chrome_options)
    
    try:
        # Aller directement sur Trustpilot
        print("\nüåê Navigation vers Trustpilot...")
        driver.get("https://www.trustpilot.com/review/nookmart.com?page=10")
        
        print("\nüìã V√©rifiez que vous √™tes bien connect√© dans le navigateur")
        print("   Si vous n'√™tes pas connect√©, connectez-vous maintenant (60 secondes)...")
        time.sleep(60)
        
        # Scraper les pages
        base_url = "https://www.trustpilot.com/review/nookmart.com"
        all_reviews = scrape_multiple_pages(driver, base_url, 10, 12)
        
        print(f"\n‚úÖ Total: {len(all_reviews)} avis r√©cup√©r√©s")
        
        # Sauvegarder
        if all_reviews:
            save_to_csv(all_reviews, 'trustpilot_reviews.csv')
        
        print("\n‚è≥ Navigateur reste ouvert 10 secondes...")
        time.sleep(10)
        
    finally:
        driver.quit()
        print("üëã Navigateur ferm√©")

def scrape_multiple_pages(driver, base_url, start_page, end_page):
    """Scraper plusieurs pages"""
    all_reviews = []
    
    for page_num in range(start_page, end_page + 1):
        url = f"{base_url}?page={page_num}"
        reviews = scrape_page(driver, url)
        all_reviews.extend(reviews)
        
        if page_num < end_page:
            time.sleep(3)
    
    return all_reviews

def scrape_page(driver, url):
    """Scraper une page"""
    print(f"\nüîç Scraping: {url}")
    driver.get(url)
    time.sleep(4)
    
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Chercher les avis
    review_cards = soup.find_all('article')
    print(f"   ‚úì Trouv√© {len(review_cards)} √©l√©ments")
    
    reviews = []
    for card in review_cards:
        review = extract_review_data(card)
        if review['title'] != "N/A":  # Filtrer les √©l√©ments vides
            reviews.append(review)
    
    print(f"   ‚úì {len(reviews)} avis valides extraits")
    return reviews

def extract_review_data(card):
    """Extraire les donn√©es d'un avis"""
    review = {}
    
    # Titre
    title_elem = card.find('h2')
    review['title'] = title_elem.get_text(strip=True) if title_elem else "N/A"
    
    # Note
    rating_elem = card.find('div', class_=lambda x: x and 'star-rating' in x.lower())
    if rating_elem:
        img = rating_elem.find('img')
        review['rating'] = img.get('alt', 'N/A') if img else "N/A"
    else:
        review['rating'] = "N/A"
    
    # Texte
    text_elem = card.find('p', class_=lambda x: x and 'body' in x.lower())
    if not text_elem:
        paragraphs = card.find_all('p')
        text_elem = paragraphs[0] if paragraphs else None
    review['text'] = text_elem.get_text(strip=True) if text_elem else "N/A"
    
    # Auteur
    author_elem = card.find('span', attrs={'data-consumer-name-typography': True})
    if not author_elem:
        author_elem = card.find('span', class_=lambda x: x and 'heading' in x.lower())
    review['author'] = author_elem.get_text(strip=True) if author_elem else "Anonymous"
    
    # Date
    date_elem = card.find('time')
    review['date'] = date_elem.get('datetime', 'N/A') if date_elem else "N/A"
    
    return review

def save_to_csv(reviews, filename):
    """Sauvegarder en CSV"""
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=reviews[0].keys())
        writer.writeheader()
        writer.writerows(reviews)
    print(f"üíæ {len(reviews)} avis sauvegard√©s dans '{filename}'")

if __name__ == "__main__":
    scrape_trustpilot_with_existing_session()

üöÄ D√©marrage de Chrome avec votre profil existant...
‚ö†Ô∏è Assurez-vous d'avoir ferm√© tous les autres Chrome d'abord!


SessionNotCreatedException: Message: session not created: Chrome failed to start: crashed.
  (session not created: DevToolsActivePort file doesn't exist)
  (The process started from chrome location C:\Program Files\Google\Chrome\Application\chrome.exe is no longer running, so ChromeDriver is assuming that Chrome has crashed.); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff6dadcf3d5
	0x7ff6dadcf430
	0x7ff6dab710bd
	0x7ff6dabb1ccb
	0x7ff6dabacf99
	0x7ff6dac04091
	0x7ff6dac038f6
	0x7ff6dabbcb0c
	0x7ff6dabbda53
	0x7ff6db0ab470
	0x7ff6db0a586d
	0x7ff6db0c621a
	0x7ff6dadeb235
	0x7ff6dadf3a5c
	0x7ff6dadd8844
	0x7ff6dadd89f6
	0x7ff6dadbeb87
	0x7ffe04f5e8d7
	0x7ffe0666c53c


In [58]:
# ==================== MAIN SANS AUTHENTIFICATION (JSON EMBARQU√â) ====================
import requests
from bs4 import BeautifulSoup
import json
import csv
import time

def scrape_trustpilot_reviews(company_url, page_number):
    """
    Scraper Trustpilot SANS authentification en utilisant le JSON embarqu√©
    """
    url = f"{company_url}?page={page_number}"
    
    print(f"üîç Scraping: {url}")
    
    # Headers pour simuler un navigateur
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
    
    try:
        # Requ√™te HTTP simple
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code != 200:
            print(f"‚ùå Erreur HTTP {response.status_code}")
            return []
        
        # Parser le HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # üéØ Trouver le script contenant les donn√©es JSON
        script_tag = soup.find('script', id='__NEXT_DATA__', type='application/json')
        
        if not script_tag:
            print("‚ùå Script __NEXT_DATA__ non trouv√©!")
            # Sauvegarder pour debug
            with open(f'debug_page_{page_number}.html', 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"   HTML sauvegard√© dans debug_page_{page_number}.html")
            return []
        
        # Parser le JSON
        json_data = json.loads(script_tag.string)
        
        # Naviguer dans la structure JSON pour trouver les avis
        # Structure: props -> pageProps -> reviews
        try:
            page_props = json_data['props']['pageProps']
            reviews = page_props.get('reviews', [])
            
            print(f"   ‚úì Trouv√© {len(reviews)} avis dans le JSON")
            
            # Extraire les donn√©es de chaque avis
            extracted_reviews = []
            for review in reviews:
                review_data = {
                    'title': review.get('title', 'N/A'),
                    'text': review.get('text', 'N/A'),
                    'rating': review.get('rating', 'N/A'),
                    'author': review.get('consumer', {}).get('displayName', 'Anonymous'),
                    'date': review.get('dates', {}).get('publishedDate', 'N/A'),
                    'verified': review.get('isVerified', False),
                    'location': review.get('consumer', {}).get('countryCode', 'N/A')
                }
                extracted_reviews.append(review_data)
            
            return extracted_reviews
            
        except KeyError as e:
            print(f"‚ùå Structure JSON inattendue: {e}")
            # Sauvegarder le JSON pour inspection
            with open(f'debug_json_{page_number}.json', 'w', encoding='utf-8') as f:
                json.dump(json_data, f, indent=2)
            print(f"   JSON sauvegard√© dans debug_json_{page_number}.json")
            return []
        
    except requests.RequestException as e:
        print(f"‚ùå Erreur de connexion: {e}")
        return []
    except json.JSONDecodeError as e:
        print(f"‚ùå Erreur de parsing JSON: {e}")
        return []

def scrape_multiple_pages(company_url, start_page, end_page):
    """
    Scraper plusieurs pages
    """
    all_reviews = []
    
    for page_num in range(start_page, end_page + 1):
        reviews = scrape_trustpilot_reviews(company_url, page_num)
        all_reviews.extend(reviews)
        
        print(f"   ‚úì {len(reviews)} avis extraits de la page {page_num}")
        
        # Pause pour √™tre poli avec le serveur
        if page_num < end_page:
            print(f"   ‚è≥ Pause 2 secondes...")
            time.sleep(2)
    
    return all_reviews

def save_to_csv(reviews, filename):
    """
    Sauvegarder les avis en CSV
    """
    if not reviews:
        print("‚ö†Ô∏è Aucun avis √† sauvegarder")
        return
    
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=reviews[0].keys())
        writer.writeheader()
        writer.writerows(reviews)
    
    print(f"üíæ {len(reviews)} avis sauvegard√©s dans '{filename}'")

# ==================== MAIN ====================

if __name__ == "__main__":
    print("="*60)
    print("üöÄ TRUSTPILOT SCRAPER - SANS AUTHENTIFICATION")
    print("="*60)
    
    # Configuration
    company_url = "https://www.trustpilot.com/review/nookmart.com"
    
    # Test sur une seule page d'abord
    print("\nüìÑ Test: Page 10")
    test_reviews = scrape_trustpilot_reviews(company_url, 10)
    
    if test_reviews:
        print("\n‚úÖ Test r√©ussi! Voici les 3 premiers avis:")
        for i, review in enumerate(test_reviews[:3], 1):
            print(f"\n--- Avis {i} ---")
            print(f"Titre: {review['title']}")
            print(f"Note: {review['rating']}/5")
            print(f"Auteur: {review['author']} ({review['location']})")
            print(f"V√©rifi√©: {review['verified']}")
            print(f"Date: {review['date']}")
            print(f"Texte: {review['text'][:100]}...")
        
        # Si le test fonctionne, scraper plusieurs pages
        print("\n" + "="*60)
        print("üìö Scraping des pages 10-12")
        print("="*60)
        
        all_reviews = scrape_multiple_pages(company_url, 10, 12)
        
        print(f"\n‚úÖ Total: {len(all_reviews)} avis r√©cup√©r√©s")
        
        # Sauvegarder
        save_to_csv(all_reviews, 'trustpilot_reviews.csv')
        
        # Statistiques
        if all_reviews:
            avg_rating = sum(r['rating'] for r in all_reviews) / len(all_reviews)
            verified_count = sum(1 for r in all_reviews if r['verified'])
            print(f"\nüìä Statistiques:")
            print(f"   Note moyenne: {avg_rating:.2f}/5")
            print(f"   Avis v√©rifi√©s: {verified_count}/{len(all_reviews)}")
    else:
        print("\n‚ùå Le test a √©chou√©. V√©rifiez les fichiers de debug.")

üöÄ TRUSTPILOT SCRAPER - SANS AUTHENTIFICATION

üìÑ Test: Page 10
üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=10
   ‚úì Trouv√© 20 avis dans le JSON

‚úÖ Test r√©ussi! Voici les 3 premiers avis:

--- Avis 1 ---
Titre: Highly recommend 
Note: 5/5
Auteur: Sophie (GB)
V√©rifi√©: False
Date: 2026-01-28T16:50:14.000Z
Texte: Always very quick delivery and friendly. Would highly recommend this site for purchasing animal cros...

--- Avis 2 ---
Titre: Delivery was very fast 
Note: 5/5
Auteur: Consumer (AU)
V√©rifi√©: False
Date: 2026-01-28T13:24:22.000Z
Texte: Delivery was very fast. Had an estimate of 15 minutes and it was less then 5. 10/10 will use this se...

--- Avis 3 ---
Titre: Great fast service and superb customer‚Ä¶
Note: 5/5
Auteur: Popeyebear (GB)
V√©rifi√©: False
Date: 2026-01-28T12:19:13.000Z
Texte: Great fast service and superb customer support ...

üìö Scraping des pages 10-12
üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=10
   ‚úì

In [61]:

reviews = scrape_trustpilot_reviews("https://www.trustpilot.com/review/nookmart.com", 9)

üîç Scraping: https://www.trustpilot.com/review/nookmart.com?page=9
   ‚úì Trouv√© 20 avis dans le JSON


In [67]:
from datasets import load_dataset
import os

os.environ["WANDB_MODE"]="offline"
df = load_dataset('csv', data_files={'train': "/kaggle/input/trustpilot-reviews-123k/trustpilot_reviews_2005.csv"})
display(df.head())

FileNotFoundError: Unable to find 'C:/kaggle/input/trustpilot-reviews-123k/trustpilot_reviews_2005.csv'

In [88]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("jerassy/trustpilot-reviews-123k")

print("Path to dataset files:", path)

#df = load_dataset('csv', data_files={'train': f"{path}/trustpilot_reviews_2005.csv"})
#display(df.head())
if False:
    df=pd.read_csv(f"{path}/trustpilot_reviews_2005.csv")
    
    
display(df.head(),df.info(),df.shape, df.isna().sum())  

display(df.groupby(df.category).agg({"stars":["mean","median"], "company":["count","nunique"]}))


Path to dataset files: C:\Users\lione\.cache\kagglehub\datasets\jerassy\trustpilot-reviews-123k\versions\1
<class 'pandas.DataFrame'>
RangeIndex: 123181 entries, 0 to 123180
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   category     123181 non-null  str  
 1   company      123181 non-null  str  
 2   description  123181 non-null  str  
 3   title        123181 non-null  str  
 4   review       123181 non-null  str  
 5   stars        123181 non-null  int64
dtypes: int64(1), str(5)
memory usage: 5.6 MB


Unnamed: 0,category,company,description,title,review,stars
0,Animals & Pets,ruffandtumbledogcoats.com,At Ruff and Tumble we are proud to be the mark...,Great quality dog drying robe although‚Ä¶,Great quality dog drying robe although had to ...,5
1,Animals & Pets,ruffandtumbledogcoats.com,At Ruff and Tumble we are proud to be the mark...,Really prompt service,"Really prompt service, The sofa covers have no...",5
2,Animals & Pets,ruffandtumbledogcoats.com,At Ruff and Tumble we are proud to be the mark...,Life saver,I‚Äôve purchased first of those coats in May2020...,5
3,Animals & Pets,ruffandtumbledogcoats.com,At Ruff and Tumble we are proud to be the mark...,Brilliant coats,Brilliant coats. Really like the limited editi...,5
4,Animals & Pets,ruffandtumbledogcoats.com,At Ruff and Tumble we are proud to be the mark...,Great company and products,Great company and products. This is my 3rd dry...,5


None

(123181, 6)

category       0
company        0
description    0
title          0
review         0
stars          0
dtype: int64

Unnamed: 0_level_0,stars,stars,company,company
Unnamed: 0_level_1,mean,median,count,nunique
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Animals & Pets,3.184539,3.0,5446,83
Beauty & Well-being,3.257357,3.0,5199,72
Business Services,3.252181,3.0,6190,89
Construction & Manufacturing,3.124368,3.0,5339,67
Education & Training,3.25216,4.0,6365,86
Electronics & Technology,3.172802,3.0,5596,69
Events & Entertainment,3.164402,3.0,5298,76
"Food, Beverages & Tobacco",3.196607,3.0,5895,93
Health & Medical,3.178711,3.0,5646,76
Hobbies & Crafts,3.388878,4.0,5323,84
