In [85]:
from bs4 import BeautifulSoup
import mechanize

### Creating a Browser Instance

In [86]:
br = mechanize.Browser()
br.set_handle_robots(False)  # Google demands a user-agent that isn't a robot

In [87]:
br.addheaders = [("User-agent", "Edge")]  # if chrome is not installed, you may also select 'firefox'

### Opening a web page

In [88]:
# Accessing Lidl web page
response = br.open("https://uk.trustpilot.com/review/www.lidl.co.uk")

## Exercise 2: Scraping the Reviews

In [89]:
web_page = response.read()
soup = BeautifulSoup(web_page, "html.parser")

In [90]:
# This gets all the reviews from the store
soup.select('article')

[<article class="styles_reviewCard__wHynN" data-service-review-card-paper="true"><aside aria-label="Info for J Q" class="styles_consumerInfoWrapper__uQZDN"><div class="styles_consumerDetailsWrapper__LSBJS"><div class="avatar_imageWrapper__9hWrp" style="width:44px;height:44px;min-width:44px;min-height:44px"><img alt="" data-consumer-avatar-image="true" data-nimg="1" decoding="async" height="44" loading="lazy" src="https://user-images.trustpilot.com/67dc432e20cf684fc766a783/73x73.png" style="color:transparent" width="44"/></div><a class="link_internal__Eam_b link_wrapper__ahpyq styles_consumerDetails__qg84T" data-consumer-profile-link="true" href="/users/67dc432e20cf684fc766a783" name="consumer-profile" rel="nofollow" target="_self"><span class="typography_heading-xs__osRhC typography_appearance-default__t8iAq" data-consumer-name-typography="true">J Q</span><div class="styles_consumerExtraDetails__TylYM" data-consumer-reviews-count="1"><div class="typography_body-m__k2UI7 typography_appe

In [91]:
# This gets the first review from the store
soup.select('article')[:1]

[<article class="styles_reviewCard__wHynN" data-service-review-card-paper="true"><aside aria-label="Info for J Q" class="styles_consumerInfoWrapper__uQZDN"><div class="styles_consumerDetailsWrapper__LSBJS"><div class="avatar_imageWrapper__9hWrp" style="width:44px;height:44px;min-width:44px;min-height:44px"><img alt="" data-consumer-avatar-image="true" data-nimg="1" decoding="async" height="44" loading="lazy" src="https://user-images.trustpilot.com/67dc432e20cf684fc766a783/73x73.png" style="color:transparent" width="44"/></div><a class="link_internal__Eam_b link_wrapper__ahpyq styles_consumerDetails__qg84T" data-consumer-profile-link="true" href="/users/67dc432e20cf684fc766a783" name="consumer-profile" rel="nofollow" target="_self"><span class="typography_heading-xs__osRhC typography_appearance-default__t8iAq" data-consumer-name-typography="true">J Q</span><div class="styles_consumerExtraDetails__TylYM" data-consumer-reviews-count="1"><div class="typography_body-m__k2UI7 typography_appe

### Getting Authors of the reviews

In [92]:
# Iterate through each <article> tag (each review) 
for article in soup.select('article')[:10]:
    # finding the tags associated to the article
    author_element = article.find('span', class_="typography_heading-xs__osRhC typography_appearance-default__t8iAq", 
                                  attrs={"data-consumer-name-typography": "true"})
    if author_element:
        review_author = author_element.text.strip()
        print(review_author)
    else:
        print("No Title Found!")

J Q
Elizabeth Beirne
Mohammed
Magda Fellino
Stephen
Adam Cadman
Christine Espley
Mohammed
J Q
Elizabeth Beirne


### Getting the title of review

In [93]:
# Iterate through each <article> tag (each review) 
for article in soup.select('article')[:10]:
    # finding the tags associated to the review title
    title_element = article.find('h2', class_="typography_heading-xs__osRhC typography_appearance-default__t8iAq", 
                                  attrs={"data-service-review-title-typography": "true"})
    if title_element:
        title = title_element.text.strip()
        print(title)
    else:
        print("No Title Found!")

No Title Found!
No Title Found!
No Title Found!
No Title Found!
Con artists
Lidl - my most useful supermarket!
Nordic “promotion” my aunt sally.
Arrogant aggressive accusing blaming dishonest managers
Lawrence Hill store
huge clump of hair in cretan pies…nice


### Combining the code to get both author and review title

In [94]:
for article in soup.select('article')[:10]:
    try:
        # Correctly extract author using proper class name
        author_tag = article.find('span', class_="typography_heading-xs__osRhC typography_appearance-default__t8iAq")
        author = author_tag.text.strip() if author_tag else None  # Fixed ternary operator syntax
        
        # Extract title from h2 tag with proper handling
        title_tag = article.find('h2')
        title = title_tag.text.strip() if title_tag else None  # Added .strip() for cleanliness

    except Exception as inner_e:
        print(f"Error processing a review: {inner_e}")
        continue  # Skip to next iteration if error occurs

    # Print statements moved outside the try-except block
    print(f"Author: {author}")
    print(f"Title: {title}")
    print("-" * 20)  # Separator between reviews

Author: J Q
Title: None
--------------------
Author: Elizabeth Beirne
Title: None
--------------------
Author: Mohammed
Title: None
--------------------
Author: Magda Fellino
Title: None
--------------------
Author: Stephen
Title: Con artists
--------------------
Author: Adam Cadman
Title: Lidl - my most useful supermarket!
--------------------
Author: Christine Espley
Title: Nordic “promotion” my aunt sally.
--------------------
Author: Mohammed
Title: Arrogant aggressive accusing blaming dishonest managers
--------------------
Author: J Q
Title: Lawrence Hill store
--------------------
Author: Elizabeth Beirne
Title: huge clump of hair in cretan pies…nice
--------------------


### Extracting the review date

In [95]:
ex_article = soup.select('article')[5]
p_tag = ex_article.find('p', class_="typography_body-m__k2UI7 typography_appearance-default__t8iAq",
                        attrs={"data-service-review-date-of-experience-typography": "true"})
span_tag = p_tag.find('span', class_="typography_body-m__k2UI7 typography_appearance-subtle__PYOVM")

In [96]:
for article in soup.select('article')[:10]:
    try:
        # Correct class name formatting with proper string continuation
        author_tag = article.find('span', 
            class_="typography_heading-xs__osRhC typography_appearance-default__t8iAq")
        author = author_tag.text.strip() if author_tag else None
        
        title_tag = article.find('h2')
        title = title_tag.text.strip() if title_tag else None
        
        # Fix attribute selector syntax and add error handling
        date_element = article.select_one('[data-service-review-date-of-experience-typography]')
        date_text = date_element.text if date_element else "N/A"
        dates = date_text.split(':')[-1].strip() if ':' in date_text else date_text

    except Exception as inner_e:
        print(f"Error processing a review: {inner_e}")
        continue  # Skip to next review if error occurs

    # Print statements moved outside the try block
    print(f"Author: {author}")
    print(f"Title: {title}")
    print(f"Date: {dates}")
    print("-" * 20)

Author: J Q
Title: None
Date: N/A
--------------------
Author: Elizabeth Beirne
Title: None
Date: N/A
--------------------
Author: Mohammed
Title: None
Date: N/A
--------------------
Author: Magda Fellino
Title: None
Date: N/A
--------------------
Author: Stephen
Title: Con artists
Date: 21 March 2025
--------------------
Author: Adam Cadman
Title: Lidl - my most useful supermarket!
Date: 21 March 2025
--------------------
Author: Christine Espley
Title: Nordic “promotion” my aunt sally.
Date: 15 March 2025
--------------------
Author: Mohammed
Title: Arrogant aggressive accusing blaming dishonest managers
Date: 04 February 2025
--------------------
Author: J Q
Title: Lawrence Hill store
Date: 20 March 2025
--------------------
Author: Elizabeth Beirne
Title: huge clump of hair in cretan pies…nice
Date: 19 March 2025
--------------------


### Extracting review rating and review text.

In [112]:
for article in soup.select('article')[:10]:
    try:
        author_tag = article.find('span', 
                                  class_="typography_heading-xs__osRhC typography_appearance-default__t8iAq")
        author = author_tag.text.strip() if author_tag else None  # Fixed variable name check
        
        title_tag = article.find('h2')  # Replace 'h2' with the correct tag (e.g., 'h3', 'div')
        title = title_tag.text.strip() if title_tag else None
        
        # Date
        date_element = article.select_one('[data-service-review-date-of-experience-typography]')
        dates = date_element.text.split(':')[-1].strip() if date_element else "N/A"
        
        # Rating
        rating_element = article.select_one('[data-service-review-rating] img')
        rating = rating_element['alt'] if rating_element else "N/A"  # Removed trailing comma
        
        # Review text
        review_text_element = article.select_one('[data-service-review-text-typography]')
        review_text = review_text_element.text.strip() if review_text_element else None
        

    except Exception as inner_e:
        print(f"Error processing a review: {inner_e}")
        continue  # Skip to next review if error occurs

    # Print statements moved outside the try block for clarity
    print(f"Author: {author}")
    print(f"Title: {title}")
    print(f"Date: {dates}")
    print(f"Rating: {rating}")
    print(f"Review Text: {review_text}")
    print("-" * 40)  # Separator between reviews

Author: J Q
Title: None
Date: N/A
Rating: N/A
Review Text: None
----------------------------------------
Author: Elizabeth Beirne
Title: None
Date: N/A
Rating: N/A
Review Text: None
----------------------------------------
Author: Mohammed
Title: None
Date: N/A
Rating: N/A
Review Text: None
----------------------------------------
Author: Magda Fellino
Title: None
Date: N/A
Rating: N/A
Review Text: None
----------------------------------------
Author: Stephen
Title: Con artists
Date: 21 March 2025
Rating: Rated 1 out of 5 stars
Review Text: Con artists, offer you a free item etc, but if you buy more than one eligible they take the cheapest item off...joke
----------------------------------------
Author: Adam Cadman
Title: Lidl - my most useful supermarket!
Date: 21 March 2025
Rating: Rated 5 out of 5 stars
Review Text: I use my local Lidl (Binley Road, Coventry) for most of my shopping as it's within walking distance.The staff are friendly, and many recognise me and say Hi.The store, r

## Saving All Reviews to Dataframe

In [100]:
import pandas as pd

In [114]:
# List to store all reviews
all_reviews = []

# Loop through pages (adjust 'max_pages' as needed)
max_pages = 5  # Change this to scrape more pages
for page_num in range(1, max_pages + 1):
    try:
        # Construct URL with page number
        url = f"https://uk.trustpilot.com/review/www.lidl.co.uk?page={page_num}"
        
        # Extract reviews from current page
        articles = soup.select('article')
        
        for article in articles:
            try:
                # Extract author
                
                author_tag = article.find('span',
                                  class_="typography_heading-xs__osRhC typography_appearance-default__t8iAq")
                author = author_tag.text.strip() if author_tag else None
            
                
                # Extract title
                title_tag = article.find('h2')
                title = title_tag.text.strip() if title_tag else None  # Added .strip() for cleanliness
                
                # Extract date
                date_element = article.select_one('[data-service-review-date-of-experience-typography]')
                date_text = date_element.text if date_element else "N/A"
                review_date = date_text.split(':')[-1].strip() if ':' in date_text else date_text
                
                
                # Extract rating
                rating_element = article.select_one('[data-service-review-rating] img')
                rating = rating_element['alt'] if rating_element else "N/A"
                
                # Extract review text
                review_text_element = article.select_one('[data-service-review-text-typography]')
                review_text = review_text_element.text.strip() if review_text_element else None
                
                # Create dictionary and append to list
                review_dict = {
                    'review_author': author,
                    'review_date_original': review_date,
                    'review_title': title,
                    'review_rating': rating,
                    'review_text': review_text,
                    'page_number': page_num
                }
                all_reviews.append(review_dict)
                
            except Exception as e:
                print(f"Error processing a review on page {page_num}: {e}")
                continue  # Skip to next review
            
        # Add delay to avoid IP blocking
#         time.sleep(2)
        
    except Exception as e:
        print(f"Error on page {page_num}: {e}")
        continue

# Convert list of dictionaries to DataFrame
df = pd.DataFrame(all_reviews)

In [115]:
df

Unnamed: 0,review_author,review_date_original,review_title,review_rating,review_text,page_number
0,J Q,,,,,1
1,Elizabeth Beirne,,,,,1
2,Mohammed,,,,,1
3,Magda Fellino,,,,,1
4,Stephen,21 March 2025,Con artists,Rated 1 out of 5 stars,"Con artists, offer you a free item etc, but if...",1
...,...,...,...,...,...,...
115,Paul,15 March 2025,Lidl Twickenham please sell to Aldi.,Rated 1 out of 5 stars,Lidl Twickenham removed numerous tills to inst...,5
116,Sweetpea,15 March 2025,Taunton Lidls lettuce not chilled- now floppy...,Rated 1 out of 5 stars,"Taunton Lidls now, for some weeks, keeps lettu...",5
117,Mark Hennessy,24 February 2025,Lidl in Fakenham in Norfolk has great…,Rated 5 out of 5 stars,Lidl in Fakenham in Norfolk has great staff wh...,5
118,MR D STALKER,16 March 2025,Frustrating,Rated 3 out of 5 stars,I do like Lidl but find it very frustrating wh...,5
