In [52]:
from bs4 import BeautifulSoup
import mechanize

### Creating a Browser Instance

In [53]:
# Browser object through which every request in this notebook is made.
br = mechanize.Browser()
br.set_handle_robots(False)  # switch off mechanize's robots.txt handling for this browser

In [54]:
# Headers for this browser: a single ("User-agent", "Edge") pair.
br.addheaders = [("User-agent", "Edge")]  # the value is plain header text; it does not refer to an installed browser

### Opening a web page

In [64]:
max_pages = 20  # NOTE: not read by the loop below, which visits the fixed list in `pages_to_scrape`

# Page numbers of the Lidl review listing to download.
pages_to_scrape = [5, 6]

# Parsed HTML of every page that was fetched successfully, keyed by page number.
# `soup` is overwritten on each iteration, so without this mapping only the last
# successfully parsed page would survive the loop.
page_soups = {}

for page_num in pages_to_scrape:
    try:
        # Listing URL for this page number.
        url = f"https://uk.trustpilot.com/review/www.lidl.co.uk?page={page_num}"

        # Download the page and parse it. Later cells read `soup`, which ends up
        # holding the last page that parsed without an error.
        response = br.open(url)
        web_page = response.read()
        soup = BeautifulSoup(web_page, "html.parser")

    except Exception as e:
        print(f"Error on page {page_num}: {e}")
        continue  # Skip to the next page if an error occurs

    # Only reached when the fetch and parse above succeeded.
    page_soups[page_num] = soup

if not page_soups:
    # Make a total failure visible here instead of as a confusing error in a later cell.
    print("No pages could be fetched; `soup` was not updated by this cell.")

In [65]:
# Accessing Lidl web page
# response = br.open("https://uk.trustpilot.com/review/www.lidl.co.uk")

## Exercise 2: Scraping the Reviews

In [66]:
# web_page = response.read()
# soup = BeautifulSoup(web_page, "html.parser")

In [67]:
# This gets all the reviews from the store
#soup.select('article')

In [68]:
# This gets the first review from the store
#soup.select('article')[:1]

### Getting the authors of the reviews

In [69]:
# Print the author name found in each <article> element of the parsed page.
for article in soup.select('article'):
    # The name is looked up as a <span> carrying both the heading classes and
    # the data-consumer-name-typography="true" attribute.
    author_element = article.find('span', class_="typography_heading-xs__osRhC typography_appearance-default__t8iAq",
                                  attrs={"data-consumer-name-typography": "true"})
    if author_element:
        review_author = author_element.text.strip()
        print(review_author)
    else:
        # This cell looks for authors, so report a missing author (not a missing title).
        print("No Author Found!")

Frazer
Muhammad Zeeshan
Debs
J Q
Ben Flair
Tommy Bradshaw
Michael Scrancher
Mr Houghton
Lynn Smith
T Laver
ABSquared
Karl Bennett
Szymon Gruber
Oscar Clarkson
Amanda Payne
Chris Dee
Kathsue
Imcrewer
Mark o brien
Owen Daniels
Lynn
Mustafa El-gabi
Sandra
Geo Laz


### Getting the title of each review

In [70]:
# Print the review title found in each <article> element of the parsed page.
for article in soup.select('article'):
    # The title is looked up as an <h2> carrying both the heading classes and
    # the data-service-review-title-typography="true" attribute.
    title_element = article.find(
        'h2',
        class_="typography_heading-xs__osRhC typography_appearance-default__t8iAq",
        attrs={"data-service-review-title-typography": "true"},
    )
    if not title_element:
        # Nothing matched inside this <article>.
        print("No Title Found!")
        continue

    title = title_element.text.strip()
    print(title)

No Title Found!
No Title Found!
No Title Found!
No Title Found!
Fantastic customer service in The Oxford Road (Reading) branch.
Don't go there…
Excellent staff
Was in the Musselburgh (EH21 6QD) and…
I have visited Lidl at park road many…
Ever more expensive and ever worse quality
0 customer service
Not the best experience
Three promotions
The workers were extremely rude
Very disappointing experience in Lidl Green st
Local branch in freefall.. Bye.
Not always good value
The staff were rude
Aggressive store security guard risk to public safety
Got kicked out for nothing when asked…
Their policy of only exchanging tinned…
Terrible nectarines
Worst customer service ever
Angry manager lost his temper in Lidl Beckton Store


### Combining the code to get both author and review title

In [71]:
# Print author and title together for each <article> element on the page.
AUTHOR_SPAN_CLASS = "typography_heading-xs__osRhC typography_appearance-default__t8iAq"

for article in soup.select('article'):
    try:
        # Author: first <span> whose class attribute matches the heading classes.
        author_tag = article.find('span', class_=AUTHOR_SPAN_CLASS)
        if author_tag:
            author = author_tag.text.strip()
        else:
            author = None

        # Title: first <h2> inside the article, whatever its classes are.
        title_tag = article.find('h2')
        if title_tag:
            title = title_tag.text.strip()
        else:
            title = None

    except Exception as inner_e:
        # Report the problem and move on to the next article.
        print(f"Error processing a review: {inner_e}")

    else:
        # Only printed when extraction finished without an exception.
        print(f"Author: {author}", f"Title: {title}", "-" * 20, sep="\n")

Author: Frazer
Title: None
--------------------
Author: Muhammad Zeeshan
Title: None
--------------------
Author: Debs
Title: None
--------------------
Author: J Q
Title: None
--------------------
Author: Ben Flair
Title: Fantastic customer service in The Oxford Road (Reading) branch.
--------------------
Author: Tommy Bradshaw
Title: Don't go there…
--------------------
Author: Michael Scrancher
Title: Excellent staff
--------------------
Author: Mr Houghton
Title: Was in the Musselburgh (EH21 6QD) and…
--------------------
Author: Lynn Smith
Title: I have visited Lidl at park road many…
--------------------
Author: T Laver
Title: Ever more expensive and ever worse quality
--------------------
Author: ABSquared
Title: 0 customer service
--------------------
Author: Karl Bennett
Title: Not the best experience
--------------------
Author: Szymon Gruber
Title: Three promotions
--------------------
Author: Oscar Clarkson
Title: The workers were extremely rude
--------------------
Author: 

### Extracting the review date

In [72]:
# Inspect the date-of-experience markup of a single example article (index 5).
ex_article = soup.select('article')[5]

# <p> element that carries the data-service-review-date-of-experience-typography flag.
p_tag = ex_article.find('p', class_="typography_body-m__k2UI7 typography_appearance-default__t8iAq",
                        attrs={"data-service-review-date-of-experience-typography": "true"})

# find() returns None when nothing matches. Several articles on this page have no
# date paragraph, so guard the nested lookup instead of raising AttributeError.
span_tag = (
    p_tag.find('span', class_="typography_body-m__k2UI7 typography_appearance-subtle__PYOVM")
    if p_tag is not None
    else None
)

In [73]:
# Print author, title and date of experience for each <article> element on the page.
AUTHOR_SPAN_CLASS = "typography_heading-xs__osRhC typography_appearance-default__t8iAq"
DATE_SELECTOR = '[data-service-review-date-of-experience-typography]'

for article in soup.select('article'):
    try:
        # Author: first <span> whose class attribute matches the heading classes.
        author_tag = article.find('span', class_=AUTHOR_SPAN_CLASS)
        if author_tag:
            author = author_tag.text.strip()
        else:
            author = None

        # Title: first <h2> inside the article.
        title_tag = article.find('h2')
        if title_tag:
            title = title_tag.text.strip()
        else:
            title = None

        # Date: text of the element carrying the date-of-experience attribute,
        # or the "N/A" placeholder when the article has no such element.
        date_element = article.select_one(DATE_SELECTOR)
        if date_element:
            date_text = date_element.text
        else:
            date_text = "N/A"

        # Keep only what follows the last colon; text without a colon is used unchanged.
        if ':' in date_text:
            dates = date_text.rpartition(':')[2].strip()
        else:
            dates = date_text

    except Exception as inner_e:
        # Report the problem and move on to the next article.
        print(f"Error processing a review: {inner_e}")

    else:
        # Only printed when extraction finished without an exception.
        print(f"Author: {author}", f"Title: {title}", f"Date: {dates}", "-" * 20, sep="\n")

Author: Frazer
Title: None
Date: N/A
--------------------
Author: Muhammad Zeeshan
Title: None
Date: N/A
--------------------
Author: Debs
Title: None
Date: N/A
--------------------
Author: J Q
Title: None
Date: N/A
--------------------
Author: Ben Flair
Title: Fantastic customer service in The Oxford Road (Reading) branch.
Date: 23 January 2025
--------------------
Author: Tommy Bradshaw
Title: Don't go there…
Date: 20 December 2024
--------------------
Author: Michael Scrancher
Title: Excellent staff
Date: 12 February 2025
--------------------
Author: Mr Houghton
Title: Was in the Musselburgh (EH21 6QD) and…
Date: 11 February 2025
--------------------
Author: Lynn Smith
Title: I have visited Lidl at park road many…
Date: 10 February 2025
--------------------
Author: T Laver
Title: Ever more expensive and ever worse quality
Date: 10 February 2025
--------------------
Author: ABSquared
Title: 0 customer service
Date: 10 February 2025
--------------------
Author: Karl Bennett
Title: Not

### Extracting review rating and review text.

In [74]:
# Print author, title, date, rating and body text for each <article> element on the page.
AUTHOR_SPAN_CLASS = "typography_heading-xs__osRhC typography_appearance-default__t8iAq"

for article in soup.select('article'):
    try:
        # Author: first <span> whose class attribute matches the heading classes.
        author_tag = article.find('span', class_=AUTHOR_SPAN_CLASS)
        if author_tag:
            author = author_tag.text.strip()
        else:
            author = None

        # Title: first <h2> inside the article.
        title_tag = article.find('h2')
        if title_tag:
            title = title_tag.text.strip()
        else:
            title = None

        # Date: whatever follows the last colon of the date-of-experience text.
        date_element = article.select_one('[data-service-review-date-of-experience-typography]')
        if date_element:
            dates = date_element.text.rpartition(':')[2].strip()
        else:
            dates = "N/A"

        # Rating: alt text of the image nested under the rating container.
        rating_element = article.select_one('[data-service-review-rating] img')
        if rating_element:
            rating = rating_element['alt']
        else:
            rating = "N/A"

        # Review text: stripped text of the element flagged as the review body.
        review_text_element = article.select_one('[data-service-review-text-typography]')
        if review_text_element:
            review_text = review_text_element.text.strip()
        else:
            review_text = None

    except Exception as inner_e:
        # Report the problem and move on to the next article.
        print(f"Error processing a review: {inner_e}")

    else:
        # Only printed when extraction finished without an exception.
        print("\n".join([
            f"Author: {author}",
            f"Title: {title}",
            f"Date: {dates}",
            f"Rating: {rating}",
            f"Review Text: {review_text}",
            "-" * 40,
        ]))

Author: Frazer
Title: None
Date: N/A
Rating: N/A
Review Text: None
----------------------------------------
Author: Muhammad Zeeshan
Title: None
Date: N/A
Rating: N/A
Review Text: None
----------------------------------------
Author: Debs
Title: None
Date: N/A
Rating: N/A
Review Text: None
----------------------------------------
Author: J Q
Title: None
Date: N/A
Rating: N/A
Review Text: None
----------------------------------------
Author: Ben Flair
Title: Fantastic customer service in The Oxford Road (Reading) branch.
Date: 23 January 2025
Rating: Rated 5 out of 5 stars
Review Text: Fantastic customer service in The Reading, Oxford Road branch from a young man called Elvis. Though busy, he was working hard to keep the queue moving and was very friendly!
----------------------------------------
Author: Tommy Bradshaw
Title: Don't go there…
Date: 20 December 2024
Rating: Rated 1 out of 5 stars
Review Text: Lidl Folkestone Kent,food off before the use by date, mainly because most of the

## Saving all reviews to a DataFrame

In [75]:
import pandas as pd

In [76]:
# One dict per <article> found in `soup`; turned into a DataFrame at the end of the cell.
all_reviews = []

# Every <article> element of the page currently held in `soup`.
articles = soup.select('article')

# NOTE(review): `page_num` is not set in this cell. It is whatever value the
# page-fetching loop left behind, so every row gets that single page number.
# Confirm that it matches the page `soup` was parsed from.
for article in articles:
    try:
        # Author: first <span> whose class attribute matches the heading classes.
        author_tag = article.find(
            'span',
            class_="typography_heading-xs__osRhC typography_appearance-default__t8iAq",
        )
        if author_tag:
            author = author_tag.text.strip()
        else:
            author = None

        # Title: first <h2> inside the article.
        title_tag = article.find('h2')
        if title_tag:
            title = title_tag.text.strip()
        else:
            title = None

        # Date: text after the last colon; "N/A" when the element is missing,
        # and the raw text when it contains no colon.
        date_element = article.select_one('[data-service-review-date-of-experience-typography]')
        if date_element:
            date_text = date_element.text
        else:
            date_text = "N/A"
        if ':' in date_text:
            review_date = date_text.rpartition(':')[2].strip()
        else:
            review_date = date_text

        # Rating: alt text of the image nested under the rating container.
        rating_element = article.select_one('[data-service-review-rating] img')
        if rating_element:
            rating = rating_element['alt']
        else:
            rating = "N/A"

        # Review text: stripped text of the element flagged as the review body.
        review_text_element = article.select_one('[data-service-review-text-typography]')
        if review_text_element:
            review_text = review_text_element.text.strip()
        else:
            review_text = None

        # Record for this article; the keys become the DataFrame columns.
        review_dict = {
            'review_author': author,
            'review_date_original': review_date,
            'review_title': title,
            'review_rating': rating,
            'review_text': review_text,
            'page_number': page_num,
        }
        all_reviews.append(review_dict)

    except Exception as e:
        # Report the problem; the loop then carries on with the next article.
        print(f"Error processing a review on page {page_num}: {e}")

# Build the DataFrame once from the collected records.
df = pd.DataFrame(data=all_reviews)

In [77]:
# Derive a 'rating' column from the alt text stored in 'review_rating'
# (for example "Rated 5 out of 5 stars" -> "5").
# The first number in the text is extracted. Values that contain no number,
# such as the "N/A" placeholder, become NaN instead of raising an error.
df['rating'] = df['review_rating'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)

In [78]:
# Final column layout, in display order. 'review_rating' is not listed here,
# so it is absent from the reindexed frame.
new_column_order = [
    'review_author',         # author name
    'review_title',          # review title
    'review_date_original',  # date text taken from the review
    'rating',                # rating extracted from the alt text
    'review_text',           # full review text
    'page_number',           # page the review was scraped from
]

# Reorder the columns, then turn 'rating' into numbers. Anything that cannot be
# parsed as a number is coerced to NaN.
df = df.reindex(columns=new_column_order).assign(
    rating=lambda frame: pd.to_numeric(frame['rating'], errors='coerce')
)

In [79]:
# Display the reviews DataFrame as the cell output.
df

Unnamed: 0,review_author,review_title,review_date_original,rating,review_text,page_number
0,Frazer,,,,,6
1,Muhammad Zeeshan,,,,,6
2,Debs,,,,,6
3,J Q,,,,,6
4,Ben Flair,Fantastic customer service in The Oxford Road ...,23 January 2025,5.0,"Fantastic customer service in The Reading, Oxf...",6
5,Tommy Bradshaw,Don't go there…,20 December 2024,1.0,"Lidl Folkestone Kent,food off before the use b...",6
6,Michael Scrancher,Excellent staff,12 February 2025,5.0,"Excellent staff , good prices just a pity that...",6
7,Mr Houghton,Was in the Musselburgh (EH21 6QD) and…,11 February 2025,1.0,Was in the Musselburgh (EH21 6QD) and The mana...,6
8,Lynn Smith,I have visited Lidl at park road many…,10 February 2025,5.0,I have visited Lidl at park road many times an...,6
9,T Laver,Ever more expensive and ever worse quality,10 February 2025,1.0,Ever more expensive (sometimes +50% or even mo...,6


In [80]:
# Show only the rows whose numeric rating equals 5.
df.loc[df['rating'].eq(5)]

Unnamed: 0,review_author,review_title,review_date_original,rating,review_text,page_number
4,Ben Flair,Fantastic customer service in The Oxford Road ...,23 January 2025,5.0,"Fantastic customer service in The Reading, Oxf...",6
6,Michael Scrancher,Excellent staff,12 February 2025,5.0,"Excellent staff , good prices just a pity that...",6
8,Lynn Smith,I have visited Lidl at park road many…,10 February 2025,5.0,I have visited Lidl at park road many times an...,6
