<a href="https://colab.research.google.com/github/rwlopez98/Data-Scraping-Project/blob/main/Scraper_BeautifulSoup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from google.colab import files
import os

from google.colab import drive
# Mount Google Drive at /content/drive so that paths under it (such as the
# CSV output path defined later in this cell) resolve to Drive storage.
drive.mount('/content/drive')

# Function to get post data from a page
def get_posts_from_page(page_url, timeout=30):
    """Download one forum page and return the posts found on it.

    Args:
        page_url: Full URL of the forum page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block forever on a stalled
            connection, which would hang a multi-page scrape.

    Returns:
        A list of ``(user_id, date, comment)`` tuples, one per matched
        post. A field is ``None`` when its tag is absent from the post.
        The list is empty when the server answers with an HTTP error
        status or when no post matches.

    Raises:
        requests.RequestException: Connection failures and timeouts are
            not caught here and propagate to the caller.
    """
    response = requests.get(page_url, timeout=timeout)
    if not response.ok:
        # An error page contains no posts; report it instead of silently
        # parsing it as if it were a normal forum page.
        print(f"Skipping {page_url}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): a multi-word class_ string is compared against the full
    # class attribute value, so <li> elements whose class list differs
    # (e.g. a different role) are not matched - confirm this is intended.
    posts = soup.find_all('li', class_='Item ItemComment noPhotoWrap Role_Member pageBox')

    post_data = []
    for post in posts:
        try:
            post_data.append(_extract_post_fields(post))
        except Exception as e:
            # Best effort: one malformed post must not abort the page.
            print(f"Error extracting data for a post: {e}")

    return post_data


def _extract_post_fields(post):
    """Return the ``(user_id, date, comment)`` tuple for one post element."""
    # The "user id" is the visible text of the username link.
    user_tag = post.find('a', class_='Username js-userCard')
    user_id = user_tag.get_text(strip=True) if user_tag is not None else None

    # The date is read from the 'title' attribute of the first <time> tag.
    time_tag = post.find('time')
    date = time_tag.get('title') if time_tag is not None else None

    # The comment is the stripped text of the message container.
    message_tag = post.find('div', class_='Message userContent')
    comment = message_tag.get_text(strip=True) if message_tag is not None else None

    return (user_id, date, comment)

# Function to scrape posts across multiple pages
def scrape_edmunds_forum(base_url, start_page, end_page, max_posts=5000):
    """Scrape posts from a range of pages of one forum discussion.

    Pages are visited from ``start_page`` to ``end_page`` inclusive, in
    whichever direction that implies: counting down when
    ``start_page >= end_page`` (newest pages first, as the caller in this
    file does) and counting up otherwise.

    Args:
        base_url: Discussion URL without the page suffix; each page is
            requested as ``{base_url}/p{page_num}``.
        start_page: Number of the first page to scrape.
        end_page: Number of the last page to scrape (inclusive).
        max_posts: Stop requesting further pages once at least this many
            posts have been collected. ``None`` disables the limit.

    Returns:
        A list of the post tuples returned by ``get_posts_from_page``, in
        the order the pages were visited. The limit is checked after each
        whole page, so the list may exceed ``max_posts`` by part of the
        final page.
    """
    all_posts = []

    # Derive the step from the arguments so an ascending range does not
    # silently produce an empty result.
    step = -1 if start_page >= end_page else 1

    for page_num in range(start_page, end_page + step, step):
        print(f"Scraping page {page_num}...")
        page_url = f"{base_url}/p{page_num}"
        all_posts.extend(get_posts_from_page(page_url))

        if max_posts is not None and len(all_posts) >= max_posts:
            break

    return all_posts

# URL of the forum discussion; page numbers are appended by the scraper.
base_url = 'https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans'

# Scrape from the most recent page (435) back towards page 1.
posts = scrape_edmunds_forum(base_url, start_page=435, end_page=1)

# Create a DataFrame with columns: user_id, date, comment
df = pd.DataFrame(posts, columns=["user_id", "date", "comment"])

# Turn the default integer index into a regular column. reset_index()
# already names that column 'index', so no rename is required.
df = df.reset_index()

# Define the file path to save the CSV
file_path = '/content/drive/MyDrive/Colab Notebooks/UT Assignments/Analytics for Unstructured Data/Assignment 1/edmunds_forum_posts_with_user_data.csv'

# Make sure the target folder exists before writing into it.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Export to CSV to the specified location
df.to_csv(file_path, index=False)

print(f"Successfully scraped and saved {len(posts)} posts to DataFrame and exported to CSV at {file_path}")

# Download the file to the local machine. The CSV lives at file_path on
# Drive, not in the working directory, so the full path must be passed;
# a bare filename raises FileNotFoundError.
files.download(file_path)

Mounted at /content/drive
Scraping page 435...
Scraping page 434...
Scraping page 433...
Scraping page 432...
Scraping page 431...
Scraping page 430...
Scraping page 429...
Scraping page 428...
Scraping page 427...
Scraping page 426...
Scraping page 425...
Scraping page 424...
Scraping page 423...
Scraping page 422...
Scraping page 421...
Scraping page 420...
Scraping page 419...
Scraping page 418...
Scraping page 417...
Scraping page 416...
Scraping page 415...
Scraping page 414...
Scraping page 413...
Scraping page 412...
Scraping page 411...
Scraping page 410...
Scraping page 409...
Scraping page 408...
Scraping page 407...
Scraping page 406...
Scraping page 405...
Scraping page 404...
Scraping page 403...
Scraping page 402...
Scraping page 401...
Scraping page 400...
Scraping page 399...
Scraping page 398...
Scraping page 397...
Scraping page 396...
Scraping page 395...
Scraping page 394...
Scraping page 393...
Scraping page 392...
Scraping page 391...
Scraping page 390...
Scraping

FileNotFoundError: Cannot find file: edmunds_forum_posts_with_user_data.csv