### Script Purpose

- Collect ~5000 or more messages posted in Edmunds.com discussion forums on cars.
- Present the results in a clean .csv file with three columns: `date`, `user_id`, and `message`.

Source thread: https://forums.edmunds.com/discussion/4011/general/x/chronic-car-buyers-anonymous

In [1]:
import requests
import time
from bs4 import BeautifulSoup

import pandas as pd

In [2]:
# User-defined function for Scraping the data

def getData(url, timeout=30):
    """Download a page and return its body as text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout,
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    str
        The decoded response body (``Response.text``).

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # NOTE(review): 'My-Header' looks like a placeholder header -- confirm
    # whether the site needs a real one (e.g. a User-Agent).
    page = requests.get(url, headers = {'My-Header':'value'}, timeout = timeout)

    # Fail loudly on an error status instead of handing an error page to the
    # HTML parser, where it would silently produce zero comments.
    page.raise_for_status()
    return page.text

In [3]:
# Scrape all the pages

# Base URL of the discussion thread; the page number is appended after "/p".
url = "https://forums.edmunds.com/discussion/4011/general/x/chronic-car-buyers-anonymous/p"

# Page range to scrape: FIRST_PAGE is included, LAST_PAGE is excluded.
FIRST_PAGE = 2560
LAST_PAGE = 2860

# Parallel lists: position i in each list describes the same comment.
authors = []
messages = []
dates = []

# Extract recent comments
for page in range(FIRST_PAGE, LAST_PAGE):

    # First extract the page
    page_url = url + str(page)
    soup = BeautifulSoup(getData(page_url), 'html.parser')

    # In each page, scrape the comments
    # Each comment: <div class="Comment">
        # Author: <a class="Username js-userCard">
        # Date: <time title= >
        # Message: <div class="Message userContent">

    comment_tags = soup.find_all('div', class_ = 'Comment')

    for comment in comment_tags:
        a_tag = comment.find("a", class_="Username js-userCard")
        t_tag = comment.find("time")
        m_tag = comment.find('div', class_="Message userContent")

        # .get() returns None instead of raising KeyError when <time> has no title.
        date_text = t_tag.get('title') if t_tag is not None else None

        # Record a comment only when all three fields were found. Appending
        # each field under its own guard lets a single missing tag shift the
        # lists out of step, so zip() below would pair an author or date with
        # somebody else's message.
        if a_tag is None or m_tag is None or date_text is None:
            continue

        authors.append(a_tag.text)
        dates.append(date_text)
        messages.append(m_tag.text)

comments = pd.DataFrame(list(zip(dates,authors,messages)), columns = ['date','user_id','message'])
comments.head()


Unnamed: 0,date,user_id,message
0,"November 30, 2021 10:59AM",stickguy,\nJust watched this. Seems like GTI is better ...
1,"November 30, 2021 11:09AM",explorerx4,"\nStickguy,Did you see the video of TFL testin..."
2,"November 30, 2021 11:14AM",oldfarmer50,\nqbrozen said: kyfdx said: qbrozen said:wonde...
3,"November 30, 2021 11:16AM",tjc78,"\nLooking at the Lexus deal, I wonder it I wou..."
4,"November 30, 2021 11:17AM",oldfarmer50,\nstickguy said:Beautiful example of one of th...


In [4]:
# Assemble one row per scraped comment from the three parallel lists.
# zip() stops at the shortest list, so rows are only formed where all
# three lists have an entry.
column_names = ['date','user_id','message']
rows = list(zip(dates, authors, messages))
comments = pd.DataFrame(rows, columns=column_names)
comments.head()

Unnamed: 0,date,user_id,message
0,"November 30, 2021 10:59AM",stickguy,\nJust watched this. Seems like GTI is better ...
1,"November 30, 2021 11:09AM",explorerx4,"\nStickguy,Did you see the video of TFL testin..."
2,"November 30, 2021 11:14AM",oldfarmer50,\nqbrozen said: kyfdx said: qbrozen said:wonde...
3,"November 30, 2021 11:16AM",tjc78,"\nLooking at the Lexus deal, I wonder it I wou..."
4,"November 30, 2021 11:17AM",oldfarmer50,\nstickguy said:Beautiful example of one of th...


In [5]:
# Extract only 2022 data
# Parse the scraped date strings into pandas timestamps so they can be compared.
comments['date'] = pd.to_datetime(comments['date'])

# Half-open window [2022-01-01, 2023-01-01): start included, end excluded.
window_start = pd.Timestamp(2022,1,1)
window_end = pd.Timestamp(2023,1,1)
posted_in_2022 = comments['date'].ge(window_start) & comments['date'].lt(window_end)

comments = comments[posted_in_2022]

comments.head()

Unnamed: 0,date,user_id,message
1380,2022-01-01 03:53:00,benjaminh,\nAlthough the acceleration times in most car ...
1381,2022-01-01 03:57:00,tjc78,\n\n@stickguy said:\nXM only comes on it if yo...
1382,2022-01-01 04:08:00,tjc78,\n\n@qbrozen said:\nI don’t have the room for ...
1383,2022-01-01 04:48:00,graphicguy,\nCongrats @stickguy ……very cool!
1384,2022-01-01 06:03:00,au1994,\nHappy New Year all!Congrats @stickguy. I rea...


In [6]:
# Display the (rows, columns) size of the comments DataFrame.
comments.shape

(13503, 3)

In [7]:
# Write the comments to disk; index=False leaves the DataFrame index out of the file.
output_path = 'forum_comments_2022.csv'
comments.to_csv(output_path, index=False)
