Note to whoever looks at this: the Plotly Dash dashboard behaves very strangely, at least in part because it is run from inside a Jupyter notebook. Consider moving this code to a .py script and viewing the dashboard in a regular browser instead.

In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
import time
import dash
from dash import dcc, html
import plotly.express as px

# Configure a headless Chrome session for Selenium.
chrome_flags = [
    "--headless",  # Run in headless mode
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
]

options = Options()
for flag in chrome_flags:
    options.add_argument(flag)

# `service` and `options` are reused later to start a second browser session.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

def build_search_url(city, make_model=None, min_year=None, max_year=None, by_owner=False):
    """Build the Craigslist `search/cta` URL for one city.

    Args:
        city: Craigslist subdomain, e.g. "knoxville".
        make_model: Free-text search string; omitted from the URL when falsy.
        min_year: Lower bound for `min_auto_year`; omitted when falsy.
        max_year: Upper bound for `max_auto_year`; omitted when falsy.
        by_owner: When true, adds `purveyor=owner`.

    Returns:
        The search URL. Filters appear in the order query, min_auto_year,
        max_auto_year, purveyor. When no filter is set, the bare search URL
        is returned without a trailing "?".
    """
    from urllib.parse import urlencode

    query = {}
    if make_model:
        query["query"] = make_model
    if min_year:
        query["min_auto_year"] = min_year
    if max_year:
        query["max_auto_year"] = max_year
    if by_owner:
        query["purveyor"] = "owner"

    base = f"https://{city}.craigslist.org/search/cta"
    if not query:
        return base
    # urlencode percent-escapes reserved characters (&, #, =, ...) and turns
    # spaces into "+", so arbitrary search text cannot break the query string.
    return f"{base}?{urlencode(query)}"


# Define search parameters
city = "knoxville"
make_model = "Toyota Camry"
min_year = 2010
max_year = 2022
by_owner = True

# Construct search URL
search_url = build_search_url(
    city,
    make_model=make_model,
    min_year=min_year,
    max_year=max_year,
    by_owner=by_owner,
)

# Fetch the page.
# The browser is closed in a `finally` block so that a failed page load or
# parse does not leave a headless Chrome process running in the background.
print(f"Opening URL: {search_url}")
try:
    driver.get(search_url)
    time.sleep(3)  # Allow JavaScript to load
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()
# Collect one {"url", "price"} record per search-result card.
listings = []
for card in soup.select("ol > div"):  # Adjust selector based on actual structure
    link = card.find("a", class_="main", href=True)
    price_el = card.find("span", class_="priceinfo")

    # Skip cards that lack either the main link or a price element.
    if not (link and price_el):
        continue

    listings.append({"url": link["href"], "price": price_el.text.strip()})

Opening URL: https://knoxville.craigslist.org/search/cta?query=Toyota+Camry&min_auto_year=2010&max_auto_year=2022&purveyor=owner


In [17]:
# Extract additional details from each listing page.
MAX_DETAIL_PAGES = 5  # cap on how many listing pages are opened per run

detailed_listings = []
driver = webdriver.Chrome(service=service, options=options)
try:
    for listing in listings[:MAX_DETAIL_PAGES]:
        driver.get(listing["url"])
        time.sleep(2)  # give the page time to load before reading its source
        listing_soup = BeautifulSoup(driver.page_source, "html.parser")

        # Each attribute is a label/value span pair; entries missing either
        # span are skipped.
        attributes = {}
        for attr in listing_soup.select(".attrgroup .attr"):
            label = attr.find("span", class_="labl")
            value = attr.find("span", class_="valu")
            if label and value:
                attributes[label.text.strip()] = value.text.strip()

        detailed_listings.append({"url": listing["url"], "price": listing["price"], **attributes})
finally:
    # Always close the browser, even if a page load or parse fails mid-loop;
    # otherwise the headless Chrome process is left running.
    driver.quit()

# Convert to DataFrame and display
print(f"Found {len(detailed_listings)} listings with details")
df = pd.DataFrame(detailed_listings)

Found 5 listings with details


In [18]:
# Remove every ':' from the column names (scraped labels presumably end in a
# colon, e.g. "odometer:" -- later cells refer to the bare names).
df.columns = df.columns.map(lambda name: name.replace(':', ''))

In [19]:
# Scraped values are free text, so parse them defensively: anything that does
# not parse as a number becomes NaN instead of aborting the cell.

# Convert price to a numeric value ("$12,500" -> 12500.0)
df["price"] = pd.to_numeric(
    df["price"].replace(r"[\$,]", "", regex=True), errors="coerce"
).astype(float)

# Convert odometer to a numeric value ("123,456" -> 123456.0)
df["odometer"] = pd.to_numeric(
    df["odometer"].replace(r",", "", regex=True), errors="coerce"
).astype(float)

# Extract numeric value from cylinders (e.g., "4 cylinders" -> 4).
# The column only exists if at least one scraped listing had that attribute.
if "cylinders" in df.columns:
    df["cylinders"] = (
        df["cylinders"]
        .astype(str)  # keeps `.str` usable when this cell is re-run on floats
        .str.extract(r"(\d+)", expand=False)  # expand=False -> Series, not DataFrame
        .astype(float)
    )

In [24]:
import os  # used by shutdown_server; previously missing, so the button raised NameError

# Create a Dash app
app = dash.Dash(__name__)
fig = px.scatter(df, x="odometer", y="price", color="condition", hover_data=["url"])
fig.update_yaxes(dtick=5000)  # y-axis tick every 5000


@app.callback(
    dash.dependencies.Output('dummy-output', 'children'),
    [dash.dependencies.Input('shutdown-button', 'n_clicks')]
)
def shutdown_server(n_clicks):
    """Terminate the process once the "Shutdown Server" button has been clicked.

    Args:
        n_clicks: Click count of the `shutdown-button` component; falsy until
            the first click.

    Returns:
        An empty string for the `dummy-output` div when no click has happened.
        After a click the function does not return.
    """
    if n_clicks:
        # os._exit ends the whole Python process immediately.
        # NOTE(review): when run inside Jupyter this presumably kills the kernel too -- confirm.
        os._exit(0)
    return ""


app.layout = html.Div([
    html.Button("Shutdown Server", id="shutdown-button"),
    html.Div(id="dummy-output"),
    html.H1("Craigslist Car Listings Dashboard"),
    dcc.Graph(id="price-vs-odometer", figure=fig),
])

# NOTE(review): newer Dash releases favour `app.run(...)` over `run_server`;
# confirm against the installed version before changing.
app.run_server(debug=True)

In [25]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Drop rows with missing values in relevant columns.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below cannot raise pandas' SettingWithCopyWarning.
df = df.dropna(subset=["price", "odometer"]).copy()
if df.empty:
    # Fail with an actionable message rather than scikit-learn's generic
    # ValueError about an array with 0 samples.
    raise ValueError(
        "No listings have both a price and an odometer reading; cannot fit the regression."
    )

# Prepare data for regression
X = df["odometer"].to_numpy().reshape(-1, 1)  # Odometer as independent variable, shape (n, 1)
y = df["price"].to_numpy()  # Price as dependent variable

# Fit Linear Regression Model
model = LinearRegression()
model.fit(X, y)

# Predict prices
df["predicted_price"] = model.predict(X)

# Calculate price deviation (positive = listed above the fitted line)
df["price_deviation"] = df["price"] - df["predicted_price"]

# Add regression line points: 100 evenly spaced odometer values across the data range
x_range = np.linspace(df["odometer"].min(), df["odometer"].max(), 100).reshape(-1, 1)
y_pred_range = model.predict(x_range)

# Create a Plotly figure, colouring each listing by the sign of its deviation
fig = px.scatter(
    df,
    x="odometer",
    y="price",
    color=df["price_deviation"].apply(lambda x: "Above Expected" if x > 0 else "Below Expected"),
    hover_data=["url"],
)

# Add the regression line
fig.add_traces(
    px.line(
        x=x_range.flatten(),
        y=y_pred_range,
        labels={"x": "odometer", "y": "price"},
    ).data
)

fig.update_yaxes(dtick=5000)  # Set y-axis ticks to every $5000