## Downloading the Dataset

This code downloads the latest version of the dataset and copies it into a local `dataset/` folder automatically (you need to be logged in to Kaggle for this to work).

The description of the dataset (and each individual column) is available at https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data

In [1]:
import os
import shutil
import kagglehub

output_dir = "dataset/"
# CSV that the loading cell reads; its presence is what "already downloaded" means here.
expected_file = os.path.join(output_dir, "vehicles.csv")
os.makedirs(output_dir, exist_ok=True)

# Check for the expected CSV instead of "folder is non-empty": a stray file
# (e.g. .DS_Store or .gitkeep) would otherwise skip the download and leave the
# notebook without the data it needs.
if os.path.isfile(expected_file):
    print(f"Dataset already exists in '{output_dir}', skipping download.")
else:
    # Download the latest version
    path = kagglehub.dataset_download("austinreese/craigslist-carstrucks-data")
    print("Path to downloaded dataset files:", path)

    # Copy all files from the KaggleHub path to the local dataset folder.
    # The downloaded files at `path` are left in place (this is a copy, not a move).
    for item in os.listdir(path):
        s = os.path.join(path, item)
        d = os.path.join(output_dir, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, dirs_exist_ok=True)
        else:
            shutil.copy2(s, d)

    print("Dataset copied to:", os.path.abspath(output_dir))

Dataset already exists in 'dataset/', skipping download.


  from .autonotebook import tqdm as notebook_tqdm


## Domain

## What

In [2]:
import pandas as pd
import numpy as np

# Load the listings CSV into memory and report its dimensions.
df = pd.read_csv("dataset/vehicles.csv", sep=",")

print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

Number of rows: 426880
Number of columns: 26


In [3]:




# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


In [None]:
## checking each column for missing values, unique values, most frequent value, string lengths, etc

def summarize_columns(df):
    """Build a per-column profile of ``df``.

    For every column the summary reports its dtype, the non-null and missing
    counts, the missing fraction (rounded to 4 decimals), the number of
    distinct non-null values, whether the dtype is categorical, the most
    frequent value and its count, and the mean/min/max length of the non-null
    values after converting them to strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, indexed by column name (index name
        ``"column"``) and sorted by that index. A frame without columns yields
        an empty summary with the same fields instead of raising ``KeyError``.
    """
    # Fixed field order; also lets a column-less frame produce a well-formed
    # (empty) summary, since set_index("column") needs the column to exist.
    fields = [
        "dtype", "non_null", "missing", "missing_pct", "unique",
        "is_categorical", "top", "top_freq", "avg_len", "min_len", "max_len",
    ]

    n = len(df)
    rows = []
    for col in df.columns:
        ser = df[col]
        non_null = ser.count()
        missing = n - non_null
        # Guard the division so a zero-row frame reports NaN instead of failing.
        missing_pct = missing / n if n > 0 else np.nan

        # mode() returns its values sorted, so ties resolve to the smallest one.
        mode = ser.mode(dropna=True)
        top = mode.iloc[0] if not mode.empty else np.nan
        vc = ser.value_counts(dropna=True)
        top_freq = int(vc.iloc[0]) if not vc.empty else 0

        # Length statistics are taken over the string form of non-null values,
        # so numeric columns report the length of their printed representation.
        str_ser = ser.dropna().astype(str)
        if str_ser.empty:
            avg_len = min_len = max_len = np.nan
        else:
            lens = str_ser.map(len)
            avg_len, min_len, max_len = lens.mean(), lens.min(), lens.max()

        rows.append({
            "column": col,
            "dtype": str(ser.dtype),
            "non_null": non_null,
            "missing": missing,
            "missing_pct": round(missing_pct, 4),
            "unique": ser.nunique(dropna=True),
            "is_categorical": isinstance(ser.dtype, pd.CategoricalDtype),
            "top": top,
            "top_freq": top_freq,
            "avg_len": avg_len,
            "min_len": min_len,
            "max_len": max_len,
        })

    summary = (
        pd.DataFrame(rows, columns=["column", *fields])
        .set_index("column")
        .sort_index()
    )
    return summary

# Profile every column of df and keep the result for later inspection.
summary = summarize_columns(df)

# Show the resulting summary table in the cell output.
display(summary)

Unnamed: 0_level_0,dtype,non_null,missing,missing_pct,unique,is_categorical,top,top_freq,avg_len,min_len,max_len
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
VIN,object,265838,161042,0.3773,118264,False,1FMJU1JT1HEA52352,261,16.971208,1.0,24.0
condition,object,252776,174104,0.4079,6,False,good,121456,6.344151,3.0,9.0
county,float64,0,426880,1.0,0,False,,0,,,
cylinders,object,249202,177678,0.4162,8,False,6 cylinders,94169,10.975426,5.0,12.0
description,object,426810,70,0.0002,360911,False,35 VEHICLES PRICED UNDER $3000!!! BIG TIME! T...,231,2972.273297,1.0,28832.0
drive,object,296313,130567,0.3059,3,False,4wd,131904,3.0,3.0,3.0
fuel,object,423867,3013,0.0071,5,False,gas,356209,3.41438,3.0,8.0
id,int64,426880,0,0.0,426880,False,7207408119,1,10.0,10.0,10.0
image_url,object,426812,68,0.0002,241899,False,https://images.craigslist.org/00N0N_1xMPvfxRAI...,7357,66.903428,57.0,67.0
lat,float64,420331,6549,0.0153,53181,False,33.779214,3301,7.681584,4.0,10.0


## Why

## How

## Group Dynamics