In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read vehicles.csv (path is relative to the kernel's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='vehicles.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


In [4]:
# Print a header line, then the frame summary: index range, column count,
# and the non-null count and dtype of each column.
print('Dataset Info:')
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 26 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            426880 non-null  int64  
 1   url           426880 non-null  object 
 2   region        426880 non-null  object 
 3   region_url    426880 non-null  object 
 4   price         426880 non-null  int64  
 5   year          425675 non-null  float64
 6   manufacturer  409234 non-null  object 
 7   model         421603 non-null  object 
 8   condition     252776 non-null  object 
 9   cylinders     249202 non-null  object 
 10  fuel          423867 non-null  object 
 11  odometer      422480 non-null  float64
 12  title_status  418638 non-null  object 
 13  transmission  424324 non-null  object 
 14  VIN           265838 non-null  object 
 15  drive         296313 non-null  object 
 16  size          120519 non-null  object 
 17  type          334022 non-null  obj

In [5]:
# Absolute number of missing entries per column, printed under a header line.
print('\nMissing Values:', df.isna().sum(), sep='\n')


Missing Values:
id                   0
url                  0
region               0
region_url           0
price                0
year              1205
manufacturer     17646
model             5277
condition       174104
cylinders       177678
fuel              3013
odometer          4400
title_status      8242
transmission      2556
VIN             161042
drive           130567
size            306361
type             92858
paint_color     130203
image_url           68
description         70
county          426880
state                0
lat               6549
long              6549
posting_date        68
dtype: int64


In [6]:
# Share of missing entries per column, expressed as a percentage of all rows
# and listed from the most incomplete column to the most complete one.
missing_percentages = df.isna().sum().div(len(df)).mul(100)
missing_df = pd.DataFrame({'Missing Values': missing_percentages})
missing_sorted = missing_df.sort_values(by='Missing Values', ascending=False)
print(missing_sorted)

              Missing Values
county            100.000000
size               71.767476
cylinders          41.622470
condition          40.785232
VIN                37.725356
drive              30.586347
paint_color        30.501078
type               21.752717
manufacturer        4.133714
title_status        1.930753
lat                 1.534155
long                1.534155
model               1.236179
odometer            1.030735
fuel                0.705819
transmission        0.598763
year                0.282281
description         0.016398
posting_date        0.015930
image_url           0.015930
region_url          0.000000
url                 0.000000
id                  0.000000
region              0.000000
price               0.000000
state               0.000000


In [7]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(426880, 26)

In [8]:
def show_detailed_categories(df, top_n=10):
    """Print a frequency summary for every categorical column of ``df``.

    For each column whose dtype is ``object`` or ``category`` this prints the
    ``top_n`` most frequent values with their count, percentage and cumulative
    percentage, followed by the number of distinct values and the number of
    missing entries.

    Note the two different denominators: percentages in the table are relative
    to the column's non-null values (``value_counts`` drops NaN by default),
    while the "Missing values" percentage is relative to the total row count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. It is only read, never modified.
    top_n : int, default 10
        Number of most frequent categories to print per column.

    Returns
    -------
    None
        The report is written to standard output.
    """
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    n_rows = len(df)  # loop-invariant denominator for the missing-value share

    for col in categorical_columns:
        print(f"\nAnalysis for: {col}")
        print("-" * 50)

        # Count once and derive the percentages from those counts. This is
        # what value_counts(normalize=True) computes internally, but avoids a
        # second full pass over high-cardinality columns.
        counts = df[col].value_counts()
        percentages = counts / counts.sum() * 100

        # value_counts sorts by descending frequency, so the running sum gives
        # the coverage reached by the top-k categories.
        summary = pd.DataFrame({
            'Count': counts,
            'Percentage': percentages.round(2),
            'Cumulative_Percentage': percentages.cumsum().round(2)
        })

        print(f"Top {top_n} categories:")
        print(summary.head(top_n))

        # Compute the null count once; guard the division so a zero-row frame
        # reports 0.00% instead of NaN plus a NumPy runtime warning.
        n_missing = df[col].isnull().sum()
        missing_pct = n_missing / n_rows * 100 if n_rows else 0.0
        print(f"\nTotal categories: {len(counts)}")
        print(f"Missing values: {n_missing} ({missing_pct:.2f}%)")

# Report the 5 most frequent values of every object/category column in df.
show_detailed_categories(df, top_n=5)


Analysis for: url
--------------------------------------------------
Top 5 categories:
                                                    Count  Percentage  \
url                                                                     
https://wyoming.craigslist.org/ctd/d/conrad-201...      1         0.0   
https://wyoming.craigslist.org/ctd/d/conrad-201...      1         0.0   
https://wyoming.craigslist.org/ctd/d/conrad-201...      1         0.0   
https://wyoming.craigslist.org/ctd/d/conrad-201...      1         0.0   
https://wyoming.craigslist.org/ctd/d/billings-2...      1         0.0   

                                                    Cumulative_Percentage  
url                                                                        
https://wyoming.craigslist.org/ctd/d/conrad-201...                    0.0  
https://wyoming.craigslist.org/ctd/d/conrad-201...                    0.0  
https://wyoming.craigslist.org/ctd/d/conrad-201...                    0.0  
https://wyoming.crai