# CarGurus.com Scrape
- Notes: scraped September 2020. Good data. Description, major options, etc
- 3m obs, 66 vars
- https://www.kaggle.com/datasets/ananaymital/us-used-cars-dataset
- original filename used_cars_data.csv

### Other dataset-specific notebooks to review
- https://www.kaggle.com/code/alvarolozanoalonso/mileage-tfm-15-09-2021
- https://www.kaggle.com/code/fr3shk/vehicle-price-prediction-part-1
- https://www.kaggle.com/code/fr3shk/vehicle-price-prediction-part-2

## Environment

In [1]:
# versions

from platform import python_version
print('python',python_version())

from importlib.metadata import version
print('numpy',version('numpy'))
print('pandas',version('pandas'))
print('pandas_profiling',version('pandas_profiling'))

python 3.9.13
numpy 1.21.5
pandas 1.4.4
pandas_profiling 3.5.0


In [2]:
import random
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from uszipcode import SearchEngine # https://pypi.org/project/uszipcode/ / https://www.pythonpool.com/uszipcode-python/
import missingno as msno # https://github.com/ResidentMario/missingno

In [10]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 50)

## Data

### Load and Inspect

In [3]:
df = pd.read_parquet('data/cargurus_kaggle.parquet')

In [4]:
(df.memory_usage().sum() / 1024**2).round()

631.8689498901367

In [13]:
# drop duplicates by vin
print(df.shape)
df.drop_duplicates(subset = 'vin',inplace=True)
print(df.shape)

(3000040, 64)
(3000000, 64)


In [14]:
# drop unneeded obs
df = df[df['is_new'] == False]
print(df.shape)

(1528979, 64)


In [19]:
# drop columns unimportant to model
df.drop(columns=['bed_height' # all '--'
                ,'bed' # mostly na (98%) and redundant with bed_length anyway
                ,'listing_id'
                ,'main_picture_url'
                ,'savings_amount' # involves cargurus price projection
                ,'sp_id' # dealer id
                ,'sp_name' # dealer name
                ,'wheel_system_display' # redundant with wheel_system
                ] 
    ,inplace=True)
print(df.shape)

(1528979, 56)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['bed_height' # all '--'


In [20]:
# drop empty columns
df.dropna(axis = 'columns', how = 'all', inplace = True)
print(df.shape)

(1528979, 55)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis = 'columns', how = 'all', inplace = True)


In [21]:
# rename fields
df.rename(columns={'city_fuel_economy':'mpg_city'
                    ,'fuel_tank_volume':'fuel_capacity'
                    ,'highway_fuel_economy':'mpg_hwy'
                    ,'exterior_color':'color_exterior'
                    ,'interior_color':'color_interior'
                    ,'maximum_seating':'seats'
                    ,'year':'model_year'
                    ,'back_legroom':'legroom_back'
                    ,'front_legroom':'legroom_front'
                    ,'height':'dim_height'
                    ,'length':'dim_length'
                    ,'width':'dim_width'
                    ,'wheelbase':'dim_wheelbase'
                    ,'bed_length':'dim_bed'
                    ,'engine_displacement':'engine_disp'
                    ,'engine_cylinders':'engine_cyl'
                    ,'torque':'engine_torque'
                    ,'horsepower':'engine_hp'
                    ,'isCab':'is_taxi'
                    ,
                    }, inplace=True)

# sort columns
df = df.reindex(sorted(df.columns), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'city_fuel_economy':'mpg_city'


In [22]:
print(df.columns)

Index(['body_type', 'cabin', 'city', 'color_exterior', 'color_interior',
       'daysonmarket', 'dealer_zip', 'description', 'dim_bed', 'dim_height',
       'dim_length', 'dim_wheelbase', 'dim_width', 'engine_cyl', 'engine_disp',
       'engine_hp', 'engine_torque', 'engine_type', 'fleet', 'frame_damaged',
       'franchise_dealer', 'franchise_make', 'fuel_capacity', 'fuel_type',
       'has_accidents', 'is_cpo', 'is_new', 'is_oemcpo', 'is_taxi', 'latitude',
       'legroom_back', 'legroom_front', 'listed_date', 'listing_color',
       'longitude', 'major_options', 'make_name', 'mileage', 'model_name',
       'model_year', 'mpg_city', 'mpg_hwy', 'owner_count', 'power', 'price',
       'salvage', 'seats', 'seller_rating', 'theft_title', 'transmission',
       'transmission_display', 'trimId', 'trim_name', 'vin', 'wheel_system'],
      dtype='object')


In [23]:
df.sample(5).T

Unnamed: 0,1971040,366772,2676897,655819,1603068
body_type,Hatchback,SUV / Crossover,SUV / Crossover,Pickup Truck,Sedan
cabin,,,,,
city,Billings,Baltimore,Brownfield,North Canton,Decatur
color_exterior,CHARCOAL,Black,Summit White,GRAY,Celestial Silver Metallic
color_interior,Black,Jet Black,Shale,,
daysonmarket,16,1,49,18,127
dealer_zip,59101,21224,79316,44720,35601
description,"STOP BY PERFORMANCE AUTO SALES TODAY, YOUR BES...",TENT SALE EVENT ALL MONTH LONG . (125% OF KELL...,"Summary: **ONE OWNER**, **CLEAN AUTOCHECK**, A...",Family Owned and Operated We offer Easy financ...,"Local One Owner,Loaded,Navigation,Blindspot,Le..."
dim_bed,,,,76.3 in,
dim_height,58.4 in,66.3 in,69.9 in,74.7 in,56.9 in


In [24]:
# set VIN as index
df.set_index("vin", inplace=True)

In [25]:
# extract number from fields with units and other text
cols = ['legroom_back'
        ,'dim_bed'
        ,'legroom_front'
        ,'fuel_capacity'
        ,'dim_height'
        ,'dim_length'
        ,'seats'
        ,'dim_wheelbase'
        ,'dim_width'
        ]

for col in cols:
    first_ob = df[df[col].notna()][col].iloc[0]
    unit_loc = first_ob.find(' ') - len(first_ob)
    df[col] = df[col].str.slice(0,unit_loc).replace('',np.nan).astype(float)

# all dimensions are in inches


In [26]:
df[cols].dtypes

legroom_back     float64
dim_bed          float64
legroom_front    float64
fuel_capacity    float64
dim_height       float64
dim_length       float64
seats            float64
dim_wheelbase    float64
dim_width        float64
dtype: object

In [45]:
# extract engine cylinders
df['engine_cyl'] = df['engine_cyl'].str.replace(r'\D','',regex=True).astype(float)

In [51]:
df.memory_usage().sum() / 1024**2

341.8736515045166

In [52]:
df.to_parquet('data/cargurus_kaggle_clean.parquet')

---
---
---

In [19]:
# parse horsepower, torque, and rpms
# re.findall(r'\d+\,?\d*','355 hp @ 5,600 RPM')

In [None]:
# extract number of gears, transmission features
# df.groupby(['transmission','transmission_display']).size()

transmission  transmission_display              
A             1-Speed Automatic                       116
              10-Speed Automatic                        5
              2-Speed Automatic                         7
              3-Speed Automatic                        22
              4-Speed Automatic                      1069
              4-Speed Automatic Overdrive             284
              5-Speed Automatic                      1325
              5-Speed Automatic Overdrive             352
              6-Speed Automatic                      9788
              6-Speed Automatic Overdrive            1875
              7-Speed Automatic                      1019
              7-Speed Automatic Overdrive              35
              8-Speed Automatic                      4102
              8-Speed Automatic Overdrive             135
              9-Speed Automatic                      1387
              9-Speed Automatic Overdrive              39
              Automatic

In [38]:
# numeric
df.describe().round(decimals = 1).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
daysonmarket,50948.0,56.8,103.5,0.0,13.0,29.0,57.0,3573.0
dim_bed,6048.0,72.6,9.2,41.5,67.1,69.3,78.8,98.6
dim_height,48692.0,65.0,7.4,43.7,57.9,65.8,69.9,114.2
dim_length,48692.0,192.8,18.6,106.1,182.0,189.8,199.9,290.0
dim_wheelbase,48694.0,114.3,13.2,73.5,106.3,110.6,116.2,201.0
dim_width,48692.0,77.7,7.3,55.0,72.4,75.8,81.8,109.0
engine_cyl,49645.0,5.2,1.5,2.0,4.0,4.0,6.0,12.0
engine_disp,48830.0,3049.5,1307.9,700.0,2000.0,2500.0,3600.0,8400.0
engine_hp,48830.0,246.6,88.1,67.0,175.0,245.0,300.0,808.0
fuel_capacity,48669.0,18.7,5.2,1.9,15.3,18.0,21.1,52.0


In [39]:
# non-numeric
pd.set_option('display.max_colwidth', 25)
df.describe(include = ['object','category']).T

Unnamed: 0,count,unique,top,freq
bed,591,3,Short,367
body_type,50888,9,SUV / Crossover,21419
cabin,1308,4,Crew Cab,962
city,50948,3770,Houston,713
color_exterior,50948,3407,Black,3928
color_interior,50938,2490,Black,14313
dealer_zip,50948,6523,77477,198
description,50071,49525,All advertised prices...,15
engine_torque,44619,1295,"383 lb-ft @ 4,100 RPM",1520
engine_type,49645,31,I4,23174


In [27]:
# five example obs
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 500)

df.sample(3).T

vin,1J4GL58505W583249,5UXTS3C5XK0Z04391,1VWGT7A30HC072671
back_legroom,37.2,36.4,39.1
bed,,,
bed_length,,,
body_type,SUV / Crossover,SUV / Crossover,Sedan
cabin,,,
city,Hastings,Tuscaloosa,Brunswick
color_exterior,Bright Silver Metallic Clearcoat,Alpine White,PLATINUM GRAY METALLIC
color_interior,Brown (Dark Khaki/Light Graystone),Brown (Canberra Beige/Black),TITAN BLACK LEATHERETTE
daysonmarket,14,10,5
dealer_zip,68901,35405,44212


### Missing

In [28]:
# missing values
nulls = pd.DataFrame(data = {'count':df.isna().sum()})
nulls['pct'] = nulls['count'] / df.shape[0]
nulls.sort_values(by = 'count', ascending = False)

Unnamed: 0,count,pct
bed,50357,0.9884
cabin,49640,0.974327
is_oemcpo,46448,0.911675
is_cpo,44917,0.881624
bed_length,44900,0.881291
franchise_make,18962,0.372183
mpg_hwy,8157,0.160104
mpg_city,8157,0.160104
torque,6329,0.124225
power,5419,0.106363


In [None]:
# drop columns with > 80% missing values

# DON'T DROP BED LENGTH YET, CHECK IF NULL % FOR TRUCK ONLY

print(df.shape)
df.dropna(axis = 'columns', thresh = df.shape[0] * 0.8, inplace = True)
print(df.shape)

In [None]:
msno.heatmap(df)

In [None]:
# missingness correlations
temp = df.iloc[:, [i for i, n in enumerate(np.var(df.isnull(), axis='rows')) if n > 0]]
corr_mat = temp.isnull().corr()

corr_mat2 = corr_mat.unstack().reset_index()
corr_mat2.columns = ['var1','var2','corr']
corr_mat2['var_min'] = corr_mat2[['var1','var2']].min(axis=1)
corr_mat2['var_max'] = corr_mat2[['var1','var2']].max(axis=1)
corr_mat2.drop(columns=['var1','var2'], inplace=True)
corr_mat2.drop_duplicates(inplace=True)
corr_mat2 = corr_mat2[corr_mat2['var_min'] != corr_mat2['var_max']]
corr_mat2 = corr_mat2[corr_mat2['corr'] > 0.1]
corr_mat2.sort_values(by='corr', ascending=False)

#### Mileage
Mileage is an essential feature expected to have a large impact on the model. Better to drop than impute.

In [None]:
print(df.shape)
print(df['mileage'].isna().sum())
df = df[df['mileage'].notna()]
print(df.shape)

Mileage is mostly missing on new vehicles

### Augmentation

#### States

In [None]:
# add state
df['state'] = np.nan

for i in df.index:
    search = SearchEngine()
    zipcode = search.by_zipcode(df.loc[i,'dealer_zip'])
    try:
        df.loc[i,'state'] = zipcode.state
    except:
        pass
    else:
        df.loc[i,'state'] = zipcode.state

print('state not found:',df['state'].isna().sum())

In [None]:
print((df['state'].value_counts()).round(decimals = 2).head(10))

In [None]:
print((df['state'].value_counts())[['TX','OH','VA']].round(decimals = 2))
print((df['state'].value_counts()/df.shape[0])[['TX','OH','VA']].round(decimals = 2))
print((df['state'].value_counts()/df.shape[0])[['TX','OH','VA']].sum().round(decimals = 2))
print(f'{0.18*3000000:,}')