# Used cars listings for US & Canada
- https://www.kaggle.com/datasets/rupeshraundal/marketcheck-automotive-data-us-canada
- 2.5m obs, 21 vars. All changes. 65k dealer websites / marketcheck.com
- us-dealers-used.csv

In [4]:
# packages
import numpy as np
import pandas as pd

In [7]:
# data
df = pd.read_csv('data/us-dealers-used.csv',low_memory=False)

## EDA

In [8]:
df.dtypes

id               object
vin              object
price           float64
miles           float64
stock_no         object
year            float64
make             object
model            object
trim             object
body_type        object
vehicle_type     object
drivetrain       object
transmission     object
fuel_type        object
engine_size     float64
engine_block     object
seller_name      object
street           object
city             object
state            object
zip              object
dtype: object

### Confirmed there are vins with multiple observations and changing prices

In [9]:
df['vin'].value_counts()[df['vin'].value_counts() == 5]

KNDPM3ACXL7644369    5
KNAGN4A76B5122965    5
1GNSKBKC8KR398196    5
3C4NJDBB1JT162247    5
4T1BF1FK8DU284879    5
                    ..
5TFAW5F15JX722668    5
1C4RJFJT1JC289602    5
3VW6T7AU4LM005115    5
3GNCJLSB7JL401826    5
SCBEC9ZA6EC090986    5
Name: vin, Length: 67807, dtype: int64

In [10]:
df[df['vin'] == '1GNSKBKC8KR398196']

Unnamed: 0,id,vin,price,miles,stock_no,year,make,model,trim,body_type,...,drivetrain,transmission,fuel_type,engine_size,engine_block,seller_name,street,city,state,zip
1590347,4896a507-95b3,1GNSKBKC8KR398196,50995.0,37941.0,M24970,2019.0,Chevrolet,Tahoe,LT,SUV,...,4WD,Automatic,Unleaded,5.3,V,m j mcguire company,520 3Rd Avenue Southwest,Rugby,ND,58368
3358313,88440f82-9422,1GNSKBKC8KR398196,50995.0,37941.0,M24970,2019.0,Chevrolet,Tahoe,LT,SUV,...,4WD,Automatic,Unleaded,5.3,V,don bessette hyundai,1715 North Broadway,Minot,ND,58703
3365982,72d58a40-2474,1GNSKBKC8KR398196,50995.0,37941.0,M24970,2019.0,Chevrolet,Tahoe,LT,SUV,...,4WD,Automatic,Unleaded,5.3,V,don bessette hyundai,1715 North Broadway,Minot,ND,58703
3367732,7177c060-f589,1GNSKBKC8KR398196,50995.0,37941.0,M24970,2019.0,Chevrolet,Tahoe,LT,SUV,...,4WD,Automatic,Unleaded,5.3,V,bessette motors inc,395 1St Street North,Carrington,ND,58421
3369506,8dcc3f81-38d9,1GNSKBKC8KR398196,50995.0,37941.0,M24970,2019.0,Chevrolet,Tahoe,LT,SUV,...,4WD,Automatic,Unleaded,5.3,V,m.j. mcguire co,520 3Rd Ave Sw,Rugby,ND,58368


### Get an idea of how many have multiple useful observations
- 5.7m obs
- 1.9m unique vins

In [11]:
print('df shape',df.shape)
print('unique vins',df['vin'].unique().shape)

df shape (7104304, 21)
unique vins (2387394,)


In [12]:
# vins with multiple obs
multi_obs_index = df['vin'].duplicated()

In [13]:
# vins with multiple obs
multi_obs_vins = df[multi_obs_index]['vin'].unique()

# number of vins with multiple obs
multi_obs_vins.shape

(1083488,)

In [14]:
# unique prices by vin (i.e. nunique > 1 is price change)
df[multi_obs_index].groupby('vin')['price'].nunique()

vin
1111111U5GW174556    1
12345657891111111    1
137FA57343E206288    1
137FA84321E194130    1
137FA84362E198019    1
                    ..
ZPBUA1ZLXKLA04410    1
ZPBUA1ZLXKLA04973    1
ZPBUA1ZLXMLA11845    1
ZPBUA1ZLXMLA12560    1
ZPBUA1ZLXMLA12705    1
Name: price, Length: 1083488, dtype: int64

In [15]:
# bool for vins with a price change
price_change_mask = df[multi_obs_index].groupby('vin')['price'].nunique() > 1

In [16]:
# vins with price changes
price_change_vins = multi_obs_vins[price_change_mask]
price_change_vins.shape

(80551,)

In [17]:
price_change_vins

array(['5YMKT6C54J0Y83619', 'WBA7U2C00LGJ59393', 'WP0AA2A99FS106664', ...,
       '1FT7W2BT3LEE03123', '1FT7W2BT3LED02423', '1FT7W2BT9KEC71757'],
      dtype=object)

In [18]:
price_change_vins_history = df[df['vin'].isin(price_change_vins)].sort_values(by = ['vin','id'])

In [19]:
# average number of price changers per vin with price changes
(price_change_vins_history.shape[0] - price_change_vins.shape[0]) / price_change_vins.shape[0]

4.154324589390573

In [20]:
price_change_vins_history[price_change_vins_history['vin'] == '5YMKT6C54J0Y83619']

Unnamed: 0,id,vin,price,miles,stock_no,year,make,model,trim,body_type,...,drivetrain,transmission,fuel_type,engine_size,engine_block,seller_name,street,city,state,zip
2865,07705da5-3ff9,5YMKT6C54J0Y83619,75996.0,23858.0,P2410,2018.0,BMW,X5 M,Base,SUV,...,4WD,Automatic,Premium Unleaded,4.4,V,honda of newnan,391 Newnan Crossing Bypass,Newnan,GA,30265
5362188,0a22b77c-e3ee,5YMKT6C54J0Y83619,75996.0,23858.0,P2410,2018.0,BMW,X5 M,Base,SUV,...,4WD,Automatic,Premium Unleaded,4.4,V,darrell waltrip buick gmc,1440 Murfreesboro Road,Franklin,TN,37067
34446,0df1e912-38cd,5YMKT6C54J0Y83619,75996.0,23858.0,P2410,2018.0,BMW,X5 M,Base,SUV,...,4WD,Automatic,Premium Unleaded,4.4,V,hendrick motors of charlotte,5201 E. Independence Blvd,Charlotte,NC,28212
1134,0e1e49b4-5778,5YMKT6C54J0Y83619,75996.0,23858.0,P2410,2018.0,BMW,X5 M,Base,SUV,...,4WD,Automatic,Premium Unleaded,4.4,V,east bay mini,4340 Rosewood Dr,Pleasanton,CA,94588
1803274,0f0e604a-6cdb,5YMKT6C54J0Y83619,75996.0,23858.0,P2410,2018.0,BMW,X5 M,Base,SUV,...,4WD,Automatic,Premium Unleaded,4.4,V,hendrick automotive group,6030 E Independence Blvd,Charlotte,NC,28212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31516,f83db5d5-a943,5YMKT6C54J0Y83619,75996.0,23858.0,P2410,2018.0,BMW,X5 M,Base,SUV,...,4WD,Automatic,Premium Unleaded,4.4,V,hendrick kia of concord,7550 Hendrick Auto Plaza Nw,Concord,NC,28027
1781270,f87ec3c0-4655,5YMKT6C54J0Y83619,75996.0,23858.0,P2410,2018.0,BMW,X5 M,Base,SUV,...,4WD,Automatic,Premium Unleaded,4.4,V,mall of georgia mazda,3546 Ga-20,Buford,GA,30519
1777394,fad31122-9530,5YMKT6C54J0Y83619,76387.0,23786.0,P2410,2018.0,BMW,X5 M,Base,SUV,...,4WD,Automatic,Premium Unleaded,4.4,V,rick hendrick jeep chrysler dodge ram fiat,8333 Rivers Ave,Charleston,SC,29406
3571277,faf5851d-7921,5YMKT6C54J0Y83619,75996.0,23858.0,P2410,2018.0,BMW,X5 M,Base,SUV,...,4WD,Automatic,Premium Unleaded,4.4,V,rick hendrick dodge chrysler jeep ram,1468 Savannah Hwy,Charleston,SC,29407


## Texas

In [21]:
sum(df['state'] == 'TX')

542569

In [22]:
texas = df[df['state'] == 'TX']

In [23]:
print('df shape',texas.shape)
print('unique vins',texas['vin'].unique().shape)

df shape (542569, 21)
unique vins (255667,)
