# Test Join
We have enough to look at what we're going to have. Let's take a look!

In [2]:
import pandas as pd
import pyarrow as pa

In [3]:
# increase pandas display row limit
pd.set_option('display.max_rows', 100)

## Listings

In [20]:
mc = pd.read_parquet('part-00272-bbdecf64-de8c-4648-b96c-e44fb6cd17e1-c000.snappy.parquet', engine='pyarrow')
print(mc.shape)

(13819, 71)


For simplicity sake, I'm dropping additional registrations for the same VIN. In practice will need to do some kind of date precedent logic.

In [21]:
# deduplicate mc_tx by vin and keep the first record
mc = mc.drop_duplicates(subset='vin', keep='first')
print(mc.shape)


(13666, 71)


In [16]:
mc['state'].value_counts()

TX    8072
OH    3379
TN    2214
Name: state, dtype: int64

In [8]:
mc.sample(5).T

Unnamed: 0,5427,3594,8672,3492,5731
id,JM3KFBDM0J0392857-616741f4-f644,5TDYK3DC7FS545539-d4cc1f84-ad8e-418e-b164-8602...,WBA8E9C55GK646680-bf551e16-647e-44af-93d0-8f79...,5TDGZRBH5MS154153-5983ec16-b541,JN8AT2MT3HW399124-fa835b55-9439-46b2-8d0e-79d2...
vin,JM3KFBDM0J0392857,5TDYK3DC7FS545539,WBA8E9C55GK646680,5TDGZRBH5MS154153,JN8AT2MT3HW399124
heading,2018 Mazda CX-5 Grand Touring SUV,2015 TOYOTA SIENNA XLE PREM 8-PASS SUNROOF NAV...,2016 BMW 3 Series 328i Sedan,Certified Used 2021 Toyota Highlander XLE,Pre-Owned 2017 Nissan Rogue SV
price,22249.0,27730.0,20991.0,44669.0,20000.0
msrp,22249.0,,20991.0,44995.0,20000.0
miles,40931.0,26112.0,38538.0,11580.0,23156.0
stock_no,20750PR,545539,GK646680,B-356,NP2411
year,2018.0,2015.0,2016.0,2021.0,2017.0
make,Mazda,Toyota,BMW,Toyota,Nissan
model,CX-5,Sienna,3 Series,Highlander,Rogue


## Texas

In [6]:
tx = pd.read_parquet('~/Projects/used_car_pricing/data/tx_mvr/tx_mvr_out.parquet', engine='pyarrow')
print(tx.shape)

In [11]:
tx.sample(5).T

Unnamed: 0,14587549,6907488,26004201,23129461,2554148
VIN,3KPF24AD6KE094957,1FTFW1CF8CKD85461,1C6RR6TT2KS509317,1FTFW1CT4DFD10529,5TFJU4GN8DX048182
SALE_DATE,'2019-08-07','2018-10-23','2022-04-14','2021-08-06','2018-02-20'
SALES_PRICE,18535.0,10000.0,,,23682.929688
ODOMETER_BRAND,A,A,A,A,A
ODOMETER_READING,99,178314,73478,152225,65221
DOCNO,05731543682250485,15232643394104017,20820044663115943,10149444412075223,22125043149140930
VEHYEAR,2019,2012,2019,2013,2013
MAKE,KIA,FORD,RAM,FORD,TOYT
MODEL,FOR,F15,150,F15,TAC
BODY_TYPE,4D,PK,PK,PK,PK


In [18]:
# dedupe tx by vin and keep the first record
tx = tx.drop_duplicates(subset='VIN', keep='first')
print(tx.shape)

In [21]:
# join mc_tx and tx on vin = VIN
tx_match = mc.merge(tx, left_on='vin', right_on='VIN', how='inner')


In [23]:
# match rate
tx_match.shape[0] / mc[mc['state'] == 'TX'].shape[0]

0.8002973240832507

In [33]:
# save to parquet
tx_match.to_parquet('tx_match.parquet', engine='pyarrow')

## Ohio

In [6]:
oh = pd.read_parquet('~/Projects/used_car_pricing/data/oh_mvr/oh_mvr_out.parquet', engine='pyarrow')
print(oh.shape)

(28591523, 24)


In [7]:
oh.sample(5).T

Unnamed: 0,10095110,19745438,28092735,18340745,25077100
TitleNumber,2800818475,4806216486,906111381,3108875276,8100493555
IssueDate,2018-11-19 00:00:00,2022-01-11 00:00:00,2022-10-14 00:00:00,2020-02-06 00:00:00,2022-07-07 00:00:00
TitleStatus,ACTIVE,ACTIVE,INACTIVE,ACTIVE,INACTIVE
TitleType,ORIGINAL,ORIGINAL,ORIGINAL,ORIGINAL,ORIGINAL
VIN,1C4PJMCSXFW521538,1GR1A0621LW174726,1GCWGFFAXB1116313,1N4BA41E84C895433,1FT7W2B69KED22748
Year,2015,2020,2011,2004,2019
Make,JEEP,GREAT DANE TRAI,CHEVROLET,NISSAN,FORD
Model,CHEROKEE,---,--,MAXIMA,F250
BodyType,STATION WAGON,VAN,VAN,FOUR DOOR,PICKUP TRUCK
NewOrUsed,Used,Used,Used,Used,Used


Ohio is unique in that they require registration of dealer-to-dealer transactions. Here I'm dropping dealer-to-dealer because it's not a transaction we're interested in, and it's not comparable to the data we have from the other states.

In [11]:
# filter out rows where both OwnerType = 'DEALER' and PrevOwnerType = 'DEALER'
oh = oh[~((oh['OwnerType'] == 'DEALER') & (oh['PrevOwnerType'] == 'DEALER'))]
print(oh.shape)

(25325711, 24)


In [13]:
# dedupe by vin and keep the first record
oh = oh.drop_duplicates(subset='VIN', keep='first')
print(oh.shape)

(12837148, 24)


In [17]:
# join on vin
oh_match = mc.merge(oh, left_on='vin', right_on='VIN', how='inner')

In [18]:
# match rate
oh_match.shape[0] / mc[mc['state'] == 'OH'].shape[0]

0.9446581828943474

In [19]:
# save to parquet
oh_match.to_parquet('oh_match.parquet', engine='pyarrow')

## Tennessee

In [22]:
tn = pd.read_parquet('~/Projects/used_car_pricing/data/tn_mvr/tn_mvr.parquet', engine='pyarrow')
print(tn.shape)

(9831774, 13)


In [23]:
tn.sample(5).T

Unnamed: 0,2621566,6912672,2206334,681427,8250810
vin,1JJV532D1NL306341,3MZBN1U75JM190168,1J4GK48K46W282715,1C6RR6MT5FS629598,VBKEXG400LM293799
price,0.0,17821.0,0.0,0.0,0.0
odometer_type,1,0,1,0,0
mileage,0,4,0,68401,1
county,Davidson,Lawrence,State,State,Fentress
zip,37209,38469,37355,37043,38553
model_year,2022,2018,2006,2015,2020
make,WANC,MAZD,JEEP,RAM,KTM
model,DVC,MZ3,LBY,150,VBK
vehicle_type,FREIGHT/SEMI TRAILER,AUTO,AUTO,TRUCK,MOTORCYCLE


In [25]:
tn.columns = ['vin', 'price_tn', 'odometer_type_tn', 'mileage_tn', 'county_tn', 'zip_tn',
       'model_year_tn', 'make_tn', 'model_tn', 'vehicle_type_tn', 'new_used_tn',
       'title_issue_date_tn', 'purchase_date_tn']

In [26]:
# dedupe by vin and keep the first record
tn = tn.drop_duplicates(subset='vin', keep='first')
print(tn.shape)

(7558173, 13)


In [27]:
# join on vin
tn_match = mc.merge(tn, on='vin', how='inner')

In [28]:
# match rate
tn_match.shape[0] / mc[mc['state'] == 'TN'].shape[0]

0.6296296296296297

In [29]:
# save to parquet
tn_match.to_parquet('tn_match.parquet', engine='pyarrow')