# TX Motor Vehicle Registrations

## Environment

In [7]:
# packages
import pandas as pd

## Data Load

In [18]:
# Texas MVR extract: read the zipped CSV in one pass, skipping malformed lines
df = pd.read_csv(
    'data/tx_mvr/PIR - 296985-1.zip',
    compression='zip',
    low_memory=False,
    on_bad_lines='skip',
)

# (rows, columns), then the row count with thousands separators
print(df.shape)
print('df obs: ',f"{len(df):,d}")

(3319455, 18)
df obs:  3,319,455


In [None]:
# set dates dtype
# SALE_DATE values carry literal single quotes in the raw file (e.g. '2018-02-23'),
# which the strict %Y-%m-%d format cannot parse, so strip them first. The dtype
# guard keeps the cell re-runnable once the column is already datetime.
sale_date = df['SALE_DATE']
if not pd.api.types.is_datetime64_any_dtype(sale_date):
    sale_date = sale_date.str.strip("'")
df['SALE_DATE'] = pd.to_datetime(sale_date, format='%Y-%m-%d')

# TTL_SIGNED_DATE is loaded as an integer such as 20180129 (YYYYMMDD)
df['TTL_SIGNED_DATE'] = pd.to_datetime(df['TTL_SIGNED_DATE'],format='%Y%m%d')


In [16]:
# dtype of every column plus total memory usage; per-column non-null counts are suppressed
df.info(show_counts=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3319455 entries, 0 to 3319454
Data columns (total 18 columns):
 #   Column            Dtype         
---  ------            -----         
 0   VIN               object        
 1   SALE_DATE         datetime64[ns]
 2   SALES_PRICE       float64       
 3   ODOMETER_BRAND    object        
 4   ODOMETER_READING  object        
 5   DOCNO             int64         
 6   VEHYEAR           int64         
 7   MAKE              object        
 8   MODEL             object        
 9   BODY_TYPE         object        
 10  OWNERSHIP_CD      float64       
 11  TTL_SIGNED_DATE   int64         
 12  OWNER_ZIP         object        
 13  OWNER_ZIP+4       float64       
 14  RENEW_ZIP         float64       
 15  RENEW_ZIP+4       float64       
 16  RECONDITION_CD    float64       
 17  SALVSTATECNTRY    object        
dtypes: datetime64[ns](1), float64(6), int64(3), object(8)
memory usage: 455.9+ MB


## Basic Data Info

In [10]:
# number of distinct values in each column
df.nunique()

VIN                 3201901
SALE_DATE               155
SALES_PRICE          478552
ODOMETER_BRAND            3
ODOMETER_READING     182242
DOCNO               3319455
VEHYEAR                 119
MAKE                   8744
MODEL                 11045
BODY_TYPE               235
OWNERSHIP_CD             25
TTL_SIGNED_DATE        3579
OWNER_ZIP             10680
OWNER_ZIP+4            9874
RENEW_ZIP              3221
RENEW_ZIP+4            6099
RECONDITION_CD            4
SALVSTATECNTRY           83
dtype: int64

In [21]:
# data sample: five randomly chosen rows, transposed so each record reads as a column
# (no random_state is passed, so the rows differ between runs)
df.sample(5).T

Unnamed: 0,1211736,1723784,1614003,1091959,290963
VIN,1J8GS48K97C572658,2T3ZFREV5JW446916,2GKFLUE5XC6391607,1HD1CZ318BC441766,1FAFP56U16A136207
SALE_DATE,'2018-02-23','2018-03-13','2018-06-11','2018-03-22','2018-04-11'
SALES_PRICE,,24840.52,,3960.0,1000.0
ODOMETER_BRAND,,A,N,A,
ODOMETER_READING,EXEMPT,4,73879,1466,EXEMPT
DOCNO,10160643152090556,10131243170250078,10820743260141046,330143179155809,8420243199111823
VEHYEAR,2007,2018,2012,2011,2006
MAKE,JEEP,TOYT,GMC,HD,FORD
MODEL,GC,RAV,TER,,TAU
BODY_TYPE,LL,LL,LL,MC,4D


In [22]:
# summary statistics for the numeric columns, rounded to one decimal place
numeric_summary = df.describe()
numeric_summary.round(1)

Unnamed: 0,SALES_PRICE,DOCNO,VEHYEAR,OWNERSHIP_CD,TTL_SIGNED_DATE,OWNER_ZIP+4,RENEW_ZIP,RENEW_ZIP+4,RECONDITION_CD
count,2546250.0,3319455.0,3319455.0,3243168.0,3319455.0,189105.0,145369.0,14242.0,216207.0
mean,19196.1,1.142992e+16,2010.9,4.3,20179027.9,4169.8,75998.6,4067.1,2.5
std,62560.9,7282293000000000.0,8.3,5.0,5007.3,2502.8,6479.5,2386.2,0.9
min,0.0,100043100000000.0,1882.0,1.0,19480317.0,-751.0,1702.0,-220.0,1.0
25%,3000.0,5744943000000000.0,2006.0,1.0,20180129.0,2114.0,75261.0,2210.0,2.0
50%,15355.7,1.013124e+16,2014.0,3.0,20180313.0,4141.0,77042.0,3929.5,2.0
75%,28686.0,1.783084e+16,2017.0,6.0,20180427.0,5852.0,78216.0,5667.0,4.0
max,85022151.0,2.973504e+16,2020.0,28.0,20180630.0,9999.0,99705.0,9800.0,4.0


In [23]:
# count / unique / top / freq for the text-like columns
text_like_dtypes = ['object', 'category']
df.describe(include=text_like_dtypes)

Unnamed: 0,VIN,SALE_DATE,ODOMETER_BRAND,ODOMETER_READING,MAKE,MODEL,BODY_TYPE,OWNER_ZIP,SALVSTATECNTRY
count,3281777,3319455,1974988,3062742,3319455,2536368,3319453,3319455,59358
unique,3201901,155,3,182242,8744,11045,235,10680,83
top,1D7HA18N73J593074,'2018-04-04',A,EXEMPT,FORD,F15,4D,74134,LA
freq,4,31384,1953462,1087754,496104,78111,926699,44637,11072


In [None]:
# dates: summary of the datetime columns only
# NOTE(review): `datetime_is_numeric` is only accepted by pandas 1.1–1.5; it was
# removed in pandas 2.0, where this call raises TypeError — confirm the pinned version.
df.describe(include = ['datetime'], datetime_is_numeric=True)

## Filter Used Autos and Trucks

In [24]:
# frequency of each body type code, with missing values counted as their own bucket
body_type_counts = df['BODY_TYPE'].value_counts(dropna=False)
print(body_type_counts)

4D    926699
LL    888061
PK    665774
VN    110805
UT    110728
       ...  
SO         1
MX         1
W4         1
6D         1
U          1
Name: BODY_TYPE, Length: 236, dtype: int64


In [None]:
# keep autos and trucks only; every other row is dropped from df in place
# NOTE(review): 'vehicle_type' is not among the columns listed by df.info() earlier — confirm it exists
print(f'all obs: {len(df):,d}')
not_auto_or_truck = ~df['vehicle_type'].isin(['AUTO','TRUCK'])
df.drop(index=df.loc[not_auto_or_truck].index, inplace=True)
print(f'auto/truck only obs: {len(df):,d}')

In [None]:
# how many records are flagged new vs. used (missing values counted too)
new_used_counts = df['new_used'].value_counts(dropna=False)
print(new_used_counts)

In [None]:
# keep used vehicles only: rows flagged 'N' are dropped from df in place
print(f'all obs: {len(df):,d}')
is_new = df['new_used'].eq('N')
df.drop(index=df.loc[is_new].index, inplace=True)
print(f'used only obs: {len(df):,d}')


## VINs

Duplicate VINs still need to be investigated and resolved.

In [None]:
# compare the row count with the number of distinct VIN values
distinct_vins = df['vin'].unique()
print(f'obs: {len(df):,d}')
print(f'unique vins: {len(distinct_vins):,d}')

## Join with Other Datasets

### CarGurus Scrape

In [None]:
# NOTE(review): pandas is already imported in the Environment cell; this repeat is redundant.
import pandas as pd
# read the CarGurus CSV lazily, 1,000 rows per chunk
iter_csv = pd.read_csv('data/cargurus.csv', iterator=True, chunksize=1000)
# keep only the rows of each chunk where 'field' exceeds `constant`, then concatenate the chunks
# NOTE(review): `constant` is not defined anywhere in the visible notebook and 'field' looks
# like a placeholder column name — confirm the intended filter before running.
# NOTE(review): this rebinds `df`, replacing the TX registration frame loaded earlier.
df = pd.concat([chunk[chunk['field'] > constant] for chunk in iter_csv])

### MarketCheck

In [None]:
# marketcheck listings data: load only the needed columns, each with an explicit dtype
mc_dtypes = {
    'vin': 'string',
    'year': 'Int16',
    'state': 'category',
    'make': 'category',
    'model': 'string',
    'miles': 'Int32',
    'price': 'Int32',
}
mc = pd.read_csv(
    'data/marketcheck_kaggle.csv',
    usecols=list(mc_dtypes),
    dtype=mc_dtypes,
    low_memory=False,
)

In [None]:
# column dtypes, non-null counts and memory usage of the MarketCheck frame
mc.info()

In [None]:
# number of distinct values in each MarketCheck column
mc.nunique()

In [None]:
# Left-join MarketCheck listings onto df by VIN.
# DataFrame.join matches `on` against the *index* of the other frame, so mc must
# be indexed by VIN rather than by its default RangeIndex. drop=False keeps mc's
# own 'vin' column, which comes through as 'vin_mc' and is non-null only for rows
# that found a MarketCheck match.
df2 = df.join(mc.set_index('vin', drop=False), on='vin', how='left', rsuffix='_mc')

In [None]:
# rows where the join found a MarketCheck record
has_mc_match = df2['vin_mc'].notna()
df2[has_mc_match]

## Prices

The price data has some issues:
- Many missing prices are filled in with 0
- Some values are implausibly high

In [None]:
# tally observations overall and in each price band, then print them in order
price_col = df['price']
price_counts = [
    ('total obs: ', df.shape[0]),
    ('0 prices: ', (price_col < 0.1).sum()),
    ('prices > 10,000: ', (price_col > 10000).sum()),
    ('prices between: ', ((price_col > 0.1) & (price_col < 10000)).sum()),
]

for label, num in price_counts:
    print(label, f"{num:,d}")

In [None]:
# histogram of prices strictly between $0.1 and $10,000
above_zero = df['price'] > 0.1
below_cap = df['price'] < 10000
df.loc[above_zero & below_cap, 'price'].hist()

### High Prices

In [None]:
# let pandas display up to 100 rows before truncating a frame
pd.set_option('display.max_rows', 100)

In [None]:
# the 50 highest-priced records; the original row labels are kept as an 'index' column
(
    df
    .sort_values(by='price', ascending=False)
    .iloc[:50]
    .reset_index()
)

In [None]:
# histogram of prices strictly between $1m and $10m
condition_low = df['price'].gt(1000000)
condition_high = df['price'].lt(10000000)
df.loc[condition_low & condition_high, 'price'].hist()

I suspect some counties are recording prices without the decimal point, so the cents show up as two extra zeros:

- 1m (1,000,000) = $10,000.00
- 10m (10,000,000) = $100,000.00

### Distributions by County

In [None]:
# mean prices by county, highest first

nums = df.groupby('county')['price'].mean().sort_values(ascending=False)
index = nums.index

# Iterate (label, value) pairs instead of nums[i]: an integer key on a Series is
# looked up by label when the index is integer-typed (pairing the wrong county
# with the value), and the positional fallback on other indexes is deprecated.
for county, mean_price in nums.items():
    print("{}: ${:0,.2f}".format(county, mean_price))

In [None]:
# median prices by county, highest first

nums = df.groupby('county')['price'].median().sort_values(ascending=False)
index = nums.index

# Iterate (label, value) pairs instead of nums[i]: an integer key on a Series is
# looked up by label when the index is integer-typed (pairing the wrong county
# with the value), and the positional fallback on other indexes is deprecated.
for county, median_price in nums.items():
    print("{}: ${:0,.2f}".format(county, median_price))

In [None]:
# print the price summary statistics for every county, one county at a time
groups = df.groupby('county')

for name, county_prices in groups['price']:
    summary = county_prices.describe().round(1)
    print(name)
    print(summary)

## The Rest

In [None]:
# nulls per column: absolute count and share of all rows, most-missing first
null_count = df.isna().sum()
null_share = (null_count / df.shape[0]).round(decimals=2)
pd.DataFrame({'count': null_count, 'percent': null_share}).sort_values(by='count', ascending=False)

In [None]:
# 0 or filler counts