# TX Motor Vehicle Registrations

## Environment

In [1]:
# packages
import pandas as pd

## Data Load

In [2]:
# texas mvr data: read the prepared parquet extract into memory

df = pd.read_parquet('data/tx_mvr/tx_mvr_out.parquet')

# (rows, columns), then the row count with thousands separators
n_obs = df.shape[0]
print(df.shape)
print('df obs: ', format(n_obs, ',d'))

(29053057, 18)
df obs:  29,053,057


In [None]:
# preview the SALE_DATE conversion without modifying df
# raw values are wrapped in literal single quotes (e.g. "'2018-04-04'"), which
# '%Y-%m-%d' cannot parse, so strip them before parsing
pd.to_datetime(df['SALE_DATE'].str.strip("'"), format='%Y-%m-%d')

In [None]:
# convert SALE_DATE to datetime
# (`tx` / `mvr_sale_date` are not defined in this notebook; the loaded frame is
# `df` and its sale-date column is SALE_DATE)
sale_date = df['SALE_DATE']
if not pd.api.types.is_datetime64_any_dtype(sale_date):
    # raw values carry literal single quotes, e.g. "'2018-04-04'"; the guard
    # lets the cell be re-run after the column has already been converted
    sale_date = sale_date.str.replace("'", "", regex=False)
df['SALE_DATE'] = pd.to_datetime(sale_date, format='%Y-%m-%d')

In [None]:
# set dates dtype
# SALE_DATE is stored as text wrapped in single quotes (e.g. "'2018-04-04'"),
# which '%Y-%m-%d' cannot parse, so strip the quotes first. The string cleanup
# is skipped when the column is already datetime so the cell can be re-run.
sale_date = df['SALE_DATE']
if not pd.api.types.is_datetime64_any_dtype(sale_date):
    sale_date = sale_date.str.strip("'")
df['SALE_DATE'] = pd.to_datetime(sale_date, format='%Y-%m-%d')

# TTL_SIGNED_DATE is loaded as an integer of the form YYYYMMDD
df['TTL_SIGNED_DATE'] = pd.to_datetime(df['TTL_SIGNED_DATE'], format='%Y%m%d')


In [6]:
# column dtypes and memory footprint; show_counts=False leaves out the
# per-column non-null counts
df.info(show_counts=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29053057 entries, 0 to 29053056
Data columns (total 18 columns):
 #   Column            Dtype   
---  ------            -----   
 0   VIN               object  
 1   SALE_DATE         object  
 2   SALES_PRICE       float32 
 3   ODOMETER_BRAND    category
 4   ODOMETER_READING  object  
 5   DOCNO             object  
 6   VEHYEAR           int16   
 7   MAKE              category
 8   MODEL             category
 9   BODY_TYPE         category
 10  OWNERSHIP_CD      category
 11  TTL_SIGNED_DATE   int64   
 12  OWNER_ZIP         category
 13  OWNER_ZIP+4       category
 14  RENEW_ZIP         category
 15  RENEW_ZIP+4       category
 16  RECONDITION_CD    category
 17  SALVSTATECNTRY    category
dtypes: category(11), float32(1), int16(1), int64(1), object(4)
memory usage: 1.8+ GB


## Basic Data Info

In [7]:
# number of distinct values in each column
df.nunique()

VIN                 22726693
SALE_DATE               1487
SALES_PRICE          2604223
ODOMETER_BRAND             3
ODOMETER_READING      280750
DOCNO               29053057
VEHYEAR                  138
MAKE                   23254
MODEL                  21061
BODY_TYPE                445
OWNERSHIP_CD              28
TTL_SIGNED_DATE         9013
OWNER_ZIP              24157
OWNER_ZIP+4            10031
RENEW_ZIP               8003
RENEW_ZIP+4             9611
RECONDITION_CD             4
SALVSTATECNTRY           163
dtype: int64

In [20]:
# data sample
# fixed seed so the same five rows are shown on every run; transposed so each
# sampled row reads as a column
df.sample(5, random_state=42).T

Unnamed: 0,3571007,622989,3161969,6870996,4216081
VIN,1FAFP4048WF189346,1FTRX18L6XKA85447,WBA3B1G59FNT63405,1FTEW1CP8JKE65148,1GT49REY1NF312911
SALE_DATE,'2022-09-01','2018-02-22','2018-01-31','2018-11-26','2022-08-10'
SALES_PRICE,2200.0,1320.0,21000.0,38941.77,90859.95
ODOMETER_BRAND,,,A,A,A
ODOMETER_READING,EXEMPT,EXEMPT,21658,188,8
DOCNO,13325044803155511,14610143151152546,10130643129250101,1535443428250149,19930244781081750
VEHYEAR,1998,1999,2015,2018,2022
MAKE,FORD,FORD,BMW,FORD,GMC
MODEL,MUS,,320,F15,SIE
BODY_TYPE,CP,PK,4D,PK,PK


In [22]:
# numeric: summary statistics for the numeric columns, rounded to one decimal
numeric_summary = df.describe()
numeric_summary.round(1)

Unnamed: 0,SALES_PRICE,DOCNO,VEHYEAR,OWNERSHIP_CD,TTL_SIGNED_DATE,OWNER_ZIP+4,RENEW_ZIP,RENEW_ZIP+4,RECONDITION_CD
count,2546250.0,3319455.0,3319455.0,3243168.0,3319455.0,189105.0,145369.0,14242.0,216207.0
mean,19196.1,1.142992e+16,2010.9,4.3,20179027.9,4169.8,75998.6,4067.1,2.5
std,62560.9,7282293000000000.0,8.3,5.0,5007.3,2502.8,6479.5,2386.2,0.9
min,0.0,100043100000000.0,1882.0,1.0,19480317.0,-751.0,1702.0,-220.0,1.0
25%,3000.0,5744943000000000.0,2006.0,1.0,20180129.0,2114.0,75261.0,2210.0,2.0
50%,15355.7,1.013124e+16,2014.0,3.0,20180313.0,4141.0,77042.0,3929.5,2.0
75%,28686.0,1.783084e+16,2017.0,6.0,20180427.0,5852.0,78216.0,5667.0,4.0
max,85022151.0,2.973504e+16,2020.0,28.0,20180630.0,9999.0,99705.0,9800.0,4.0


In [23]:
# non-numeric
# count / unique / top / freq summary of the object and category columns
df.describe(include = ['object','category'])

Unnamed: 0,VIN,SALE_DATE,ODOMETER_BRAND,ODOMETER_READING,MAKE,MODEL,BODY_TYPE,OWNER_ZIP,SALVSTATECNTRY
count,3281777,3319455,1974988,3062742,3319455,2536368,3319453,3319455,59358
unique,3201901,155,3,182242,8744,11045,235,10680,83
top,1D7HA18N73J593074,'2018-04-04',A,EXEMPT,FORD,F15,4D,74134,LA
freq,4,31384,1953462,1087754,496104,78111,926699,44637,11072


In [None]:
# dates
# Summarises datetime-typed columns only. The df.info() listing earlier in the
# notebook shows SALE_DATE as object and TTL_SIGNED_DATE as int64, so this
# depends on the "set dates dtype" cell having been run first.
# NOTE(review): `datetime_is_numeric` appears to be accepted only by pandas 1.x —
# confirm against the pinned pandas version before upgrading.
df.describe(include = ['datetime'], datetime_is_numeric=True)

## Filter Used Autos and Trucks

In [24]:
# vehicle types: frequency of each BODY_TYPE code, missing values included
body_type_counts = df['BODY_TYPE'].value_counts(dropna=False)
print(body_type_counts)

4D    926699
LL    888061
PK    665774
VN    110805
UT    110728
       ...  
SO         1
MX         1
W4         1
6D         1
U          1
Name: BODY_TYPE, Length: 236, dtype: int64


In [None]:
# filter to auto/truck only
# Prints the row count, drops (in place) every row whose vehicle_type is not
# AUTO or TRUCK, then prints the remaining row count.
# NOTE(review): the df.info() listing earlier in this notebook shows no
# 'vehicle_type' column (the nearest is BODY_TYPE, whose codes look like
# 4D / LL / PK rather than AUTO / TRUCK), so this cell presumably raises
# KeyError until a vehicle-type field is derived or loaded — TODO confirm
# which column this filter should use.
print(f'all obs: {df.shape[0]:,d}')
df.drop(df[~df['vehicle_type'].isin(['AUTO','TRUCK'])].index, inplace=True)
print(f'auto/truck only obs: {df.shape[0]:,d}')

In [None]:
# new / used
# frequency of each new_used value, missing values included
# NOTE(review): the df.info() listing earlier in this notebook shows no
# 'new_used' column, so this presumably raises KeyError — TODO confirm where
# the new/used flag comes from.
print(df['new_used'].value_counts(dropna=False))

In [None]:
# filter to used only
# Prints the row count, drops (in place) every row flagged 'N' in new_used,
# then prints the remaining row count. Rows with a missing new_used value are
# kept, because the comparison with 'N' is False for them.
# NOTE(review): the df.info() listing earlier in this notebook shows no
# 'new_used' column, so this presumably raises KeyError — TODO confirm where
# the new/used flag comes from.
print(f'all obs: {df.shape[0]:,d}')
df.drop(df[df['new_used'] == 'N'].index, inplace=True)
print(f'used only obs: {df.shape[0]:,d}')


## VINs

Duplicate VINs still need to be investigated and resolved.

In [None]:
# row count vs. number of distinct VINs
# the column is named VIN (upper case) in this dataset; nunique() leaves missing
# VINs out of the distinct count instead of counting NaN as one more VIN
n_obs = df.shape[0]
n_unique_vins = df['VIN'].nunique()
print('obs: {0:,d}'.format(n_obs))
print('unique vins: {0:,d}'.format(n_unique_vins))

## Prices

The price data has some issues:
- Many missing prices, filled in with 0
- Some very high values

In [None]:
# price bucket counts
# the sale price column is SALES_PRICE in this dataset
price = df['SALES_PRICE']

n_total = price.shape[0]
print('total obs: ',f"{n_total:,d}")

# missing prices fall in none of the comparisons below, so report them separately
n_missing = price.isna().sum()
print('missing prices: ',f"{n_missing:,d}")

n_zero = (price < 0.1).sum()
print('0 prices: ',f"{n_zero:,d}")

n_high = (price > 10000).sum()
print('prices > 10,000: ',f"{n_high:,d}")

# inclusive bounds so that prices of exactly 0.1 or 10,000 are counted in a
# bucket; missing + 0 + between + high now adds up to the total
n_between = ((price >= 0.1) & (price <= 10000)).sum()
print('prices between: ',f"{n_between:,d}")

In [None]:
# hist prices between $0.1 and $10,000 (bounds inclusive)
# the sale price column is SALES_PRICE; select rows and column in one .loc call
# rather than chaining two indexing steps
price = df['SALES_PRICE']
in_range = (price >= 0.1) & (price <= 10000)
ax = df.loc[in_range, 'SALES_PRICE'].hist()
ax.set(title='Sale prices between $0.1 and $10,000',
       xlabel='SALES_PRICE', ylabel='registrations');

### High Prices

In [None]:
# let pandas display up to 100 rows before truncating the output
pd.options.display.max_rows = 100

In [None]:
# 50 highest sale prices (the price column is SALES_PRICE in this dataset)
# nlargest picks the top rows without sorting the whole frame; reset_index keeps
# the original row label as an 'index' column
df.nlargest(50, 'SALES_PRICE').reset_index()

In [None]:
# between $1m and $10m (both bounds exclusive)
# the sale price column is SALES_PRICE; select rows and column in one .loc call
# rather than chaining two indexing steps
price = df['SALES_PRICE']
condition_low = price > 1_000_000
condition_high = price < 10_000_000
ax = df.loc[condition_low & condition_high, 'SALES_PRICE'].hist()
ax.set(title='Sale prices between $1m and $10m',
       xlabel='SALES_PRICE', ylabel='registrations');

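I suspect some counties are recording prices without the decimal point, so the cents are read as two extra digits:

- a recorded 1,000,000 would really be $10,000.00
- a recorded 10,000,000 would really be $100,000.00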

### Distributions by County

In [None]:
# mean prices by county, highest first
# the sale price column is SALES_PRICE in this dataset
# NOTE(review): the df.info() listing earlier in this notebook shows no 'county'
# column, so the groupby key still has to be derived or loaded (presumably from
# the owner ZIP) — TODO confirm before running.

mean_price_by_county = (
    df.groupby('county')['SALES_PRICE']
    .mean()
    .sort_values(ascending=False)
)

# iterate (label, value) pairs directly: nums[i] is a label lookup, not a
# positional one, whenever the county index is integer-typed
for county, mean_price in mean_price_by_county.items():
    print("{}: ${:0,.2f}".format(county, mean_price))

In [None]:
# median prices by county, highest first
# the sale price column is SALES_PRICE in this dataset
# NOTE(review): the df.info() listing earlier in this notebook shows no 'county'
# column, so the groupby key still has to be derived or loaded (presumably from
# the owner ZIP) — TODO confirm before running.

median_price_by_county = (
    df.groupby('county')['SALES_PRICE']
    .median()
    .sort_values(ascending=False)
)

# iterate (label, value) pairs directly: nums[i] is a label lookup, not a
# positional one, whenever the county index is integer-typed
for county, median_price in median_price_by_county.items():
    print("{}: ${:0,.2f}".format(county, median_price))

In [None]:
# per-county summary statistics of the sale price
# the sale price column is SALES_PRICE in this dataset; selecting it before
# iterating means each group is just the price Series, not a full-width frame
# NOTE(review): the df.info() listing earlier in this notebook shows no 'county'
# column, so the groupby key still has to be derived or loaded — TODO confirm.
groups = df.groupby('county')['SALES_PRICE']

for name, prices in groups:
    print(name)
    print(prices.describe().round(decimals = 1))

## The Rest

In [None]:
# null counts: missing values per column, as a count and as a share of all rows,
# with the most-missing columns first
null_counts = df.isna().sum()
null_summary = pd.DataFrame({
    'count': null_counts,
    'percent': (null_counts / df.shape[0]).round(decimals=2),
})
null_summary.sort_values(by='count', ascending=False)

In [None]:
# 0 or filler counts