In [1]:
import pandas as pd
import pyarrow
import fastparquet as fp
from glob import glob

In [2]:
def print_shape(df):
    print(f"Rows: {df.shape[0]:,} \nColumns: {df.shape[1]:,}")

In [3]:
def keep_latest(df, dedupe_by_cols, sort_by_cols):
    """Keep last record for each unique combination of dupe_cols, ordering by sort_cols"""
    df = df.sort_values(sort_by_cols).drop_duplicates(dedupe_by_cols, keep="last")
    return df

## Listings

In [4]:
# listing files
listings_dir = "/data/p_dsi/capstone_projects/shea/3_final/"
file_pattern = "*_dataset_structured.pkl"

In [5]:
listings_files = glob(listings_dir + file_pattern)

In [6]:
# read in listings
dataframes = []
for file in listings_files:
    df = pd.read_pickle(file)
    dataframes.append(df)

listings = pd.concat(dataframes)
print_shape(listings)

Rows: 18,333,814 
Columns: 36


In [7]:
listings.columns

Index(['vin', 'price', 'miles', 'year', 'make', 'model', 'trim',
       'vehicle_type', 'body_type', 'body_subtype', 'drivetrain', 'fuel_type',
       'engine_block', 'engine_size', 'transmission', 'doors', 'cylinders',
       'city_mpg', 'highway_mpg', 'base_exterior_color', 'base_interior_color',
       'is_certified', 'is_transfer', 'scraped_at', 'status_date',
       'first_scraped_at', 'city', 'zip', 'latitude', 'longitude',
       'dealer_type', 'currency_indicator', 'miles_indicator',
       'photo_links_count', 'hvf_standard', 'hvf_optional'],
      dtype='object')

## Registrations

In [8]:
# registrations path
registration_dir = "/data/p_dsi/capstone_projects/shea/registrations/"

### Texas

In [9]:
# texas
tx_file = registration_dir + "tx_mvr_out.parquet"

registrations_tx = fp.ParquetFile(tx_file).to_pandas(
    columns=["VIN", "SALE_DATE", "SALES_PRICE","MAKE","ODOMETER_READING","VEHYEAR"]
)

# rename columns
registrations_tx = registrations_tx.rename(columns={'VIN':'vin'
                                                    ,'SALE_DATE':'mvr_purchase_date'
                                                    ,'SALES_PRICE':'mvr_price'
                                                    ,'MAKE':'mvr_make'
                                                    ,'ODOMETER_READING':'mvr_mileage'
                                                    ,'VEHYEAR':'mvr_model_year'
                                                   })

# convert to date type
registrations_tx['mvr_purchase_date'] = pd.to_datetime(registrations_tx['mvr_purchase_date'],format="\'%Y-%m-%d\'")


# dedupe vin by date
registrations_tx = keep_latest(registrations_tx, ["vin"], ["mvr_purchase_date"])

# add state
registrations_tx["mvr_state"] = "TX"

# final
print_shape(registrations_tx)
registrations_tx.dtypes

Rows: 22,726,694 
Columns: 7


vin                          object
mvr_purchase_date    datetime64[ns]
mvr_price                   float32
mvr_make                   category
mvr_mileage                  object
mvr_model_year                int16
mvr_state                    object
dtype: object

### Ohio

In [10]:
# ohio
oh_file = registration_dir + "oh_mvr_out.parquet"

registrations_oh = fp.ParquetFile(oh_file).to_pandas(
    columns=['VIN','PurchaseDate','PurchasePrice','Make','Mileage','Year']
)

# rename columns
registrations_oh = registrations_oh.rename(columns={'VIN':'vin'
                                                    ,'PurchaseDate':'mvr_purchase_date'
                                                    ,'PurchasePrice':'mvr_price'
                                                    ,'Make':'mvr_make'
                                                    ,'Mileage':'mvr_mileage'
                                                    ,'Year':'mvr_model_year'
                                                   })

# convert to date type
registrations_oh['mvr_purchase_date'] = pd.to_datetime(registrations_oh['mvr_purchase_date'],format="%Y-%m-%d")

# dedupe vin by date
registrations_oh = keep_latest(registrations_oh, ["vin"], ["mvr_purchase_date"])

# add state
registrations_oh["mvr_state"] = "OH"

# final
print_shape(registrations_oh)
registrations_oh.dtypes

Rows: 13,422,774 
Columns: 7


vin                          object
mvr_purchase_date    datetime64[ns]
mvr_price                   float32
mvr_make                   category
mvr_mileage                 float32
mvr_model_year                int16
mvr_state                    object
dtype: object

### Tennessee

In [11]:
tn_file = registration_dir + "tn_mvr.parquet"

registrations_tn = fp.ParquetFile(tn_file).to_pandas(
    columns=['vin','purchase_date','price','make','mileage','model_year']
)

# rename columns
registrations_tn = registrations_tn.rename(columns={'vin':'vin'
                                                    ,'purchase_date':'mvr_purchase_date'
                                                    ,'price':'mvr_price'
                                                    ,'make':'mvr_make'
                                                    ,'mileage':'mvr_mileage'
                                                    ,'model_year':'mvr_model_year'
                                                   })

# convert to date type
registrations_tn['mvr_purchase_date'] = pd.to_datetime(registrations_tn['mvr_purchase_date'],format="%Y-%m-%d", errors="coerce")
registrations_tn['mvr_model_year'] = registrations_tn['mvr_model_year'].astype('int16')

# dedupe vin by date
registrations_tn = keep_latest(registrations_tn, ["vin"], ["mvr_purchase_date"])

# add state
registrations_tn["mvr_state"] = "TN"

# final
print_shape(registrations_tn)
registrations_tn.dtypes

Rows: 7,558,173 
Columns: 7


vin                          object
mvr_purchase_date    datetime64[ns]
mvr_price                   float32
mvr_make                     object
mvr_mileage                   Int32
mvr_model_year                int16
mvr_state                    object
dtype: object

### Stack

In [12]:
# stack all registrations
registrations = pd.concat([registrations_tx, registrations_oh, registrations_tn])
print_shape(registrations)
registrations.dtypes

Rows: 43,707,641 
Columns: 7


vin                          object
mvr_purchase_date    datetime64[ns]
mvr_price                   float32
mvr_make                     object
mvr_mileage                  object
mvr_model_year                int16
mvr_state                    object
dtype: object

## Merge

In [13]:
print_shape(listings)

Rows: 18,333,814 
Columns: 36


In [14]:
print_shape(registrations)

Rows: 43,707,641 
Columns: 7


In [15]:
listings.columns

Index(['vin', 'price', 'miles', 'year', 'make', 'model', 'trim',
       'vehicle_type', 'body_type', 'body_subtype', 'drivetrain', 'fuel_type',
       'engine_block', 'engine_size', 'transmission', 'doors', 'cylinders',
       'city_mpg', 'highway_mpg', 'base_exterior_color', 'base_interior_color',
       'is_certified', 'is_transfer', 'scraped_at', 'status_date',
       'first_scraped_at', 'city', 'zip', 'latitude', 'longitude',
       'dealer_type', 'currency_indicator', 'miles_indicator',
       'photo_links_count', 'hvf_standard', 'hvf_optional'],
      dtype='object')

In [16]:
registrations.columns

Index(['vin', 'mvr_purchase_date', 'mvr_price', 'mvr_make', 'mvr_mileage',
       'mvr_model_year', 'mvr_state'],
      dtype='object')

In [17]:
# merge with listings
df = listings.merge(registrations, on='vin', how='inner')
print_shape(df)

Rows: 13,074,852 
Columns: 42


In [18]:
# listings match rate
round(df.shape[0]/listings.shape[0],2)

0.71

In [19]:
# registrations match rate
round(df.shape[0]/registrations.shape[0],2)

0.3

In [22]:
df.dtypes

vin                            object
price                         float64
miles                         float64
year                          float64
make                           object
model                          object
trim                           object
vehicle_type                   object
body_type                      object
body_subtype                   object
drivetrain                     object
fuel_type                      object
engine_block                   object
engine_size                    object
transmission                   object
doors                         float64
cylinders                     float64
city_mpg                      float64
highway_mpg                   float64
base_exterior_color            object
base_interior_color            object
is_certified                  float64
is_transfer                   float64
scraped_at                     object
status_date                    object
first_scraped_at               object
city        

In [23]:
# write to pickle
output_dir = "/data/p_dsi/capstone_projects/shea/4_merged/"
df.to_pickle(output_dir + "merged_structured.pkl")