# Final Processing

ACCRE session info
- 8 cores
- 240GB memory
- Anaconda3/2021.05
- any arch

## Environment and Data

In [3]:
import pandas as pd
import matplotlib.pyplot as plt


In [28]:
# Column names, grouped by theme; the order of the original list is preserved.
# NOTE(review): the frame loaded below reports 'hvf_standard'/'hvf_optional'
# and no 'state' column, so this list appears to describe an earlier schema --
# confirm before using it to select columns.
all_cols = [
    # identity and pricing
    'vin', 'price', 'miles', 'year', 'make', 'model', 'trim',
    # vehicle description
    'vehicle_type', 'body_type', 'body_subtype', 'drivetrain', 'fuel_type',
    'engine_block', 'engine_size', 'transmission', 'doors', 'cylinders',
    'city_mpg', 'highway_mpg', 'base_exterior_color', 'base_interior_color',
    # listing status and timestamps
    'is_certified', 'is_transfer', 'scraped_at', 'status_date',
    'first_scraped_at',
    # location and seller
    'city', 'state', 'zip', 'latitude', 'longitude', 'dealer_type',
    # free text, indicators and option lists
    'seller_comments', 'currency_indicator', 'miles_indicator',
    'photo_links_count', 'listed_options', 'hvf_options',
]

# read_cols is bound to the same list object as all_cols (an alias, not a copy).
read_cols = all_cols

In [8]:
import os

pickle_directory = "/data/p_dsi/capstone_projects/shea/2_deduped/TN/"

# os.listdir() returns entries in arbitrary order; sorting makes the
# concatenation order (and therefore the positional index assigned by
# ignore_index=True) the same on every run.
pickle_files = sorted(
    file for file in os.listdir(pickle_directory) if file.endswith('.pickle')
)

# Fail early with a readable message instead of letting pd.concat raise on an
# empty list.
if not pickle_files:
    raise FileNotFoundError(f"No .pickle files found in {pickle_directory}")

# read in each file
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted location.
dataframes = [
    pd.read_pickle(os.path.join(pickle_directory, file)) for file in pickle_files
]

# concatenate into one frame with a fresh 0..n-1 index
combined_dataframe = pd.concat(dataframes, ignore_index=True)


In [9]:
# Bind the concatenated frame to the working name `df`
# (plain assignment: both names refer to the same object, no copy is made).
df = combined_dataframe
# Show (rows, columns) of the combined data.
print(df.shape)

(3941813, 38)


In [10]:
# Order rows by VIN and, within each VIN, by status_date.
# reset_index() is called without drop=True, so the pre-sort row labels are
# kept as a new "index" column in the result.
sort_keys = ["vin", "status_date"]
df = df.sort_values(by=sort_keys).reset_index(drop=False)


In [12]:
# Display the column labels of the sorted frame.
df.columns

Index(['index', 'vin', 'price', 'miles', 'year', 'make', 'model', 'trim',
       'vehicle_type', 'body_type', 'body_subtype', 'drivetrain', 'fuel_type',
       'engine_block', 'engine_size', 'transmission', 'doors', 'cylinders',
       'city_mpg', 'highway_mpg', 'base_exterior_color', 'base_interior_color',
       'is_certified', 'is_transfer', 'scraped_at', 'status_date',
       'first_scraped_at', 'city', 'zip', 'latitude', 'longitude',
       'dealer_type', 'seller_comments', 'currency_indicator',
       'miles_indicator', 'photo_links_count', 'listed_options',
       'hvf_standard', 'hvf_optional'],
      dtype='object')

### Alternative: Loading the Pickle Files with Dask (not executed)

In [None]:
# The Dask cell below needs dask installed: pip install "dask[distributed]" "dask[dataframe]"

In [None]:
import os
import dask.dataframe as dd
from dask.delayed import delayed

# Set the directory containing the pickle files (placeholder -- replace it)
pickle_directory = "path/to/your/pickle_directory"

# List all the pickle files in the directory.
# Both '.pkl' and '.pickle' are accepted so this cell also picks up files named
# like the ones the pandas loading cell reads ('.pickle'); the listing is
# sorted because os.listdir() returns entries in arbitrary order.
pickle_files = sorted(
    file for file in os.listdir(pickle_directory)
    if file.endswith(('.pkl', '.pickle'))
)

# Define a delayed function to read pickle files
@delayed
def read_pickle(file):
    """Read one pickle file with pandas.

    Wrapped in dask's ``delayed``, so calling it builds a lazy task instead of
    reading the file immediately. Relies on ``pd`` from the notebook's import
    cell.

    Parameters
    ----------
    file : str
        Path of the pickle file to read.

    Returns
    -------
    The object returned by ``pd.read_pickle(file)``.
    """
    return pd.read_pickle(file)

# Build one delayed read per pickle file
dataframes = [read_pickle(os.path.join(pickle_directory, file)) for file in pickle_files]

# Combine all the delayed DataFrames into a single Dask DataFrame
# NOTE(review): no `meta` is passed, so dask has to work out the schema itself
# -- confirm this is acceptable for large files.
combined_dask_dataframe = dd.from_delayed(dataframes)

# Optionally, convert the Dask DataFrame to a Pandas DataFrame (if it fits in memory)
# combined_dataframe = combined_dask_dataframe.compute()


## Duplicate Handling

In [13]:
# number of distinct VINs (missing values are not counted)
df["vin"].nunique(dropna=True)


3319340

In [14]:
# Flag every row whose VIN already appeared in an earlier row.
# pandas' default keep="first" applies, so the first occurrence of each VIN is
# not counted as a duplicate.
duplicates = df.duplicated(subset=["vin"])

# Each True counts as 1, so the sum is the number of flagged rows.
num_duplicates = duplicates.sum()

print(f'The number of duplicate vehicle listings is: {num_duplicates}')


The number of duplicate vehicle listings is: 622473


In [15]:
# how many rows each VIN has
vin_counts = df.groupby(by="vin").size()

# keep only the VINs that occur more than once
duplicate_vin_counts = vin_counts.loc[vin_counts.gt(1)]

# the repeated VINs themselves, as a plain Python list
duplicate_vins = list(duplicate_vin_counts.index)


In [20]:
# Keep, for every VIN, only the row(s) carrying that VIN's latest status_date.
# The aggregation is named with the string "max" rather than the builtin `max`:
# passing builtin callables to groupby transform is deprecated in recent pandas
# and emits a FutureWarning, while the string keeps the same behaviour.
mask = df['status_date'] == df.groupby('vin')['status_date'].transform("max")
df = df.loc[mask]
# NOTE(review): rows that tie on the latest status_date are all kept, so a VIN
# can still appear more than once after this step -- confirm that is intended.
print(df.shape)

(3320576, 39)


## Variable Transformations

In [21]:
# Replace listed_options values whose first element is the string "None" with
# a real None.
def _is_none_placeholder(options):
    """Return True when `options` is non-empty and its first item equals "None".

    Values that cannot be measured or indexed (None, NaN, an empty list, ...)
    yield False instead of raising, so the cell can be re-run after it has
    already written None into the column.
    """
    try:
        return bool(len(options) > 0 and options[0] == "None")
    except (TypeError, IndexError, KeyError):
        return False


# astype(bool) keeps the selector boolean even when the column is empty.
placeholder_rows = df["listed_options"].apply(_is_none_placeholder).astype(bool)
df.loc[placeholder_rows, "listed_options"] = None


In [22]:
# replace empty hvf_standard / hvf_optional values (length 0, i.e. []) with None
for hvf_col in ("hvf_standard", "hvf_optional"):
    is_empty = df[hvf_col].str.len() == 0
    df.loc[is_empty, hvf_col] = None


## Write Out

In [23]:
# Preview five randomly chosen rows, transposed so each column reads as a row.
df.sample(n=5).transpose()

Unnamed: 0,3105148,1689995,2083549,628341,641795
index,1980373,2105391,3594589,2418128,1429921
vin,JF2SJADC1FH814783,2G1WB5E39C1156432,3GCPCTEC9GG253933,1FTEX1CM8BFB75246,1FTFW1E81MFB47010
price,15900.0,7871.0,28400.0,,67692.0
miles,63480.0,105470.0,105385.0,70138.0,12763.0
year,2015.0,2012.0,2016.0,2011.0,2021.0
make,Subaru,Chevrolet,Chevrolet,Ford,Ford
model,Forester,Impala,Silverado 1500,F-150,F-150
trim,i Premium,LT,High Country,XLT,Platinum
vehicle_type,Truck,Car,Truck,Truck,Truck
body_type,SUV,Sedan,Pickup,Pickup,Pickup


In [31]:
# Column groups for the two output files.
# The unstructured group holds the free-text fields plus vin and status_date.
unstructured_cols = ["vin", "status_date", "seller_comments", "listed_options"]
# The structured group starts as every column of df.
structured_cols = list(df.columns)



In [35]:
# Drop the free-text columns from the structured column list.
# The list is rebuilt instead of calling list.remove(), which raises ValueError
# once the name is already gone; this keeps the cell safe to run more than once.
structured_cols = [
    col for col in structured_cols
    if col not in ("seller_comments", "listed_options")
]

In [38]:
# Show all column names of df as a plain list.
list(df.columns)

['index',
 'vin',
 'price',
 'miles',
 'year',
 'make',
 'model',
 'trim',
 'vehicle_type',
 'body_type',
 'body_subtype',
 'drivetrain',
 'fuel_type',
 'engine_block',
 'engine_size',
 'transmission',
 'doors',
 'cylinders',
 'city_mpg',
 'highway_mpg',
 'base_exterior_color',
 'base_interior_color',
 'is_certified',
 'is_transfer',
 'scraped_at',
 'status_date',
 'first_scraped_at',
 'city',
 'zip',
 'latitude',
 'longitude',
 'dealer_type',
 'seller_comments',
 'currency_indicator',
 'miles_indicator',
 'photo_links_count',
 'listed_options',
 'hvf_standard',
 'hvf_optional']

In [37]:
# Write the two column groups to separate pickle files.
output_dir = "/data/p_dsi/capstone_projects/shea/3_final/"

# output_dir already ends with "/", so plain concatenation yields the full path.
unstructured_path = output_dir + "dataset_unstructured.pkl"
structured_path = output_dir + "dataset_structured.pkl"

df.loc[:, unstructured_cols].to_pickle(unstructured_path)
df.loc[:, structured_cols].to_pickle(structured_path)
