# Test Analysis

In [1]:
import pandas as pd
import pyarrow as pa

In [2]:
# Let pandas render up to 100 rows before truncating displayed output.
pd.options.display.max_rows = 100

In [3]:
# Read the listings extract, then report its shape before and after
# dropping repeated `vin` values (the first row seen for each vin is kept).
mc_raw = pd.read_parquet(
    'part-00272-bbdecf64-de8c-4648-b96c-e44fb6cd17e1-c000.snappy.parquet',
    engine='pyarrow',
)
print(mc_raw.shape)
mc = mc_raw.drop_duplicates(subset=['vin'], keep='first')
print(mc.shape)

(13819, 71)
(13666, 71)


In [4]:
# Row count per `state` value in the de-duplicated listings.
mc['state'].value_counts()

TX    8072
OH    3379
TN    2214
Name: state, dtype: int64

In [11]:
# Load the TX match file and print its shape, followed by the ratio of
# matched rows to the TX rows in `mc`.
tx_match = pd.read_parquet('tx_match.parquet', engine='pyarrow')
print(tx_match.shape)
tx_listing_count = len(mc[mc['state'] == 'TX'])
print(len(tx_match) / tx_listing_count)

(6460, 89)
0.8002973240832507


In [14]:
# Load the OH match file and print its shape, followed by the ratio of
# matched rows to the OH rows in `mc`.
oh_match = pd.read_parquet('oh_match.parquet', engine='pyarrow')
print(oh_match.shape)
oh_listing_count = len(mc[mc['state'] == 'OH'])
print(len(oh_match) / oh_listing_count)

(3192, 95)
0.9446581828943474


In [33]:
# Load the TN match file and print its shape, followed by the ratio of
# matched rows to the TN rows in `mc`.
tn_match = pd.read_parquet('tn_match.parquet', engine='pyarrow')
print(tn_match.shape)
tn_listing_count = len(mc[mc['state'] == 'TN'])
print(len(tn_match) / tn_listing_count)

(1394, 83)
0.6296296296296297


In [15]:
# Overall match rate: matched rows from all three states over all rows in `mc`.
matched_rows = sum(len(part) for part in (tx_match, oh_match, tn_match))
matched_rows / len(mc)

0.8082833308941899

Roughly an 81% match rate overall (TX about 80%, OH about 94%, TN about 63%). I'll take it!

## Quick and Dirty Feature Selection

In [16]:
# Show every column name in the listings frame before choosing features.
mc.columns

Index(['id', 'vin', 'heading', 'price', 'msrp', 'miles', 'stock_no', 'year',
       'make', 'model', 'trim', 'vehicle_type', 'body_type', 'body_subtype',
       'drivetrain', 'fuel_type', 'engine', 'engine_block', 'engine_size',
       'engine_measure', 'engine_aspiration', 'transmission', 'speeds',
       'doors', 'cylinders', 'city_mpg', 'highway_mpg', 'interior_color',
       'exterior_color', 'base_exterior_color', 'base_interior_color',
       'is_certified', 'is_transfer', 'taxonomy_vin', 'scraped_at',
       'status_date', 'first_scraped_at', 'source', 'seller_name', 'city',
       'state', 'zip', 'latitude', 'longitude', 'dealer_type',
       'car_seller_name', 'car_city', 'car_state', 'car_zip', 'car_latitude',
       'car_longitude', 'seller_comments', 'dom', 'dom_180', 'dom_active',
       'currency_indicator', 'miles_indicator', 'carfax_1_owner',
       'carfax_clean_title', 'loan_term', 'loan_apr', 'l_down_pay', 'l_emi',
       'f_down_pay', 'f_down_pay_per', 'f_emi', 'lea

In [18]:
# calculate percent missing for each field in df
def percent_missing(df):
    """Summarise missing values per column of ``df``.

    Prints a two-line summary (total column count and the number of columns
    with at least one missing value) and returns the per-column detail.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, one row per column that has missing values.
        Columns are 'Missing Values' (count) and '% of Total Values'
        (percentage of rows, rounded to one decimal), sorted by percentage
        in descending order. Columns without missing values are omitted.
    """
    # Count missing values once and reuse the result for the percentage.
    missing_count = df.isnull().sum()
    row_count = len(df)
    if row_count:
        missing_pct = 100 * missing_count / row_count
    else:
        # A frame with no rows has nothing missing; dividing by zero here
        # would yield NaN and make every column look incomplete.
        missing_pct = missing_count.astype(float)

    summary = pd.concat(
        [missing_count, missing_pct],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )
    # Filter on the count rather than on "percentage != 0" so NaN percentages
    # can never slip through. Sort on the unrounded percentages with a stable
    # sort so tied columns keep the frame's column order, then round.
    summary = (
        summary[summary['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False, kind='mergesort')
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")
    return summary

In [20]:
# Missing-value summary across every column of the de-duplicated listings.
percent_missing(mc)

Your selected dataframe has 71 columns.
There are 67 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
engine_aspiration,13666,100.0
engine_measure,13666,100.0
car_longitude,13666,100.0
car_latitude,13666,100.0
speeds,13666,100.0
l_emi,13639,99.8
lease_term,13632,99.8
l_down_pay,13351,97.7
f_down_pay_per,12942,94.7
car_zip,12893,94.3


In [None]:
# Columns not carried into `chosen`.
# A comma was missing after 'hvf_options', which silently fused it with
# 'engine_measure' into the single name 'hvf_optionsengine_measure'.
# With the comma restored the list has 42 names; together with the 29 in
# `chosen` that matches the 71 columns reported for `mc`.
excluded = [
    'id', 'heading', 'msrp', 'stock_no', 'interior_color',
    'exterior_color', 'is_transfer', 'taxonomy_vin', 'scraped_at',
    'status_date', 'first_scraped_at', 'source', 'seller_name', 'city',
    'car_seller_name', 'car_city', 'car_state', 'car_zip', 'car_latitude',
    'car_longitude', 'seller_comments', 'dom', 'dom_180', 'dom_active',
    'currency_indicator', 'miles_indicator', 'carfax_1_owner',
    'carfax_clean_title', 'loan_term', 'loan_apr', 'l_down_pay', 'l_emi',
    'f_down_pay', 'f_down_pay_per', 'f_emi', 'lease_term',
    'listed_options', 'hvf_options',
    'engine_measure', 'engine_aspiration', 'speeds', 'body_subtype',
]

In [21]:
# Columns kept for modelling, listed in the order they are selected downstream.
chosen = [
    # identifier, listing price, mileage, model year
    'vin', 'price', 'miles', 'year',
    # vehicle description
    'make', 'model', 'trim', 'vehicle_type', 'body_type',
    # powertrain
    'drivetrain', 'fuel_type', 'engine', 'engine_block', 'engine_size',
    'transmission',
    # body, economy and colour
    'doors', 'cylinders', 'city_mpg', 'highway_mpg',
    'base_exterior_color', 'base_interior_color',
    # certification, location and dealer
    'is_certified', 'state', 'zip', 'latitude', 'longitude', 'dealer_type',
    # listing photos
    'photo_links_count', 'photo_main',
]

In [23]:
# Same missing-value summary, restricted to the columns listed in `chosen`.
percent_missing(mc[chosen])

Your selected dataframe has 29 columns.
There are 27 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
is_certified,3349,24.5
base_interior_color,3191,23.3
price,1815,13.3
dealer_type,1720,12.6
base_exterior_color,1271,9.3
engine_block,882,6.5
cylinders,882,6.5
engine_size,868,6.4
city_mpg,664,4.9
highway_mpg,664,4.9


## Grab Chosen Features and Real Prices

In [28]:
# Keep the chosen columns plus the TX sale price and sale date, relabelled
# to the common 'mvr_price' / 'mvr_sale_date' names.
tx_part = tx_match[chosen + ['SALES_PRICE', 'SALE_DATE']].rename(
    columns={'SALES_PRICE': 'mvr_price', 'SALE_DATE': 'mvr_sale_date'}
)

In [29]:
# Keep the chosen columns plus the OH purchase price and purchase date,
# relabelled to the common 'mvr_price' / 'mvr_sale_date' names.
oh_part = oh_match[chosen + ['PurchasePrice', 'PurchaseDate']].rename(
    columns={'PurchasePrice': 'mvr_price', 'PurchaseDate': 'mvr_sale_date'}
)

In [35]:
# Keep the chosen columns plus the TN price and purchase date, relabelled
# positionally to the common 'mvr_price' / 'mvr_sale_date' names.
# NOTE(review): `chosen` already contains 'price', so this selects
# tn_match['price'] twice and the positional relabel below names the second
# copy 'mvr_price'. For TN rows 'mvr_price' is therefore identical to 'price'.
# Confirm whether a separate TN sale-price column was intended here.
tn_part = tn_match[chosen + ['price','purchase_date_tn']]
tn_part.columns = chosen + ['mvr_price', 'mvr_sale_date']

In [37]:
# stack the three dataframes
df = pd.concat([tx_part, oh_part, tn_part], axis=0)
print(df.shape)

(11046, 31)


## Teach the Machine!

In [None]:
# test/train split: hold out 20% of rows, with a fixed seed for repeatability.
# NOTE(review): this splits mc[chosen] (every de-duplicated listing, with no
# 'mvr_price' column) rather than the stacked `df` that carries the matched
# prices. Confirm which frame is meant to be modelled.
# NOTE(review): this import sits in a late cell rather than with the
# notebook's other imports in the first cell.
from sklearn.model_selection import train_test_split
train, test = train_test_split(mc[chosen], test_size=0.2, random_state=42)