# CarMax Analytics Showcase
- Competition: https://analyticsshowcase.carmax.com
- Data: https://github.com/kmx-analytics-showcase/Winter-2023

In [1]:
# environment: report the interpreter and key package versions used for this run
from importlib.metadata import version
from platform import python_version

print('python', python_version())

# package versions
for package in ('numpy', 'pandas'):
    print(package, version(package))

python 3.9.13
numpy 1.21.5
pandas 1.4.4


In [2]:
# load packages
import numpy as np
import pandas as pd

## Data Load / Clean / Prep

In [3]:
# relative path to the source CSV; read the whole file into a DataFrame
file = 'data/data_winter2023.csv'
df = pd.read_csv(file)
# (rows, columns) of the loaded frame
df.shape

(200000, 30)

In [4]:
# rename fields to shorter, consistent names
column_renames = {
    'trim_descrip': 'trim',
    'trim_descrip_appraisal': 'trim_appraisal',
    'appraisal_offer': 'price_appraisal',
}
df = df.rename(columns=column_renames)

# sort columns alphabetically so each field sits next to its *_appraisal twin
df = df.reindex(columns=sorted(df.columns))

In [5]:
# correct datatypes

from pandas.api.types import CategoricalDtype

# price bands become ordered categoricals (categories listed from lowest to highest)
appraisal_cat = CategoricalDtype(
    categories=[
        '$0k to $5k', '$5k to $10k', '$10k to $15k', '$15k to $20k',
        '$20k to $25k', '$25k to $30k', '$30k to $35k', '$35k to $40k',
        '$40k+',
    ],
    ordered=True,
)
price_cat = CategoricalDtype(
    categories=[
        '$0 to $15k', '$15k to $20k', '$20k to $25k', '$25k to $30k',
        '$30k to $35k', '$35k to $40k', '$40k to $45k', '$45k to $50k',
        '$50k to $55k', '$55k to $60k', '$60k to $65k', '$65k to $70k',
        '$70k+',
    ],
    ordered=True,
)

# apply all conversions in one pass:
# ordered price bands, categorical market, integer model years
df = df.astype({
    'price_appraisal': appraisal_cat,
    'price': price_cat,
    'market': 'category',
    'model_year': 'int',
    'model_year_appraisal': 'int',
})

# TODO: body / body_appraisal are left as object; categorical conversion is disabled for now
#df['body'] = df['body'].astype('category')
#df['body_appraisal'] = df['body_appraisal'].astype('category')

In [6]:
# column dtypes of the frame; show_counts=False leaves out the non-null counts
df.info(show_counts=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 30 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   body                     object  
 1   body_appraisal           object  
 2   color                    object  
 3   color_appraisal          object  
 4   cylinders                int64   
 5   cylinders_appraisal      int64   
 6   engine                   object  
 7   engine_appraisal         object  
 8   fuel_capacity            float64 
 9   fuel_capacity_appraisal  float64 
 10  horsepower               int64   
 11  horsepower_appraisal     int64   
 12  make                     object  
 13  make_appraisal           object  
 14  market                   category
 15  mileage                  object  
 16  mileage_appraisal        object  
 17  model                    object  
 18  model_appraisal          object  
 19  model_year               int64   
 20  model_year_appraisal     i

In [7]:
# numeric columns: summary statistics rounded to one decimal, one row per column
numeric_summary = df.describe()
numeric_summary.round(1).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cylinders,200000.0,5.0,1.4,0.0,4.0,4.0,6.0,16.0
cylinders_appraisal,200000.0,5.1,1.3,3.0,4.0,4.0,6.0,10.0
fuel_capacity,199996.0,17.6,4.3,1.0,14.0,17.0,19.0,48.0
fuel_capacity_appraisal,199688.0,17.6,4.4,7.0,14.0,17.0,20.0,48.0
horsepower,200000.0,237.0,76.6,11.0,175.0,228.0,290.0,760.0
horsepower_appraisal,200000.0,219.7,73.6,11.0,168.0,197.0,278.0,760.0
model_year,200000.0,2012.6,2.1,1991.0,2012.0,2013.0,2014.0,2017.0
model_year_appraisal,200000.0,2007.8,4.9,1986.0,2005.0,2009.0,2012.0,2017.0
mpg_city,199410.0,22.2,5.2,11.0,18.0,22.0,26.0,60.0
mpg_city_appraisal,199999.0,21.7,5.5,9.0,18.0,21.0,25.0,60.0


In [8]:
# non-numeric columns: count / unique / top / freq, one row per column
df.describe(include=['object', 'category']).transpose()

Unnamed: 0,count,unique,top,freq
body,189741,6,E,106730
body_appraisal,200000,8,E,77597
color,199967,16,White,45198
color_appraisal,198682,16,Black,41534
engine,200000,49,2.0L,36654
engine_appraisal,200000,56,2.4L,27269
make,200000,37,AIH,26201
make_appraisal,200000,38,KQZ,21886
market,200000,16,14,19729
mileage,200000,21,25k to 30k miles,18740


In [9]:
# data sample (fixed seed so the same five rows are shown on every run)
df.sample(5, random_state=0).T

Unnamed: 0,91599,68741,59168,132312,171311
body,E,D,E,D,E
body_appraisal,C,D,E,F,E
color,Silver,Gray,Gray,Gray,Gray
color_appraisal,Black,Gray,Brown,Black,Gray
cylinders,6,8,6,6,4
cylinders_appraisal,4,8,4,4,4
engine,3.5L,5.7L,3.5L,3.3L,2.5L
engine_appraisal,1.6L,5.7L,2.4L,1.8L,1.6L
fuel_capacity,19.0,26.0,18.0,26.0,15.0
fuel_capacity_appraisal,12.0,26.0,14.0,13.0,16.0


### Validation

In [None]:
# Expectation: once make/model/trim is fixed, adding further attributes should not
# create new combinations, so these counts should stop increasing.
# Prints the number of distinct combinations (nulls included) for each growing key prefix.
hierarchy = ['make', 'model', 'trim', 'body', 'engine', 'cylinders', 'horsepower', 'fuel_capacity']
for depth in range(2, len(hierarchy) + 1):
    print(len(df[hierarchy[:depth]].value_counts(dropna=False)))

575
1068
1222
1814
2767
3629
4020


### Missingness
- Analyze missing/null values
- Impute values where appropriate
- Drop observations where appropriate

In [10]:
# null counts per column, with the share of all rows, largest first
null_count = df.isna().sum()
null_share = (null_count / df.shape[0]).round(decimals=2)
pd.DataFrame({'count': null_count, 'percent': null_share}).sort_values(by='count', ascending=False)

Unnamed: 0,count,percent
trim_appraisal,31080,0.16
trim,23841,0.12
model_appraisal,21508,0.11
body,10259,0.05
online_appraisal_flag,3834,0.02
color_appraisal,1318,0.01
mpg_highway,590,0.0
mpg_city,590,0.0
fuel_capacity_appraisal,312,0.0
color,33,0.0


In [11]:
# missingness correlations

# keep only columns whose null indicator varies (some, but not all, values missing);
# a constant indicator has zero variance and no defined correlation
null_flags = df.isnull()
varies = null_flags.any() & ~null_flags.all()
temp = df.loc[:, varies]

# pairwise correlation between the null indicators
corr_mat = temp.isnull().corr().round(decimals=2)

# long format: one row per (var1, var2) pair
corr_mat2 = corr_mat.unstack().reset_index()
corr_mat2.columns = ['var1', 'var2', 'corr']

# order each pair alphabetically so (a, b) and (b, a) collapse into one row
pair = corr_mat2[['var1', 'var2']]
corr_mat2 = (
    corr_mat2
    .assign(var_1=pair.min(axis=1), var_2=pair.max(axis=1))
    .drop(columns=['var1', 'var2'])
    .drop_duplicates()
)

# drop the diagonal (a column paired with itself)
corr_mat2 = corr_mat2[corr_mat2['var_1'] != corr_mat2['var_2']]
# optional threshold filter, currently disabled:
# corr_mat2 = corr_mat2[corr_mat2['corr'] > 0.1]
corr_mat2.sort_values(by='corr', ascending=False).head(10)

Unnamed: 0,corr,var_1,var_2
80,1.0,mpg_city,mpg_highway
33,0.57,color_appraisal,online_appraisal_flag
71,0.12,model_appraisal,trim_appraisal
131,0.09,trim,trim_appraisal
10,0.06,body,trim
70,0.05,model_appraisal,trim
11,0.03,body,trim_appraisal
6,0.02,body,mpg_city
8,0.02,body,mpg_highway
57,0.02,fuel_capacity_appraisal,online_appraisal_flag


### Imputations

In [19]:
#from sklearn.experimental import enable_iterative_imputer
#from sklearn.impute import IterativeImputer
#imp = IterativeImputer(max_iter=10, random_state=0)
#imp.fit(df)
#test = imp.transform(df)

#### Trim
- 16% of appraisals and 12% of purchases are missing trim

Imputation strategy:
- Use logistic regression to predict trim
- Predictors: make, model, model_year, mileage, price

In [14]:
# list the available column names
df.columns

Index(['body', 'body_appraisal', 'color', 'color_appraisal', 'cylinders',
       'cylinders_appraisal', 'engine', 'engine_appraisal', 'fuel_capacity',
       'fuel_capacity_appraisal', 'horsepower', 'horsepower_appraisal', 'make',
       'make_appraisal', 'market', 'mileage', 'mileage_appraisal', 'model',
       'model_appraisal', 'model_year', 'model_year_appraisal', 'mpg_city',
       'mpg_city_appraisal', 'mpg_highway', 'mpg_highway_appraisal',
       'online_appraisal_flag', 'price', 'price_appraisal', 'trim',
       'trim_appraisal'],
      dtype='object')

In [13]:
# distribution of trim values, with missing values counted as their own bucket
df['trim'].value_counts(dropna=False)

Not Premium    97920
Premium        78239
NaN            23841
Name: trim, dtype: int64

In [22]:
# rows with a known trim form the training set; rows missing trim are the ones to predict
# NOTE(review): only predictor columns are selected here; the 'trim' target is not included
trim_features = ['make', 'model', 'model_year', 'mileage', 'price']
trim_known = df['trim'].notna()
train = df.loc[trim_known, trim_features]
test = df.loc[~trim_known, trim_features]

#### Body
Missing:
- Impute the body type based on the mode for make/model or just make (when needed)

Validation:
- This leaves unresolved the issue that there are multiple body types for a given make/model (runs counter to my expectation)

In [98]:
def _unique_mode(values):
    """Return the single most frequent non-null value, or NaN if there is none or a tie."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) == 1 else np.nan


# Fill missing body from the mode of its make/model group, then fall back to the
# mode of its make (for models whose body is missing or ambiguous everywhere).
# Groups without a single clear mode contribute NaN at either level, so no
# array-valued "mode" can end up stored in the column.
for keys in (['make', 'model'], ['make']):
    body_modes = df.groupby(keys)['body'].agg(_unique_mode)  # one value (or NaN) per group
    # look up each row's group mode; the join keeps df's index, so rows stay aligned
    fallback = df[keys].join(body_modes.rename('body_mode'), on=keys)['body_mode']
    na_indexer = df['body'].isna()  # null obs
    df.loc[na_indexer, 'body'] = fallback[na_indexer]

In [99]:
def _unique_mode(values):
    """Return the single most frequent non-null value, or NaN if there is none or a tie."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) == 1 else np.nan


# Fill missing body_appraisal from the mode of the APPRAISED vehicle's make/model,
# then fall back to the mode of the appraised make. The grouping keys are the
# *_appraisal columns: the appraised vehicle's body should be inferred from the
# appraised vehicle, not from the purchased one.
# Rows whose model_appraisal is null find no make/model match and are handled by
# the make-level pass.
# NOTE(review): the summary table earlier in the notebook shows no nulls in
# body_appraisal, so this may currently be a no-op.
for keys in (['make_appraisal', 'model_appraisal'], ['make_appraisal']):
    body_modes = df.groupby(keys)['body_appraisal'].agg(_unique_mode)  # one value (or NaN) per group
    # look up each row's group mode; the join keeps df's index, so rows stay aligned
    fallback = df[keys].join(body_modes.rename('body_mode'), on=keys)['body_mode']
    na_indexer = df['body_appraisal'].isna()  # null obs
    df.loc[na_indexer, 'body_appraisal'] = fallback[na_indexer]

### Engine
Here I've discovered a reliability problem with engine data
- Make/model/engine should determine cylinders/horsepower/fuel_capacity/mpg_*

In [None]:
# print the column names for reference while exploring the engine fields
print(df.columns)

In [None]:
# restrict to a single model (HXQ_8) as a worked example
df_HXQ = df.loc[df['model'].eq('HXQ_8')]

In [None]:
# row counts for each make/model/trim/body/engine/cylinders combination in df_HXQ
# (dropna=False keeps combinations that contain nulls)
df_HXQ.groupby(['make','model','trim','body','engine','cylinders'], dropna=False, as_index=False).size()

Seems OK to have different engines for the same make/model
- Cutoff for data entry error? (e.g. 2.4L below has 48 obs)

In [None]:
# row count per engine within df_HXQ, plus each engine's share of those rows
engines = (
    df_HXQ
    .groupby(['make', 'model', 'engine'], dropna=False, as_index=False)
    .size()
    .assign(pct=lambda counts: counts['size'] / counts['size'].sum())
)
engines

In [None]:
# row counts per engine/cylinders combination in df_HXQ (nulls kept as their own group)
df_HXQ.groupby(['make','model','engine','cylinders'], dropna=False, as_index=False).size()

In [None]:
# row counts per engine/horsepower combination in df_HXQ (nulls kept as their own group)
df_HXQ.groupby(['make','model','engine','horsepower'], dropna=False, as_index=False).size()

Based on the tables above, make/model/trim does not determine the other vehicle characteristics the way I expected it would
- Data entry problem?

### Other Variables
- Identify areas of interest to pull out from `0_eda_profile_report.ipynb`