# CarMax Analytics Showcase
- Competition: https://analyticsshowcase.carmax.com
- Data: https://github.com/kmx-analytics-showcase/Winter-2023

In [1]:
# load packages
import numpy as np
import pandas as pd

## Data Load / Clean / Prep

In [2]:
# load data
file = 'data/data_winter2023.csv'
df = pd.read_csv(file
                ,dtype={'price':'category'
                        ,'appraisal_offer':'category'
                        ,'market':'category'
                        
                        ,'online_appraisal_flag':'Int8'
                        ,'model_year':'Int16'
                        ,'mileage':'category'
                        ,'make':'category'
                        #,'model':'category'
                        #,'trim_descrip'
                        ,'body':'category'
                        ,'color':'category'
                        #,'engine':'category'
                        ,'cylinders':'Int8'
                        ,'mpg_city':'Int8'
                        ,'mpg_highway':'Int8'
                        ,'horsepower':'Int16'
                        ,'fuel_capacity':'Int8'

                        ,'model_year_appraisal':'Int16'
                        ,'mileage_appraisal':'category'
                        ,'make_appraisal':'category'
                        #,'model_appraisal':'category'
                        #,'trim_descrip_appraisal'
                        ,'body_appraisal':'category'
                        ,'color_appraisal':'category'
                        #,'engine_appraisal':'category'
                        ,'cylinders_appraisal':'Int8'
                        ,'mpg_city_appraisal':'Int8'
                        ,'mpg_highway_appraisal':'Int8'
                        ,'horsepower_appraisal':'Int16'
                        ,'fuel_capacity_appraisal':'Int8'
                }
                )
df.shape

(200000, 30)

In [3]:
# basic var manipulations

# rename some fields
df.rename(columns={'trim_descrip':'premium_trim'
                  ,'trim_descrip_appraisal':'premium_trim_appraisal'
                  ,'appraisal_offer':'price_appraisal'
                  }, inplace=True)

# recode trim
# NOTE: assumes null is not premium
df['premium_trim'] = np.where(df['premium_trim'] == 'Premium', 1, 0)
df['premium_trim_appraisal'] = np.where(df['premium_trim_appraisal'] == 'Premium', 1, 0)

# move market
market = df.pop('market')
df.insert(3, 'market', market)

In [4]:
# make prices categorical with order
from pandas.api.types import CategoricalDtype
appraisal_cat = CategoricalDtype(
    ['$0k to $5k', '$5k to $10k', '$10k to $15k', '$15k to $20k', 
    '$20k to $25k', '$25k to $30k', '$30k to $35k', '$35k to $40k', 
    '$40k+'], 
    ordered=True
)
price_cat = CategoricalDtype(
    ['$0 to $15k', '$15k to $20k', '$20k to $25k', '$25k to $30k',
       '$30k to $35k', '$35k to $40k', '$40k to $45k', '$45k to $50k',
       '$50k to $55k', '$55k to $60k', '$60k to $65k', '$65k to $70k',
       '$70k+'], 
    ordered=True
)
df['price_appraisal'] = df['price_appraisal'].astype(appraisal_cat)
df['price'] = df['price'].astype(price_cat)

In [5]:
df.info(show_counts=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 30 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   price                    category
 1   price_appraisal          category
 2   online_appraisal_flag    Int8    
 3   market                   category
 4   model_year               Int16   
 5   mileage                  category
 6   make                     category
 7   model                    object  
 8   premium_trim             int64   
 9   body                     category
 10  color                    category
 11  engine                   object  
 12  cylinders                Int8    
 13  mpg_city                 Int8    
 14  mpg_highway              Int8    
 15  horsepower               Int16   
 16  fuel_capacity            Int8    
 17  model_year_appraisal     Int16   
 18  mileage_appraisal        category
 19  make_appraisal           category
 20  model_appraisal          o

In [6]:
df.nunique()

price                       13
price_appraisal              9
online_appraisal_flag        2
market                      16
model_year                  24
mileage                     21
make                        37
model                      575
premium_trim                 2
body                         6
color                       16
engine                      49
cylinders                   10
mpg_city                    47
mpg_highway                 41
horsepower                 297
fuel_capacity               36
model_year_appraisal        32
mileage_appraisal           21
make_appraisal              38
model_appraisal            675
premium_trim_appraisal       2
body_appraisal               8
color_appraisal             16
engine_appraisal            56
cylinders_appraisal          6
mpg_city_appraisal          50
mpg_highway_appraisal       44
horsepower_appraisal       339
fuel_capacity_appraisal     35
dtype: int64

In [7]:
# numeric
df.describe().round(decimals = 1).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
online_appraisal_flag,196166.0,0.3,0.4,0.0,0.0,0.0,1.0,1.0
model_year,200000.0,2012.6,2.1,1991.0,2012.0,2013.0,2014.0,2017.0
premium_trim,200000.0,0.4,0.5,0.0,0.0,0.0,1.0,1.0
cylinders,200000.0,5.0,1.4,0.0,4.0,4.0,6.0,16.0
mpg_city,199410.0,22.2,5.2,11.0,18.0,22.0,26.0,60.0
mpg_highway,199410.0,29.4,5.7,14.0,25.0,29.0,33.0,59.0
horsepower,200000.0,237.0,76.6,11.0,175.0,228.0,290.0,760.0
fuel_capacity,199996.0,17.6,4.3,1.0,14.0,17.0,19.0,48.0
model_year_appraisal,200000.0,2007.8,4.9,1986.0,2005.0,2009.0,2012.0,2017.0
premium_trim_appraisal,200000.0,0.3,0.5,0.0,0.0,0.0,1.0,1.0


In [8]:
# non-numeric
df.describe(include = ['object','category']).T

Unnamed: 0,count,unique,top,freq
price,200000,13,$20k to $25k,46388
price_appraisal,200000,9,$0k to $5k,69180
market,200000,16,14,19729
mileage,200000,21,25k to 30k miles,18740
make,200000,37,AIH,26201
model,200000,575,HXQ_8,5528
body,189741,6,E,106730
color,199967,16,White,45198
engine,200000,49,2.0L,36654
mileage_appraisal,200000,21,50k to 60k miles,14163


In [9]:
# data sample
df.sample(5).T

Unnamed: 0,110054,77982,108325,104591,124858
price,$35k to $40k,$25k to $30k,$35k to $40k,$25k to $30k,$25k to $30k
price_appraisal,$0k to $5k,$0k to $5k,$10k to $15k,$15k to $20k,$5k to $10k
online_appraisal_flag,0,0,0,0,0
market,10,5,11,14,12
model_year,2013,2011,2014,2012,2014
mileage,25k to 30k miles,55k to 60k miles,35k to 40k miles,30k to 35k miles,30k to 35k miles
make,CIP,LYP,JDZ,AIH,FVQ
model,CIP_31,LYP_8,JDZ_20,AIH_5,FVQ_8
premium_trim,0,1,0,0,0
body,E,E,E,E,E


### Missingness
- Analyze missing/null values
- Impute values where appropriate
- Drop observations where necessary

In [10]:
# null counts
pd.DataFrame({'count':df.isna().sum()
             ,'percent':(df.isna().sum()/df.shape[0]).round(decimals=2)}
    ).sort_values(by='count', ascending=False)

Unnamed: 0,count,percent
model_appraisal,21508,0.11
body,10259,0.05
online_appraisal_flag,3834,0.02
color_appraisal,1318,0.01
mpg_highway,590,0.0
mpg_city,590,0.0
fuel_capacity_appraisal,312,0.0
color,33,0.0
fuel_capacity,4,0.0
mpg_city_appraisal,1,0.0


In [11]:
# missingness correlations
temp = df.iloc[:, [i for i, n in enumerate(np.var(df.isnull(), axis='rows')) if n > 0]]
corr_mat = temp.isnull().corr().round(decimals=2)
corr_mat2 = corr_mat.unstack().reset_index()
corr_mat2.columns = ['var1','var2','corr']
corr_mat2['var_1'] = corr_mat2[['var1','var2']].min(axis=1)
corr_mat2['var_2'] = corr_mat2[['var1','var2']].max(axis=1)
corr_mat2.drop(columns=['var1','var2'], inplace=True)
corr_mat2.drop_duplicates(inplace=True)
corr_mat2 = corr_mat2[corr_mat2['var_1'] != corr_mat2['var_2']]
# corr_mat2 = corr_mat2[corr_mat2['corr'] > 0.1]
corr_mat2.sort_values(by='corr', ascending=False).head(10)

Unnamed: 0,corr,var_1,var_2
34,1.0,mpg_city,mpg_highway
7,0.57,color_appraisal,online_appraisal_flag
9,0.02,fuel_capacity_appraisal,online_appraisal_flag
14,0.02,body,mpg_highway
13,0.02,body,mpg_city
12,0.01,body,color
26,0.01,color,model_appraisal
16,0.01,body,model_appraisal
68,0.01,model_appraisal,mpg_city_appraisal
79,0.01,color_appraisal,fuel_capacity_appraisal


### Imputations

#### Trim
- 16% of appraisals and 12% of purchases were missing trim
- Imputation: assumed not premium (see previous code near top of notebook)

### Appraisal Model
- 11% of appraisals are missing model
- Imputation: assume the most common model for each make

In [12]:
# count nulls
print(df['model_appraisal'].isna().sum())

21508


In [13]:
# create imputed flag
df['imputed_flag_model_appraisal'] = np.where(df['model_appraisal'].isna(), 1, 0)
print(df['imputed_flag_model_appraisal'].sum())

21508


In [14]:
# mode of model by make
# TODO: improve to mode by make/year
model_modes = df.groupby(['make_appraisal'])['model_appraisal'].agg(pd.Series.mode) # compute modes by make
na_indexer = df['model_appraisal'].isna() # null obs
df.loc[na_indexer,'model_appraisal'] = df.loc[na_indexer].apply(lambda x: model_modes.loc[x['make_appraisal']], axis=1) # apply modes by make
df.loc[df['model_appraisal'].map(type) == np.ndarray,'model_appraisal'] = np.nan # without mode set to null

In [15]:
# check result
print(df['model_appraisal'].isna().sum())

0


#### Body
- 5% of body is missing
- Imputation: assume mode of body for each make

In [16]:
# count nulls
print(df['body'].isna().sum())

10259


In [17]:
# create imputed flag
df['imputed_flag_body'] = np.where(df['body'].isna(), 1, 0)
print(df['imputed_flag_body'].sum())

10259


In [18]:
# mode of body by make
# TODO: improve to mode by make/model
body_modes = df.groupby(['make'])['body'].agg(pd.Series.mode) # compute modes by make
na_indexer = df['body'].isna() # null obs
df.loc[na_indexer,'body'] = df.loc[na_indexer].apply(lambda x: body_modes.loc[x['make']], axis=1) # apply modes by make
df.loc[df['body'].map(type) == np.ndarray,'body'] = np.nan # without mode set to null

In [19]:
# check result
print(df['body'].isna().sum())

0


#### Online Appraisal Flag
- 2% of online appraisal flag is missing
- Imputation: assume not online
- Note: inconsequential, not flagging imputation

In [20]:
# count nulls
print(df['online_appraisal_flag'].isna().sum())

3834


In [21]:
# assume null is not online appraisal
df['online_appraisal_flag'].fillna(0, inplace=True)

In [22]:
# check result
print(df['online_appraisal_flag'].isna().sum())

0


#### Color
- 1% and <1% of color_appraisal and color are missing
- Imputation: replace with most common colors
- Note: inconsequential, so not flagging

In [23]:
# count nulls
print(df['color_appraisal'].isna().sum())
print(df['color'].isna().sum())

1318
33


In [24]:
# modes
print(df['color_appraisal'].mode()[0])
print(df['color'].mode()[0])

# impute
df['color_appraisal'].fillna(df['color_appraisal'].mode()[0], inplace=True)
df['color'].fillna(df['color'].mode()[0], inplace=True)

Black
White


In [25]:
# count nulls
print(df['color_appraisal'].isna().sum())
print(df['color'].isna().sum())

0
0


#### MPGs and Fuel Capacities
- <1% each
- Imputation: assume average
- Note: inconsequential, so not flagging

In [26]:
# count nulls
print(df['mpg_city'].isna().sum())
print(df['mpg_highway'].isna().sum())
print(df['fuel_capacity_appraisal'].isna().sum())
print(df['fuel_capacity'].isna().sum())
print(df['mpg_city_appraisal'].isna().sum())

590
590
312
4
1


In [28]:
# means
print(df['mpg_city'].mean())
print(df['mpg_highway'].mean())
print(df['fuel_capacity_appraisal'].mean())
print(df['fuel_capacity'].mean())
print(df['mpg_city_appraisal'].mean())

# impute
df['mpg_city'].fillna(df['mpg_city'].mean().astype('int'), inplace=True)
df['mpg_highway'].fillna(df['mpg_highway'].mean().astype('int'), inplace=True)
df['fuel_capacity_appraisal'].fillna(df['fuel_capacity_appraisal'].mean().astype('int'), inplace=True)
df['fuel_capacity'].fillna(df['fuel_capacity'].mean().astype('int'), inplace=True)
df['mpg_city_appraisal'].fillna(df['mpg_city_appraisal'].mean().astype('int'), inplace=True)

22.172117747354697
29.376671179980942
17.56572753495453
17.579301586031722
21.679143395716977


In [29]:
# count nulls
print(df['mpg_city'].isna().sum())
print(df['mpg_highway'].isna().sum())
print(df['fuel_capacity_appraisal'].isna().sum())
print(df['fuel_capacity'].isna().sum())
print(df['mpg_city_appraisal'].isna().sum())

0
0
0
0
0


## Validation

In [30]:
# would think these counts would stop increasing as a make/model/trim has same features
print(len(df[['make','model']].value_counts(dropna=False)))
print(len(df[['make','model','premium_trim']].value_counts(dropna=False)))
print(len(df[['make','model','premium_trim','body']].value_counts(dropna=False)))
print(len(df[['make','model','premium_trim','body','engine']].value_counts(dropna=False)))
print(len(df[['make','model','premium_trim','body','engine','cylinders']].value_counts(dropna=False)))
print(len(df[['make','model','premium_trim','body','engine','cylinders','horsepower']].value_counts(dropna=False)))
print(len(df[['make','model','premium_trim','body','engine','cylinders','horsepower','fuel_capacity']].value_counts(dropna=False)))

575
874
1013
1563
2474
3356
3736


### Engine
Here I've discovered a reliability problem with engine data
- Make/model/engine should determine cylinders/horsepower/fuel_capacity/mpg_*

In [31]:
print(df.columns)

Index(['price', 'price_appraisal', 'online_appraisal_flag', 'market',
       'model_year', 'mileage', 'make', 'model', 'premium_trim', 'body',
       'color', 'engine', 'cylinders', 'mpg_city', 'mpg_highway', 'horsepower',
       'fuel_capacity', 'model_year_appraisal', 'mileage_appraisal',
       'make_appraisal', 'model_appraisal', 'premium_trim_appraisal',
       'body_appraisal', 'color_appraisal', 'engine_appraisal',
       'cylinders_appraisal', 'mpg_city_appraisal', 'mpg_highway_appraisal',
       'horsepower_appraisal', 'fuel_capacity_appraisal',
       'imputed_flag_model_appraisal', 'imputed_flag_body'],
      dtype='object')


In [32]:
df_HXQ = df[df['model'] == 'HXQ_8']

In [33]:
df_HXQ.groupby(['make','model','premium_trim','body','engine','cylinders'], dropna=False, as_index=False).size()

Unnamed: 0,make,model,premium_trim,body,engine,cylinders,size
0,AIH,HXQ_8,0,A,1.5L,0,0
1,AIH,HXQ_8,0,A,1.5L,2,0
2,AIH,HXQ_8,0,A,1.5L,3,0
3,AIH,HXQ_8,0,A,1.5L,4,0
4,AIH,HXQ_8,0,A,1.5L,5,0
...,...,...,...,...,...,...,...
12427,BIS,HXQ_8,1,G,2.4L,3,0
12428,BIS,HXQ_8,1,G,2.4L,4,0
12429,BIS,HXQ_8,1,G,2.4L,5,0
12430,BIS,HXQ_8,1,G,2.4L,6,0


Seems OK to have different engines for the same make/model
- Cutoff for data entry error? (e.g. 2.4L below has 48 obs)

In [34]:
engines = df_HXQ.groupby(['make','model','engine'], dropna=False, as_index=False).size()
engines['pct'] = engines['size']/engines['size'].sum()
engines

Unnamed: 0,make,model,engine,size,pct
0,AIH,HXQ_8,1.5L,0,0.0
1,AIH,HXQ_8,1.8L,0,0.0
2,AIH,HXQ_8,2.0L,0,0.0
3,AIH,HXQ_8,2.4L,0,0.0
4,ARU,HXQ_8,1.5L,0,0.0
...,...,...,...,...,...
143,FJR,HXQ_8,2.4L,0,0.0
144,BIS,HXQ_8,1.5L,0,0.0
145,BIS,HXQ_8,1.8L,0,0.0
146,BIS,HXQ_8,2.0L,0,0.0


In [35]:
df_HXQ.groupby(['make','model','engine','cylinders'], dropna=False, as_index=False).size()

Unnamed: 0,make,model,engine,cylinders,size
0,AIH,HXQ_8,1.5L,0,0
1,AIH,HXQ_8,1.5L,2,0
2,AIH,HXQ_8,1.5L,3,0
3,AIH,HXQ_8,1.5L,4,0
4,AIH,HXQ_8,1.5L,5,0
...,...,...,...,...,...
1031,BIS,HXQ_8,2.4L,3,0
1032,BIS,HXQ_8,2.4L,4,0
1033,BIS,HXQ_8,2.4L,5,0
1034,BIS,HXQ_8,2.4L,6,0


In [36]:
df_HXQ.groupby(['make','model','engine','horsepower'], dropna=False, as_index=False).size()

Unnamed: 0,make,model,engine,horsepower,size
0,AIH,HXQ_8,1.5L,140,0
1,AIH,HXQ_8,1.5L,143,0
2,AIH,HXQ_8,1.5L,158,0
3,AIH,HXQ_8,1.5L,174,0
4,AIH,HXQ_8,1.5L,180,0
...,...,...,...,...,...
1179,BIS,HXQ_8,2.4L,174,0
1180,BIS,HXQ_8,2.4L,180,0
1181,BIS,HXQ_8,2.4L,201,0
1182,BIS,HXQ_8,2.4L,205,0


Based on below, make/model/trim doesn't determine the other vehicle characteristics I think it would
- Data entry problem?

### Other Variables
- Identify areas of interest to pull out from `0_eda_profile_report.ipynb`

## Write Out Cleaned/Validated Data

In [37]:
file = 'data/cleaned_data.pkl'
df.to_pickle(file)