In [None]:
####################################################################
# Prepared for Gabor's Data Analysis
#
# Data Analysis for Business, Economics, and Policy
# by Gabor Bekes and Gabor Kezdi
# Cambridge University Press 2021
#
# gabors-data-analysis.com
#
# License: Free to share, modify and use for educational purposes.
# 	Not to be used for commercial purposes.
#
####################################################################

####################################################################
# used-car-la dataset
#
# input:
#       used_cars_2cities.csv

# output:
#       used-cars_2cities_prep_py.csv

# version 1.0   2021-06-01
####################################################################

In [149]:
### SETTING UP DIRECTORIES

# import packages
import pandas as pd
import os
import numpy as np

# Set the working directory to the root of the local da_data_repo copy.
# To use a different location, set the DA_DATA_REPO environment variable
# instead of editing the fallback path below.
os.chdir(os.environ.get("DA_DATA_REPO", "C:/workspace/stata/"))

# location folders (relative to the working directory set above)
data_in = "./used-cars/raw/"
data_out = "./used-cars/clean/"

In [150]:
# Read the raw listings file from the input folder into a DataFrame.
raw_csv_path = data_in + "used_cars_2cities.csv"
df = pd.read_csv(raw_csv_path, sep=",", quotechar='"', encoding="utf-8")

In [151]:
# Remove the "v1" column, then drop fully duplicated rows,
# keeping the first occurrence of each.
df = df.drop(columns=["v1"]).drop_duplicates(keep="first")

In [152]:
# Generate integer and logarithmic price variables.
# The raw text price is kept under "pricestr"; rows without a price are dropped.
df = df.rename(columns={"price": "pricestr"}).dropna(subset=["pricestr"])

# Strip the currency sign and surrounding whitespace, then convert to integer.
price_text = df["pricestr"].str.strip("$ \n\t")
df["price"] = price_text.astype("int")

# Natural log of the integer price.
df["lnprice"] = np.log(df["price"])

In [155]:
# Generate year and age variables from the name string:
# the first space-separated token of "name" is read as the year.
name_tokens = df["name"].str.split(" ", expand=True)
df["year"] = name_tokens[0].astype("int")

# Age is counted relative to 2017, with a car from 2017 having age 1.
df["age"] = 2017 - df["year"] + 1

In [139]:
# Filter by odometer.

# Rescale the odometer reading to units of 10,000 (1.0 == 10,000).
# NOTE: not idempotent -- re-running this cell divides the column again.
df["odometer"] = df["odometer"] / 10000

# Drop listings whose rescaled reading is below 1 although the car is at
# least 3 years old. Rows with a missing odometer compare as False and stay.
implausible = (df["odometer"] < 1) & (df["age"] >= 3)
df = df.loc[~implausible].copy()

# Fill the remaining missing readings with the mean reading of cars of the
# same age. The group mean is computed in its own statement: the original
# multi-line chain had no enclosing parentheses, so its continuation lines
# starting with "." were a syntax error.
age_group_mean = df.groupby("age")["odometer"].transform("mean")
df["odometer"] = df["odometer"].fillna(age_group_mean)

# Natural log of the rescaled, gap-filled odometer reading.
df["lnodometer"] = np.log(df["odometer"])

In [127]:
# Summary statistics of the main numeric variables.
major_columns = ["price", "lnprice", "age", "odometer"]
df[major_columns].describe()

Unnamed: 0,price,lnprice,age,odometer
count,549.0,549.0,549.0,549.0
mean,5866.153005,8.299096,12.183971,13.031974
std,5657.673934,1.018207,5.887079,7.197456
min,1.0,0.0,1.0,0.07
25%,2400.0,7.783224,8.0,8.304973
50%,4300.0,8.36637,12.0,13.6559
75%,8299.0,9.02389,16.0,16.450363
max,90002.0,11.407587,27.0,118.0


In [144]:
# Generate 0/1 feature dummies from the lower-cased listing name.
# Each dummy is 1 when the name contains the given token (leading space
# included). NOTE(review): this is a substring match, so " le" also matches
# names containing " lexus" and " se" also matches " sedan" -- confirm this
# is intended.
name_lower = df["name"].str.lower()
for dummy_column, token in (
    ("LE", " le"),
    ("XLE", " xle"),
    ("SE", " se"),
    ("Hybrid", " hybrid"),
):
    df[dummy_column] = name_lower.str.contains(token).astype(int)

In [145]:
# Summary statistics of the feature dummies.
dummy_columns = ["LE", "SE", "XLE", "Hybrid"]
df[dummy_columns].describe()

Unnamed: 0,LE,SE,XLE,Hybrid
count,578.0,578.0,578.0,578.0
mean,0.197232,0.088235,0.072664,0.043253
std,0.398253,0.283882,0.25981,0.203601
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0


In [148]:
# Write the prepared data to the output folder, without the index column.
prepared_csv_path = data_out + "used-cars_2cities_prep_py.csv"
df.to_csv(prepared_csv_path, index=False)

PermissionError: [Errno 13] Permission denied: './used-cars/clean/used-cars_2cities_prep_py.csv'