# Exploring eBay Car Sales Data

## Introduction

dataset
- used cars from *eBay Kleinanzeigen*, German eBay classifieds
- sample of 50,000 data points for this project

project goal
- clean dataset
- initial analysis

In [1]:
import numpy as np
import pandas as pd
# Display option: None removes the row limit, so long Series/DataFrames are shown in full.
pd.set_option('display.max_rows', None)


### 1. read in file, prelim explore

In [2]:
# Load the listings; the file is decoded as Latin-1 instead of pandas' default UTF-8.
autos = pd.read_csv('autos.csv', encoding='latin1')

In [3]:
# Preview the first five listings.
autos.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


In [4]:
# Column dtypes and non-null counts: shows which columns have missing values
# and which numeric-looking columns are still stored as text (object).
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateCrawled          50000 non-null  object
 1   name                 50000 non-null  object
 2   seller               50000 non-null  object
 3   offerType            50000 non-null  object
 4   price                50000 non-null  object
 5   abtest               50000 non-null  object
 6   vehicleType          44905 non-null  object
 7   yearOfRegistration   50000 non-null  int64 
 8   gearbox              47320 non-null  object
 9   powerPS              50000 non-null  int64 
 10  model                47242 non-null  object
 11  odometer             50000 non-null  object
 12  monthOfRegistration  50000 non-null  int64 
 13  fuelType             45518 non-null  object
 14  brand                50000 non-null  object
 15  notRepairedDamage    40171 non-null  object
 16  date

In [5]:
# Summary statistics; with no arguments only the numeric columns are summarised.
autos.describe()

Unnamed: 0,yearOfRegistration,powerPS,monthOfRegistration,nrOfPictures,postalCode
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,2005.07328,116.35592,5.72336,0.0,50813.6273
std,105.712813,209.216627,3.711984,0.0,25779.747957
min,1000.0,0.0,0.0,0.0,1067.0
25%,1999.0,70.0,3.0,0.0,30451.0
50%,2003.0,105.0,6.0,0.0,49577.0
75%,2008.0,150.0,9.0,0.0,71540.0
max,9999.0,17700.0,12.0,0.0,99998.0


##### missing data for `vehicleType`, `gearbox`, `model`, `fuelType`, `notRepairedDamage`
##### `brand` holds the car manufacturer (the *make*, in American usage)

### 2. clean column names and values

In [6]:
# Plan: (1) convert the CamelCase column names to snake_case,
#       (2) replace terse names with more descriptive ones.
# Start by listing the current column names.

autos.columns

Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

In [7]:
# Rename every original CamelCase column to a descriptive snake_case name.
# An explicit old -> new mapping (instead of assigning a positional list to
# autos.columns) keeps each label attached to the right column even if the
# column order of the CSV ever changes, and documents the renames in one place.
column_renames = {
    'dateCrawled': 'date_crawled',
    'name': 'car_name',
    'seller': 'seller_type',
    'offerType': 'offer_type',
    'price': 'price',
    'abtest': 'ab_test',
    'vehicleType': 'vehicle_type',
    'yearOfRegistration': 'registration_year',
    'gearbox': 'transmission',
    'powerPS': 'power_in_ps',
    'model': 'model',
    'odometer': 'mileage_km',
    'monthOfRegistration': 'registration_month',
    'fuelType': 'fuel_type',
    'brand': 'make',
    'notRepairedDamage': 'damage_not_repaired',
    'dateCreated': 'ad_created',
    'nrOfPictures': 'num_pictures',
    'postalCode': 'postal_code',
    'lastSeen': 'last_seen_online',
}
new_cols = list(column_renames.values())

# rename() ignores mapping keys that are not present, so re-running this cell
# is harmless; the assert catches a source file with unexpected columns.
autos = autos.rename(columns=column_renames)
assert set(autos.columns) == set(new_cols), 'unexpected columns after renaming'
autos[:5]


Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,ad_created,num_pictures,postal_code,last_seen_online
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


### 3. initial exploration and cleaning

In [8]:
# Plan for this section:
# (1) find columns that hold numeric data stored as text, clean them and convert to int,
# (2) find columns where (nearly) all values are the same, as candidates to drop.
# include='all' adds the non-numeric columns (unique/top/freq rows) to the summary.

autos.describe(include='all')

Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,ad_created,num_pictures,postal_code,last_seen_online
count,50000,50000,50000,50000,50000,50000,44905,50000.0,47320,50000.0,47242,50000,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,2,2,2357,2,8,,2,,245,13,,7,40,2,76,,,39481
top,2016-03-19 17:36:18,Ford_Fiesta,privat,Angebot,$0,test,limousine,,manuell,,golf,"150,000km",,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,49999,49999,1421,25756,12859,,36993,,4024,32424,,30107,10687,35232,1946,,,8
mean,,,,,,,,2005.07328,,116.35592,,,5.72336,,,,,0.0,50813.6273,
std,,,,,,,,105.712813,,209.216627,,,3.711984,,,,,0.0,25779.747957,
min,,,,,,,,1000.0,,0.0,,,0.0,,,,,0.0,1067.0,
25%,,,,,,,,1999.0,,70.0,,,3.0,,,,,0.0,30451.0,
50%,,,,,,,,2003.0,,105.0,,,6.0,,,,,0.0,49577.0,
75%,,,,,,,,2008.0,,150.0,,,9.0,,,,,0.0,71540.0,


#### (1) cols numeric data, clean and convert to int
clean data:

- mileage_km(odometer)          need to remove , and km, dtype=object
- date_crawled                  format=2016-03-26 17:47:46, convert to datetime, dtype=object
- price                         need to remove $ and , dtype=object
- registration_year             dtype=int64
- make                          convert to string, dtype=object

In [9]:
# price: strip the currency symbol and thousands separators, then convert to int.
# - regex=False states explicitly that '$' is a literal character (as a regular
#   expression it is the end-of-string anchor), so the result does not depend on
#   the default of the installed pandas version.
# - astype(str) first makes the cell safe to re-run after the column is already int.
# The column keeps its name, so no rename is needed.

autos['price'] = (autos['price']
                  .astype(str)
                  .str.replace('$', '', regex=False)
                  .str.replace(',', '', regex=False)
                  .astype(int)
                 )
autos['price'].head()

0    5000
1    8500
2    8990
3    4350
4    1350
Name: price, dtype: int64

In [10]:
# mileage_km (odometer): strip the 'km' unit and thousands separators, then convert to int.
# - regex=False treats both patterns as literal text regardless of the pandas default.
# - astype(str) first makes the cell safe to re-run after the column is already int.
# The column was already renamed to mileage_km, so no further rename is needed.

autos['mileage_km'] = (autos['mileage_km']
                       .astype(str)
                       .str.replace('km', '', regex=False)
                       .str.replace(',', '', regex=False)
                       .astype(int)
                       )
autos['mileage_km'].head()

0    150000
1    150000
2     70000
3     70000
4    150000
Name: mileage_km, dtype: int64

In [11]:
# Confirm that price and mileage_km are now plain integers.
autos.head(2)

Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,ad_created,num_pictures,postal_code,last_seen_online
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,5000,control,bus,2004,manuell,158,andere,150000,3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,8500,control,limousine,1997,automatik,286,7er,150000,6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08


#### (2) drop cols same values 
cols with low unique:
- seller_type 2
- offer_type 2
- ab_test 2
- transmission 2
- fuel_type 7
- damage_not_repaired 2

In [12]:
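# TODO: decide whether to drop the (near-)constant columns: seller_type and offer_type hold one value in 49,999 of 50,000 rows, and num_pictures is always 0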

### 4. exploring mileage_km (odometer) and price cols

In [13]:
# data quality check: mileage_km
# Findings: 13 distinct values, min 5,000 km, max 150,000 km;
# 967 cars at 5,000 km and 32,424 cars at 150,000 km.
# Only the last expression of a cell is displayed, so the earlier unique()/describe()
# calls (whose results were discarded) are summarised above instead of being run.

autos['mileage_km'].value_counts().sort_index(ascending=False)


mileage_km
150000    32424
125000     5170
100000     2169
90000      1757
80000      1436
70000      1230
60000      1164
50000      1027
40000       819
30000       789
20000       784
10000       264
5000        967
Name: count, dtype: int64

In [14]:
# data quality check: price
# The listing counts per price are computed once and reused for both ends.

price_col = autos['price']
price_counts = price_col.value_counts().sort_index(ascending=True)

print('shape:', price_col.unique().shape)   # (2357,)
print('describe:', price_col.describe())    # min 0.000000e+00, max 1.000000e+08
print('head:', price_counts.head())         # 1,421 cars w/0 price
print('tail:', price_counts.tail())


shape: (2357,)
describe: count    5.000000e+04
mean     9.840044e+03
std      4.811044e+05
min      0.000000e+00
25%      1.100000e+03
50%      2.950000e+03
75%      7.200000e+03
max      1.000000e+08
Name: price, dtype: float64
head: price
0    1421
1     156
2       3
3       1
5       2
Name: count, dtype: int64
tail: price
10000000    1
11111111    2
12345678    3
27322222    1
99999999    1
Name: count, dtype: int64


In [15]:
# Keep only listings priced from $1 to $350,000:
# - $0 listings are dropped,
# - $1 is kept as a likely opening bid,
# - anything above $350,000 is treated as an implausible outlier.
# between() is inclusive on both ends, so the bounds equal the stated cut-offs.

MIN_PRICE, MAX_PRICE = 1, 350000
autos = autos[autos['price'].between(MIN_PRICE, MAX_PRICE)]
autos['price'].describe()

count     48565.000000
mean       5888.935591
std        9059.854754
min           1.000000
25%        1200.000000
50%        3000.000000
75%        7490.000000
max      350000.000000
Name: price, dtype: float64

In [16]:
# Re-check row count and dtypes after the price filter.
autos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48565 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   date_crawled         48565 non-null  object
 1   car_name             48565 non-null  object
 2   seller_type          48565 non-null  object
 3   offer_type           48565 non-null  object
 4   price                48565 non-null  int64 
 5   ab_test              48565 non-null  object
 6   vehicle_type         43979 non-null  object
 7   registration_year    48565 non-null  int64 
 8   transmission         46222 non-null  object
 9   power_in_ps          48565 non-null  int64 
 10  model                46107 non-null  object
 11  mileage_km           48565 non-null  int64 
 12  registration_month   48565 non-null  int64 
 13  fuel_type            44535 non-null  object
 14  make                 48565 non-null  object
 15  damage_not_repaired  39464 non-null  object
 16  ad_create

### 5. exploring date cols

In [17]:
# Preview the three timestamp columns side by side.
date_cols = ['date_crawled', 'ad_created', 'last_seen_online']
autos[date_cols].head()

Unnamed: 0,date_crawled,ad_created,last_seen_online
0,2016-03-26 17:47:46,2016-03-26 00:00:00,2016-04-06 06:45:54
1,2016-04-04 13:38:56,2016-04-04 00:00:00,2016-04-06 14:45:08
2,2016-03-26 18:57:24,2016-03-26 00:00:00,2016-04-06 20:15:37
3,2016-03-12 16:58:10,2016-03-12 00:00:00,2016-03-15 03:16:28
4,2016-04-01 14:38:50,2016-04-01 00:00:00,2016-04-01 14:38:50


In [18]:
# Share of listings per ad-creation day, newest day first.
# The first 10 characters of each timestamp string are the YYYY-MM-DD date.

ad_created_days = autos['ad_created'].str[:10]
ad_created_share = ad_created_days.value_counts(normalize=True, dropna=False)
ad_created_norm = ad_created_share.sort_index(ascending=False)

print(ad_created_norm.head(5))
print('\n')
print(ad_created_norm.tail(5))

ad_created
2016-04-07    0.001256
2016-04-06    0.003253
2016-04-05    0.011819
2016-04-04    0.036858
2016-04-03    0.038855
Name: proportion, dtype: float64


ad_created
2015-12-05    0.000021
2015-11-10    0.000021
2015-09-09    0.000021
2015-08-10    0.000021
2015-06-11    0.000021
Name: proportion, dtype: float64


In [19]:
# Share of listings per crawl day, oldest day first.
# The first 10 characters of each timestamp string are the YYYY-MM-DD date.

crawl_days = autos['date_crawled'].str[:10]
crawl_share = crawl_days.value_counts(normalize=True, dropna=False)
date_crawled_norm = crawl_share.sort_index()

print(date_crawled_norm.head(5))
print('\n')
print(date_crawled_norm.tail(5))

date_crawled
2016-03-05    0.025327
2016-03-06    0.014043
2016-03-07    0.036014
2016-03-08    0.033296
2016-03-09    0.033090
Name: proportion, dtype: float64


date_crawled
2016-04-03    0.038608
2016-04-04    0.036487
2016-04-05    0.013096
2016-04-06    0.003171
2016-04-07    0.001400
Name: proportion, dtype: float64


In [20]:
# Share of listings per day on which the ad was last seen online, oldest day first.
# The first 10 characters of each timestamp string are the YYYY-MM-DD date.

last_seen_days = autos['last_seen_online'].str[:10]
last_seen_share = last_seen_days.value_counts(normalize=True, dropna=False)
last_seen_norm = last_seen_share.sort_index()

print(last_seen_norm.head(5))
print('\n')
print(last_seen_norm.tail(5))

last_seen_online
2016-03-05    0.001071
2016-03-06    0.004324
2016-03-07    0.005395
2016-03-08    0.007413
2016-03-09    0.009595
Name: proportion, dtype: float64


last_seen_online
2016-04-03    0.025203
2016-04-04    0.024483
2016-04-05    0.124761
2016-04-06    0.221806
2016-04-07    0.131947
Name: proportion, dtype: float64


In [21]:
# Share of listings per registration year, earliest year first.
# head()/tail() select by position, so the integer year labels are not
# mistaken for positions.

reg_year_share = autos['registration_year'].value_counts(normalize=True, dropna=False)
reg_year_norm = reg_year_share.sort_index()

print(reg_year_norm.head(5))
print('\n')
print(reg_year_norm.tail(6))

registration_year
1000    0.000021
1001    0.000021
1111    0.000021
1800    0.000041
1910    0.000103
Name: proportion, dtype: float64


registration_year
5000    0.000082
5911    0.000021
6200    0.000021
8888    0.000021
9000    0.000021
9999    0.000062
Name: proportion, dtype: float64


##### the 4 earliest years shown (1000–1800) and the 6 latest (5000–9999) are not possible

### 6. dealing with incorrect registration year data

##### highest_year = 2015 
##### lowest_year = 1900

In [22]:
# Keep only listings whose registration year lies in the accepted range
# (both bounds inclusive).
lowest_year, highest_year = 1900, 2015
year_is_valid = autos['registration_year'].between(lowest_year, highest_year)
autos = autos[year_is_valid]
autos['registration_year'].describe()

count    45461.000000
mean      2002.559491
std          6.949093
min       1910.000000
25%       1999.000000
50%       2003.000000
75%       2007.000000
max       2015.000000
Name: registration_year, dtype: float64

##### Most cars were registered between 1994 and 2005; registration year is a proxy for model year
##### Registration years before 1958 may be typos: it is unlikely that so many vintage cars are listed on eBay

### 7. exploring price by brand

#### look at American made cars to see how well they resell on German eBay 

In [23]:
# List the car makes: share of listings per make, in alphabetical order.

make_col = autos['make']
car_unique = make_col.unique()   # distinct makes, in order of first appearance
car_make = make_col.value_counts(normalize=True).sort_index()
print(car_make)

make
alfa_romeo        0.006665
audi              0.087240
bmw               0.111172
chevrolet         0.005763
chrysler          0.003541
citroen           0.014012
dacia             0.002662
daewoo            0.001518
daihatsu          0.002530
fiat              0.025362
ford              0.069708
honda             0.007831
hyundai           0.010075
jaguar            0.001606
jeep              0.002288
kia               0.007193
lada              0.000594
lancia            0.001100
land_rover        0.002112
mazda             0.015134
mercedes_benz     0.097336
mini              0.008777
mitsubishi        0.008183
nissan            0.015244
opel              0.106223
peugeot           0.029608
porsche           0.006181
renault           0.046567
rover             0.001320
saab              0.001672
seat              0.018103
skoda             0.016630
smart             0.014144
sonstige_autos    0.009877
subaru            0.002156
suzuki            0.005917
toyota            0.012

In [24]:
# Mean listing price for the American makes.
# key = make, value = mean price truncated to a whole number.
make_ame = ['jeep', 'ford', 'chrysler', 'chevrolet']

american_mean_price = {
    make: int(autos.loc[autos['make'] == make, 'price'].mean())
    for make in make_ame
}

american_mean_price

{'jeep': 11834, 'ford': 3817, 'chrysler': 3514, 'chevrolet': 6718}

##### American-made cars sell for modest mean prices (about $3,500–$6,700), with the exception of Jeeps (about $11,800)

In [25]:
# Mean listing price for every make, for comparison with the American makes.
# Makes are visited from most to least listed.
# key = make, value = mean price truncated to a whole number.

car_labels = autos['make'].value_counts().index

make_mean_price = {
    make: int(autos.loc[autos['make'] == make, 'price'].mean())
    for make in car_labels
}

##### Jeeps sell as well as luxury cars:
-  'jeep': price    11834
- 'sonstige_autos': price    11791
-  'jaguar': price    11635
-  'mini': price    10653
- 'mercedes_benz': price    8711

#### But less than Land Rovers:
- 'land_rover': price    18585





### 8. storing aggregate data in a dataframe

In [26]:
# Turn the make -> mean price dictionary into a one-column DataFrame,
# ordered from the most to the least expensive make.

make_mean_price_series = pd.Series(make_mean_price)
df = (make_mean_price_series
      .to_frame(name='mean_price')
      .sort_values(by='mean_price', ascending=False))
print(df.head(6))
print(df.columns)

                mean_price
porsche              43659
land_rover           18585
jeep                 11834
sonstige_autos       11791
jaguar               11635
mini                 10653
Index(['mean_price'], dtype='object')


In [28]:
# 1. dictionary of mean price for the 6 makes with the highest mean price
# key = car make (from car_labels)
# value = mean price, truncated to a whole number

car_labels = df.index[:6]

top_6_mean_price = {
    make: int(autos.loc[autos['make'] == make, 'price'].mean())
    for make in car_labels
}

# 2. series built from the dictionary
mean_price_series = pd.Series(top_6_mean_price)
print(mean_price_series)

porsche           43659
land_rover        18585
jeep              11834
sonstige_autos    11791
jaguar            11635
mini              10653
dtype: int64


In [29]:
# 1. dictionary of mean mileage for the 6 makes with the highest mean price
# key = car make (from car_labels)
# value = mean mileage_km, truncated to a whole number

car_labels = df.index[:6]

top_6_mean_mileage = {
    make: int(autos.loc[autos['make'] == make, 'mileage_km'].mean())
    for make in car_labels
}

# 2. series built from the dictionary
mean_mileage_series = pd.Series(top_6_mean_mileage)


In [30]:
# Combine the top-6 mean price and mean mileage series into one DataFrame;
# the mileage values are aligned to the price series by make.

mean_df = (mean_price_series
           .to_frame(name='mean_price')
           .assign(mean_mileage=mean_mileage_series))
mean_df

Unnamed: 0,mean_price,mean_mileage
porsche,43659,97544
land_rover,18585,120208
jeep,11834,126682
sonstige_autos,11791,90445
jaguar,11635,124178
mini,10653,88433


##### Porsche has the third-lowest mean mileage of the six makes, but by far the highest mean price
##### Sport utility vehicles (Land Rover & Jeep) have high mean mileage and high mean prices