# Exploring eBay Car Sales Data

## Introduction

dataset
- used cars from *eBay Kleinanzeigen*, German eBay classifieds
- sample of 50,000 data points for this project

project goal
- clean dataset
- initial analysis

In [1]:
import numpy as np
import pandas as pd

### 1. read in file, prelim explore

In [2]:
autos = pd.read_csv('autos.csv', encoding='latin1')

In [3]:
autos[:5]

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


In [4]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateCrawled          50000 non-null  object
 1   name                 50000 non-null  object
 2   seller               50000 non-null  object
 3   offerType            50000 non-null  object
 4   price                50000 non-null  object
 5   abtest               50000 non-null  object
 6   vehicleType          44905 non-null  object
 7   yearOfRegistration   50000 non-null  int64 
 8   gearbox              47320 non-null  object
 9   powerPS              50000 non-null  int64 
 10  model                47242 non-null  object
 11  odometer             50000 non-null  object
 12  monthOfRegistration  50000 non-null  int64 
 13  fuelType             45518 non-null  object
 14  brand                50000 non-null  object
 15  notRepairedDamage    40171 non-null  object
 16  date

##### missing data for `vehicleType`, `gearbox`, `model`, `fuelType`, `notRepairedDamage`
##### `brand`= American make

### 2. clean column names and values

In [5]:
# (1) CamelCase to snake_case, (2) descriptive names

autos.columns

Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

In [20]:
new_cols = ['date_crawled', 'car_name', 'seller_type', 'offer_type', 'price', 'ab_test','vehicle_type', 
            'registration_year', 'transmission', 'power_in_ps', 'model','mileage_km', 'registration_month', 
            'fuel_type', 'make','damage_not_repaired', 'ad_created', 'num_pictures', 'postal_code',
            'last_seen_online']

autos.columns = new_cols
autos


Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,ad_created,num_pictures,postal_code,last_seen_online
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,5000,control,bus,2004,manuell,158,andere,150000,3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,8500,control,limousine,1997,automatik,286,7er,150000,6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,8990,test,limousine,2009,manuell,102,golf,70000,7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,4350,control,kleinwagen,2007,automatik,71,fortwo,70000,6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,1350,test,kombi,2003,manuell,0,focus,150000,7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2016-03-27 14:38:19,Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon,privat,Angebot,24900,control,limousine,2011,automatik,239,q5,100000,1,diesel,audi,nein,2016-03-27 00:00:00,0,82131,2016-04-01 13:47:40
49996,2016-03-28 10:50:25,Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+...,privat,Angebot,1980,control,cabrio,1996,manuell,75,astra,150000,5,benzin,opel,nein,2016-03-28 00:00:00,0,44807,2016-04-02 14:18:02
49997,2016-04-02 14:44:48,Fiat_500_C_1.2_Dualogic_Lounge,privat,Angebot,13200,test,cabrio,2014,automatik,69,500,5000,11,benzin,fiat,nein,2016-04-02 00:00:00,0,73430,2016-04-04 11:47:27
49998,2016-03-08 19:25:42,Audi_A3_2.0_TDI_Sportback_Ambition,privat,Angebot,22900,control,kombi,2013,manuell,150,a3,40000,11,diesel,audi,nein,2016-03-08 00:00:00,0,35683,2016-04-05 16:45:07


### 3. initial exploration and cleaning

In [21]:
# (1) cols numeric data, clean and convert to int, (2) drop cols same values 

autos.describe(include='all')

Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,ad_created,num_pictures,postal_code,last_seen_online
count,50000,50000,50000,50000,50000.0,50000,44905,50000.0,47320,50000.0,47242,50000.0,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,2,2,,2,8,,2,,245,,,7,40,2,76,,,39481
top,2016-03-19 17:36:18,Ford_Fiesta,privat,Angebot,,test,limousine,,manuell,,golf,,,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,49999,49999,,25756,12859,,36993,,4024,,,30107,10687,35232,1946,,,8
mean,,,,,9840.044,,,2005.07328,,116.35592,,125732.7,5.72336,,,,,0.0,50813.6273,
std,,,,,481104.4,,,105.712813,,209.216627,,40042.211706,3.711984,,,,,0.0,25779.747957,
min,,,,,0.0,,,1000.0,,0.0,,5000.0,0.0,,,,,0.0,1067.0,
25%,,,,,1100.0,,,1999.0,,70.0,,125000.0,3.0,,,,,0.0,30451.0,
50%,,,,,2950.0,,,2003.0,,105.0,,150000.0,6.0,,,,,0.0,49577.0,
75%,,,,,7200.0,,,2008.0,,150.0,,150000.0,9.0,,,,,0.0,71540.0,


#### (1) cols numeric data, clean and convert to int
clean data:

- mileage_km(odometer)          need to remove , and km, dtype=object
- date_crawled                  format=2016-03-26 17:47:46, convert to datetime, dtype=object
- price                         need to remove $ and , dtype=object
- registration_year             dtype=int64
- make                          convert to string, dtype=object

In [37]:
# price: remove $ and , and convert to int

autos['price'] = (autos['price']
                  .str.replace('$', '')
                  .str.replace(',', '')
                  .astype(int)
                 )
autos.rename({'price':'price'}, axis=1, inplace=True)
autos['price'].head()

In [28]:
# mileage_km (odometer): remove km and , and convert to int

autos['mileage_km'] = (autos['mileage_km']
                             .str.replace('km','')
                             .str.replace(',','')
                             .astype(int)
                             )
autos.rename({'mileage_km':'mileage_km'}, axis=1, inplace=True)
autos['mileage_km'].head()

In [30]:
autos[:2]

Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,ad_created,num_pictures,postal_code,last_seen_online
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,5000,control,bus,2004,manuell,158,andere,150000,3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,8500,control,limousine,1997,automatik,286,7er,150000,6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08


#### (2) drop cols same values 
cols with low unique:
- seller_type 2
- offer_type 2
- ab_test 2
- transmission 2
- fuel_type 7
- damage_not_repaired 2

In [31]:
#drop columns?

### 4. exploring mileage_km (odometer) and prive cols

In [32]:
# data quality check: mileage_km

autos['mileage_km'].unique().shape  #(13,)
autos['mileage_km'].describe()      # min 5,000 max 150,000
autos['mileage_km'].value_counts().sort_index(ascending=False)  # 967 cars w/500 km, 32,424 cars w/150,000


mileage_km
150000    32424
125000     5170
100000     2169
90000      1757
80000      1436
70000      1230
60000      1164
50000      1027
40000       819
30000       789
20000       784
10000       264
5000        967
Name: count, dtype: int64

In [33]:
# data quality check: price

autos['price'].unique().shape  #(2357,)
autos['price'].describe()      # min 0.000000e+00, max 1.000000e+08
autos['price'].value_counts().sort_index(ascending=True) # 1,421 cars w/0 price

price
0           1421
1            156
2              3
3              1
5              2
            ... 
10000000       1
11111111       2
12345678       3
27322222       1
99999999       1
Name: count, Length: 2357, dtype: int64

In [44]:
# remove cars with price over $350,000 (keep $1 as likely opening bid

autos = autos[autos['price'].between(1,351000)]
autos['price'].describe()

count     48565.000000
mean       5888.935591
std        9059.854754
min           1.000000
25%        1200.000000
50%        3000.000000
75%        7490.000000
max      350000.000000
Name: price, dtype: float64

In [45]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48565 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   date_crawled         48565 non-null  object
 1   car_name             48565 non-null  object
 2   seller_type          48565 non-null  object
 3   offer_type           48565 non-null  object
 4   price                48565 non-null  int64 
 5   ab_test              48565 non-null  object
 6   vehicle_type         43979 non-null  object
 7   registration_year    48565 non-null  int64 
 8   transmission         46222 non-null  object
 9   power_in_ps          48565 non-null  int64 
 10  model                46107 non-null  object
 11  mileage_km           48565 non-null  int64 
 12  registration_month   48565 non-null  int64 
 13  fuel_type            44535 non-null  object
 14  make                 48565 non-null  object
 15  damage_not_repaired  39464 non-null  object
 16  ad_create

### 5. exploring date cols

In [38]:
autos[['date_crawled', 'ad_created','last_seen_online']][0:5]

Unnamed: 0,date_crawled,ad_created,last_seen_online
0,2016-03-26 17:47:46,2016-03-26 00:00:00,2016-04-06 06:45:54
1,2016-04-04 13:38:56,2016-04-04 00:00:00,2016-04-06 14:45:08
2,2016-03-26 18:57:24,2016-03-26 00:00:00,2016-04-06 20:15:37
3,2016-03-12 16:58:10,2016-03-12 00:00:00,2016-03-15 03:16:28
4,2016-04-01 14:38:50,2016-04-01 00:00:00,2016-04-01 14:38:50


In [53]:
#distribution of ad_created

(autos['ad_created']
 .str[:10]
 .value_counts(normalize=True, dropna=False)
 .sort_index(ascending=False)
)

ad_created
2016-04-07    0.001256
2016-04-06    0.003253
2016-04-05    0.011819
2016-04-04    0.036858
2016-04-03    0.038855
                ...   
2015-12-05    0.000021
2015-11-10    0.000021
2015-09-09    0.000021
2015-08-10    0.000021
2015-06-11    0.000021
Name: proportion, Length: 76, dtype: float64

In [51]:
#distribution of date_crawled

(autos['date_crawled']
 .str[:10]
 .value_counts(normalize=True, dropna=False)
 .sort_index()
)

date_crawled
2016-03-05    0.025327
2016-03-06    0.014043
2016-03-07    0.036014
2016-03-08    0.033296
2016-03-09    0.033090
2016-03-10    0.032184
2016-03-11    0.032575
2016-03-12    0.036920
2016-03-13    0.015670
2016-03-14    0.036549
2016-03-15    0.034284
2016-03-16    0.029610
2016-03-17    0.031628
2016-03-18    0.012911
2016-03-19    0.034778
2016-03-20    0.037887
2016-03-21    0.037373
2016-03-22    0.032987
2016-03-23    0.032225
2016-03-24    0.029342
2016-03-25    0.031607
2016-03-26    0.032204
2016-03-27    0.031092
2016-03-28    0.034860
2016-03-29    0.034099
2016-03-30    0.033687
2016-03-31    0.031834
2016-04-01    0.033687
2016-04-02    0.035478
2016-04-03    0.038608
2016-04-04    0.036487
2016-04-05    0.013096
2016-04-06    0.003171
2016-04-07    0.001400
Name: proportion, dtype: float64

In [50]:
#distribution of last_seen_online

(autos['last_seen_online']
 .str[:10]
 .value_counts(normalize=True, dropna=False)
 .sort_index()
)

last_seen_online
2016-03-05    0.001071
2016-03-06    0.004324
2016-03-07    0.005395
2016-03-08    0.007413
2016-03-09    0.009595
2016-03-10    0.010666
2016-03-11    0.012375
2016-03-12    0.023783
2016-03-13    0.008895
2016-03-14    0.012602
2016-03-15    0.015876
2016-03-16    0.016452
2016-03-17    0.028086
2016-03-18    0.007351
2016-03-19    0.015834
2016-03-20    0.020653
2016-03-21    0.020632
2016-03-22    0.021373
2016-03-23    0.018532
2016-03-24    0.019767
2016-03-25    0.019211
2016-03-26    0.016802
2016-03-27    0.015649
2016-03-28    0.020859
2016-03-29    0.022341
2016-03-30    0.024771
2016-03-31    0.023783
2016-04-01    0.022794
2016-04-02    0.024915
2016-04-03    0.025203
2016-04-04    0.024483
2016-04-05    0.124761
2016-04-06    0.221806
2016-04-07    0.131947
Name: proportion, dtype: float64

In [56]:
#distribution registration_year

(autos['registration_year']
 .value_counts(normalize=True, dropna=False)
 .sort_index()
)

registration_year
1000    0.000021
1001    0.000021
1111    0.000021
1800    0.000041
1910    0.000103
          ...   
5911    0.000021
6200    0.000021
8888    0.000021
9000    0.000021
9999    0.000062
Name: proportion, Length: 95, dtype: float64

##### top 4 and bottom 5 years are not possible