# Exploring eBay Car Sales Data

## Introduction

dataset
- used cars from *eBay Kleinanzeigen*, German eBay classifieds
- sample of 50,000 data points for this project

project goal
- clean dataset
- initial analysis

In [1]:
import numpy as np
import pandas as pd

### 1. read in file, prelim explore

In [2]:
autos = pd.read_csv('autos.csv', encoding='latin1')

In [3]:
autos[:5]

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,odometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50


In [4]:
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateCrawled          50000 non-null  object
 1   name                 50000 non-null  object
 2   seller               50000 non-null  object
 3   offerType            50000 non-null  object
 4   price                50000 non-null  object
 5   abtest               50000 non-null  object
 6   vehicleType          44905 non-null  object
 7   yearOfRegistration   50000 non-null  int64 
 8   gearbox              47320 non-null  object
 9   powerPS              50000 non-null  int64 
 10  model                47242 non-null  object
 11  odometer             50000 non-null  object
 12  monthOfRegistration  50000 non-null  int64 
 13  fuelType             45518 non-null  object
 14  brand                50000 non-null  object
 15  notRepairedDamage    40171 non-null  object
 16  date

##### missing data for `vehicleType`, `gearbox`, `model`, `fuelType`, `notRepairedDamage`
##### `brand`= American make

### 2. clean column names and values

In [5]:
# (1) CamelCase to snake_case, (2) descriptive names

autos.columns

Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
       'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
       'odometer', 'monthOfRegistration', 'fuelType', 'brand',
       'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
       'lastSeen'],
      dtype='object')

In [6]:
new_cols = ['date_crawled', 'car_name', 'seller_type', 'offer_type', 'price', 'ab_test','vehicle_type', 
            'registration_year', 'transmission', 'power_in_ps', 'model','mileage_km', 'registration_month', 
            'fuel_type', 'make','damage_not_repaired', 'date_created', 'num_pictures', 'postal_code',
            'last_seen_online']

autos.columns = new_cols
autos


Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,date_created,num_pictures,postal_code,last_seen_online
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,"$5,000",control,bus,2004,manuell,158,andere,"150,000km",3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,"$8,500",control,limousine,1997,automatik,286,7er,"150,000km",6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08
2,2016-03-26 18:57:24,Volkswagen_Golf_1.6_United,privat,Angebot,"$8,990",test,limousine,2009,manuell,102,golf,"70,000km",7,benzin,volkswagen,nein,2016-03-26 00:00:00,0,35394,2016-04-06 20:15:37
3,2016-03-12 16:58:10,Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan...,privat,Angebot,"$4,350",control,kleinwagen,2007,automatik,71,fortwo,"70,000km",6,benzin,smart,nein,2016-03-12 00:00:00,0,33729,2016-03-15 03:16:28
4,2016-04-01 14:38:50,Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg...,privat,Angebot,"$1,350",test,kombi,2003,manuell,0,focus,"150,000km",7,benzin,ford,nein,2016-04-01 00:00:00,0,39218,2016-04-01 14:38:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2016-03-27 14:38:19,Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon,privat,Angebot,"$24,900",control,limousine,2011,automatik,239,q5,"100,000km",1,diesel,audi,nein,2016-03-27 00:00:00,0,82131,2016-04-01 13:47:40
49996,2016-03-28 10:50:25,Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+...,privat,Angebot,"$1,980",control,cabrio,1996,manuell,75,astra,"150,000km",5,benzin,opel,nein,2016-03-28 00:00:00,0,44807,2016-04-02 14:18:02
49997,2016-04-02 14:44:48,Fiat_500_C_1.2_Dualogic_Lounge,privat,Angebot,"$13,200",test,cabrio,2014,automatik,69,500,"5,000km",11,benzin,fiat,nein,2016-04-02 00:00:00,0,73430,2016-04-04 11:47:27
49998,2016-03-08 19:25:42,Audi_A3_2.0_TDI_Sportback_Ambition,privat,Angebot,"$22,900",control,kombi,2013,manuell,150,a3,"40,000km",11,diesel,audi,nein,2016-03-08 00:00:00,0,35683,2016-04-05 16:45:07


### 3. initial exploration and cleaning

In [7]:
# (1) cols numeric data, clean and convert to int, (2) drop cols same values 

autos.describe(include='all')

Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,date_created,num_pictures,postal_code,last_seen_online
count,50000,50000,50000,50000,50000,50000,44905,50000.0,47320,50000.0,47242,50000,50000.0,45518,50000,40171,50000,50000.0,50000.0,50000
unique,48213,38754,2,2,2357,2,8,,2,,245,13,,7,40,2,76,,,39481
top,2016-03-19 17:36:18,Ford_Fiesta,privat,Angebot,$0,test,limousine,,manuell,,golf,"150,000km",,benzin,volkswagen,nein,2016-04-03 00:00:00,,,2016-04-07 06:17:27
freq,3,78,49999,49999,1421,25756,12859,,36993,,4024,32424,,30107,10687,35232,1946,,,8
mean,,,,,,,,2005.07328,,116.35592,,,5.72336,,,,,0.0,50813.6273,
std,,,,,,,,105.712813,,209.216627,,,3.711984,,,,,0.0,25779.747957,
min,,,,,,,,1000.0,,0.0,,,0.0,,,,,0.0,1067.0,
25%,,,,,,,,1999.0,,70.0,,,3.0,,,,,0.0,30451.0,
50%,,,,,,,,2003.0,,105.0,,,6.0,,,,,0.0,49577.0,
75%,,,,,,,,2008.0,,150.0,,,9.0,,,,,0.0,71540.0,


#### (1) cols numeric data, clean and convert to int
clean data:

- mileage_km(odometer)          need to remove , and km, dtype=object
- date_crawled                  format=2016-03-26 17:47:46, convert to datetime, dtype=object
- price                         need to remove $ and , dtype=object
- registration_year             dtype=int64
- make                          convert to string, dtype=object

In [8]:
# price: remove $ and , and convert to int

autos['price'] = autos['price'].str.replace('$', '')
autos['price'] = autos['price'].str.replace(',', '').astype(int) 

In [9]:
autos['price'][:2]

0    5000
1    8500
Name: price, dtype: int64

In [10]:
# mileage_km (odometer): remove km and , and convert to int

autos['mileage_km'] = autos['mileage_km'].str.replace('km', '')
autos['mileage_km'] = autos['mileage_km'].str.replace(',', '').astype(int)

In [11]:
autos['mileage_km'][:2]

0    150000
1    150000
Name: mileage_km, dtype: int64

In [12]:
autos[:2]

Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,date_created,num_pictures,postal_code,last_seen_online
0,2016-03-26 17:47:46,Peugeot_807_160_NAVTECH_ON_BOARD,privat,Angebot,5000,control,bus,2004,manuell,158,andere,150000,3,lpg,peugeot,nein,2016-03-26 00:00:00,0,79588,2016-04-06 06:45:54
1,2016-04-04 13:38:56,BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik,privat,Angebot,8500,control,limousine,1997,automatik,286,7er,150000,6,benzin,bmw,nein,2016-04-04 00:00:00,0,71034,2016-04-06 14:45:08


#### (2) drop cols same values 
cols with low unique:
- seller_type 2
- offer_type 2
- ab_test 2
- transmission 2
- fuel_type 7
- damage_not_repaired 2

In [13]:
#drop columns?

### 4. exploring mileage_km (odometer) and prive cols

In [14]:
# data quality check: mileage_km

autos['mileage_km'].unique().shape  #(13,)
autos['mileage_km'].describe()      # min 5,000 max 150,000
autos['mileage_km'].value_counts().sort_index(ascending=False)  # 967 cars w/500 km, 32,424 cars w/150,000


mileage_km
150000    32424
125000     5170
100000     2169
90000      1757
80000      1436
70000      1230
60000      1164
50000      1027
40000       819
30000       789
20000       784
10000       264
5000        967
Name: count, dtype: int64

In [15]:
# data quality check: price

autos['price'].unique().shape  #(2357,)
autos['price'].describe()      # min 0.000000e+00, max 1.000000e+08
autos['price'].value_counts().sort_index(ascending=True) # 1,421 cars w/0 price

price
0           1421
1            156
2              3
3              1
5              2
            ... 
10000000       1
11111111       2
12345678       3
27322222       1
99999999       1
Name: count, Length: 2357, dtype: int64

In [32]:
# replace price=0 with NaN


price_bool = autos['price'] == 0
price_zero = autos.loc[price_bool]
price_zero


Unnamed: 0,date_crawled,car_name,seller_type,offer_type,price,ab_test,vehicle_type,registration_year,transmission,power_in_ps,model,mileage_km,registration_month,fuel_type,make,damage_not_repaired,date_created,num_pictures,postal_code,last_seen_online
27,2016-03-27 18:45:01,Hat_einer_Ahnung_mit_Ford_Galaxy_HILFE,privat,Angebot,0,control,,2005,,0,,150000,0,,ford,,2016-03-27 00:00:00,0,66701,2016-03-27 18:45:01
71,2016-03-28 19:39:35,Suche_Opel_Astra_F__Corsa_oder_Kadett_E_mit_Re...,privat,Angebot,0,control,,1990,manuell,0,,5000,0,benzin,opel,,2016-03-28 00:00:00,0,4552,2016-04-07 01:45:48
80,2016-03-09 15:57:57,Nissan_Primera_Hatchback_1_6_16v_73_Kw___99Ps_...,privat,Angebot,0,control,coupe,1999,manuell,99,primera,150000,3,benzin,nissan,ja,2016-03-09 00:00:00,0,66903,2016-03-09 16:43:50
87,2016-03-29 23:37:22,Bmw_520_e39_zum_ausschlachten,privat,Angebot,0,control,,2000,,0,5er,150000,0,,bmw,,2016-03-29 00:00:00,0,82256,2016-04-06 21:18:15
99,2016-04-05 09:48:54,Peugeot_207_CC___Cabrio_Bj_2011,privat,Angebot,0,control,cabrio,2011,manuell,0,2_reihe,60000,7,diesel,peugeot,nein,2016-04-05 00:00:00,0,99735,2016-04-07 12:17:34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49884,2016-03-11 13:55:30,Audi_a6_2.5l__Schnaeppchen_nur_heute,privat,Angebot,0,test,kombi,1999,manuell,150,a6,150000,11,diesel,audi,,2016-03-11 00:00:00,0,27711,2016-03-12 03:17:08
49943,2016-03-16 20:46:08,Opel_astra,privat,Angebot,0,control,,2016,manuell,101,astra,150000,8,benzin,opel,,2016-03-16 00:00:00,0,89134,2016-03-17 19:44:20
49960,2016-03-25 22:51:55,Ford_KA_zu_verschenken_***Reserviert***,privat,Angebot,0,control,kleinwagen,1999,manuell,60,ka,150000,6,benzin,ford,,2016-03-25 00:00:00,0,34355,2016-03-25 22:51:55
49974,2016-03-20 10:52:31,Golf_1_Cabrio_Tuev_Neu_viele_Extras_alles_eing...,privat,Angebot,0,control,cabrio,1983,manuell,70,golf,150000,2,benzin,volkswagen,nein,2016-03-20 00:00:00,0,8209,2016-03-27 19:48:16
