In [1]:
### 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the pricing dataset from a CSV in the working directory, then report
# its (rows, columns) shape as the cell output.
DATA_PATH = 'bmw_pricing_challenge.csv'
df = pd.read_csv(DATA_PATH)
df.shape

(4843, 18)

#### problem - model to predict car price (value/regression prediction)

- price of the car
 - brand
 - mileage
 - engine power/capacity
 - fuel type 
 - type car
 - maintenance cost
 - offers/discount
 - color of the car
 - safety features (Category)
 - insurance 
 - resale value
 
    - mileage 
    - age of the car
    - insurance validity
    - condition (good/bad/worst)
    - model key
    - engine power/capacity
    - fuel type

In [3]:
# Preview the first rows to see the available columns and their raw values.
df.head()

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at
0,BMW,118,140411,100,01-02-2012,diesel,black,convertible,True,True,False,False,True,True,True,False,11300,01-01-2018
1,BMW,M4,13929,317,01-04-2016,petrol,grey,convertible,True,True,False,False,False,True,True,True,69700,01-02-2018
2,BMW,320,183297,120,01-04-2012,diesel,white,convertible,False,False,False,False,True,False,True,False,10200,01-02-2018
3,BMW,420,128035,135,01-07-2014,diesel,red,convertible,True,True,False,False,True,True,True,True,25100,01-02-2018
4,BMW,425,97097,160,01-12-2014,diesel,silver,convertible,True,True,False,False,False,True,True,True,33400,01-04-2018


####  EDA
 - Univariate analysis
        - Missing handling
        - encoding 
        - outliers handling
        - normality check
 - Bivariate analysis   
        - correlation check
        - outliers

In [4]:
# Count the missing (NaN/None) entries in every column.
df.isna().sum()

maker_key            0
model_key            0
mileage              0
engine_power         0
registration_date    0
fuel                 0
paint_color          0
car_type             0
feature_1            0
feature_2            0
feature_3            0
feature_4            0
feature_5            0
feature_6            0
feature_7            0
feature_8            0
price                0
sold_at              0
dtype: int64

1. Why do we need to fill missing values?
2. Why should we not simply drop rows with missing values?
3. Techniques to handle missing values:
    - mean,median 
    - mode 
    - correlation imputation
    - fill with random numbers between N standard deviation
    - predictive imputation
    
gender (categorical)
(31% M) (29% F) (40% NaN)

Age (continuous)
70% missing values

### Encoding 
 - label encoding 
 - OHE

In [5]:
# Columns held as pandas `object` dtype are the candidates for encoding.
categorical_features = df.select_dtypes(include='object').columns
categorical_features

Index(['maker_key', 'model_key', 'registration_date', 'fuel', 'paint_color',
       'car_type', 'sold_at'],
      dtype='object')

In [6]:
# Hand-picked subset of the object columns to encode. maker_key,
# registration_date and sold_at are left out of this list
# (presumably because the two date columns need parsing rather than
# encoding -- TODO confirm the reason for dropping maker_key).
categorical_features = [
    'model_key',
    'fuel',
    'paint_color',
    'car_type',
]

In [7]:
# Inspect only the columns selected for encoding.
df[categorical_features]

Unnamed: 0,model_key,fuel,paint_color,car_type
0,118,diesel,black,convertible
1,M4,petrol,grey,convertible
2,320,diesel,white,convertible
3,420,diesel,red,convertible
4,425,diesel,silver,convertible
...,...,...,...,...
4838,218 Gran Tourer,diesel,black,van
4839,218 Active Tourer,diesel,grey,van
4840,218 Gran Tourer,diesel,grey,van
4841,218 Active Tourer,diesel,brown,van


In [8]:
# OHE -> one-hot encoding: one 0/1 indicator column per category, no order implied
# Label encoding -> one integer code per category, which implies an order between categories

In [9]:
df.fuel.unique()  # distinct fuel types; this column is the candidate for label encoding

array(['diesel', 'petrol', 'hybrid_petrol', 'electro'], dtype=object)

In [10]:
# Candidate label encodings for fuel
# (D = diesel, P = petrol, HP = hybrid_petrol, E = electro).
# The integer order matters, so it should reflect a meaningful ranking:
# D - 1, P - 2, HP - 3, E - 4
# P - 1, D - 2, HP - 3, E - 4

In [11]:
# Mean sale price per fuel type, computed in a single grouped pass.
# `sort=False` keeps the groups in order of first appearance in the data,
# which is the same order `df.fuel.unique()` returns.
fuel_mean_price = df.groupby('fuel', sort=False)['price'].mean()

# Kept as a plain list under the original name in case later cells use it.
fuel_list = fuel_mean_price.tolist()

print(fuel_mean_price.to_dict())

# Optional visual check (fuel types on the x axis, mean price as bar height):
# plt.bar(x=fuel_mean_price.index, height=fuel_mean_price.values)
# plt.show()

{'diesel': 15846.110751993105, 'petrol': 14398.429319371728, 'hybrid_petrol': 37575.0, 'electro': 20966.666666666668}


In [12]:
# Chosen encoding, ranked by ascending mean price per fuel type
# (petrol < diesel < electro < hybrid_petrol):
# P - 1, D - 2, E- 3, HP - 4
df.fuel

0       diesel
1       petrol
2       diesel
3       diesel
4       diesel
         ...  
4838    diesel
4839    diesel
4840    diesel
4841    diesel
4842    diesel
Name: fuel, Length: 4843, dtype: object

In [13]:
# Ordinal encoding of fuel type, ranked by ascending mean sale price
# (petrol < diesel < electro < hybrid_petrol).
# NOTE(review): this ranking is derived from `price` over the whole dataset;
# if a train/test split follows, consider deriving it from training rows only.
FUEL_PRICE_RANK = {'petrol': 1, 'diesel': 2, 'electro': 3, 'hybrid_petrol': 4}

df['fuel_e'] = df.fuel.map(FUEL_PRICE_RANK)

# `Series.map` with a dict silently yields NaN for any value that is not a key,
# so fail loudly here instead of carrying NaNs into later steps.
assert df['fuel_e'].notna().all(), "fuel contains values missing from FUEL_PRICE_RANK"

df.head()

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at,fuel_e
0,BMW,118,140411,100,01-02-2012,diesel,black,convertible,True,True,False,False,True,True,True,False,11300,01-01-2018,2
1,BMW,M4,13929,317,01-04-2016,petrol,grey,convertible,True,True,False,False,False,True,True,True,69700,01-02-2018,1
2,BMW,320,183297,120,01-04-2012,diesel,white,convertible,False,False,False,False,True,False,True,False,10200,01-02-2018,2
3,BMW,420,128035,135,01-07-2014,diesel,red,convertible,True,True,False,False,True,True,True,True,25100,01-02-2018,2
4,BMW,425,97097,160,01-12-2014,diesel,silver,convertible,True,True,False,False,False,True,True,True,33400,01-04-2018,2


In [14]:
# plt.figure(figsize = (7,4))
# sns.boxplot(x = df.price, y = df.fuel)

In [15]:
# paint color

In [16]:
# car_type