In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/processed/data.csv')

In [3]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
0,Yamaha Fazer Dlx Standard,2014,12600,1st Owner,Raigad,34999
1,Yamaha RX135 Standard,1996,14500,2nd Owner,Madurai,85000
2,Bajaj Pulsar 150 [2001-2011] Kick Start,2007,51000,2nd Owner,Bangalore,25000
3,Royal Enfield Electra Twinspark Standard,2013,57500,1st Owner,Bangalore,100000
4,Hero Honda Glamour Alloy Drum Self,2006,69569,2nd Owner,Azamgarh,21000


## Feature Engineering

### Handle model_name

In [4]:
df.model_name[0].split(' ')

['', 'Yamaha', 'Fazer', 'Dlx', 'Standard', '']

In [5]:
# Let's simplify model_name variable
# Take only first 3 words from the value

def build_model_name(val):
    splits = val.strip().split(' ')

    if len(splits) > 3:
        return f"{splits[0]} {splits[1]} {splits[2]}"
    else:
        return val

df['model_name'] = df.model_name.apply(build_model_name)


In [6]:
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
0,Yamaha Fazer Dlx,2014,12600,1st Owner,Raigad,34999
1,Yamaha RX135 Standard,1996,14500,2nd Owner,Madurai,85000
2,Bajaj Pulsar 150,2007,51000,2nd Owner,Bangalore,25000
3,Royal Enfield Electra,2013,57500,1st Owner,Bangalore,100000
4,Hero Honda Glamour,2006,69569,2nd Owner,Azamgarh,21000


In [7]:
print("Model name has unique values:",len(df.model_name.unique()))

Model name has unique values: 219


In [8]:
top_models = df.model_name.value_counts()[:40].index

### Take only top models as main categories & make other categories as "others"

In [9]:
df['model_name'] = df.model_name.apply(lambda x: x if x in list(top_models) else "others")

In [10]:
df.tail()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
1322,others,2020,6000,1st Owner,Hyderabad,65000
1323,others,2018,12000,1st Owner,Srinagar,65000
1324,Yamaha YZF R15,2016,33000,2nd Owner,Gondia,80000
1325,Honda CB Unicorn,2011,50000,1st Owner,Mumbai,40000
1326,Bajaj Avenger Street,2016,23600,1st Owner,Bangalore,60000


### Handle location categories

In [11]:
top_locations = df.location.value_counts()[:25].index
top_locations

Index(['Bangalore ', 'Delhi ', 'Pune ', 'Mumbai ', 'Hyderabad ', 'Chennai ',
       'Ahmedabad ', 'Gurgaon ', 'Kolkata ', 'Thane ', 'Ghaziabad ',
       'Lucknow ', 'Bhopal ', 'Jaipur ', 'Noida ', 'Navi Mumbai ',
       'Vijaywada ', 'Nagpur ', 'Surat ', 'Indore ', 'Bhubaneswar ', 'Patna ',
       'Faridabad ', 'Guntur ', 'Coimbatore '],
      dtype='object')

### Take only top locations as main categories & make other categories as "others"

In [12]:
df['location'] = df.location.apply(lambda x: x if x in top_locations else "others")

In [13]:
df.tail()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
1322,others,2020,6000,1st Owner,Hyderabad,65000
1323,others,2018,12000,1st Owner,others,65000
1324,Yamaha YZF R15,2016,33000,2nd Owner,others,80000
1325,Honda CB Unicorn,2011,50000,1st Owner,Mumbai,40000
1326,Bajaj Avenger Street,2016,23600,1st Owner,Bangalore,60000


In [14]:
df.owner.unique()[::-1]

array(['5th Owner', '4th Owner', '3rd Owner', '2nd Owner', '1st Owner'],
      dtype=object)

## Split Data

In [15]:
X = df.drop('price',axis=1)
y = df['price']

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1234)

## Create column transformer for encoding

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder


In [18]:
df.owner.unique()[::-1]

array(['5th Owner', '4th Owner', '3rd Owner', '2nd Owner', '1st Owner'],
      dtype=object)

In [24]:
tr1 = ColumnTransformer([
    ("model_name_ohe",OneHotEncoder(dtype=np.int16),[1]),
],remainder='passthrough')

tr2 = ColumnTransformer([
    ("owner_ordinal_enc",OrdinalEncoder(categories='auto',handle_unknown='ignore',dtype=np.int16),[4]),
],remainder='passthrough')

tr3 = ColumnTransformer([
    ("owner_ohe",OneHotEncoder(dtype=np.int16),[1]),
],remainder='passthrough')

## Model Building

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [23]:
lr_pipe = Pipeline([
    ('tr1',tr1),
    ('tr2',tr2),
    ('tr3',tr3),
    ('linear_regressor',LinearRegression())
])
lr_pipe.fit(X_train,y_train)

ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.