In [19]:
# Import required libraries
import math 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [20]:
pd.set_option('max_columns', 100)

In [21]:
# Load the dataset
dataset = pd.read_csv("data_after_cleaning.csv")

In [22]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119418 entries, 0 to 119417
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    119418 non-null  int64  
 1   price         119418 non-null  int64  
 2   year          119418 non-null  float64
 3   model         119418 non-null  object 
 4   condition     83010 non-null   object 
 5   cylinders     94740 non-null   object 
 6   fuel          119418 non-null  object 
 7   odometer      119418 non-null  float64
 8   transmission  119418 non-null  object 
 9   drive         119418 non-null  object 
 10  type          119418 non-null  object 
 11  paint_color   119418 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 10.9+ MB


In [23]:
# Removing unnamed 1st column
dataset.drop(dataset.columns[0], axis=1, inplace=True)

In [24]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119418 entries, 0 to 119417
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         119418 non-null  int64  
 1   year          119418 non-null  float64
 2   model         119418 non-null  object 
 3   condition     83010 non-null   object 
 4   cylinders     94740 non-null   object 
 5   fuel          119418 non-null  object 
 6   odometer      119418 non-null  float64
 7   transmission  119418 non-null  object 
 8   drive         119418 non-null  object 
 9   type          119418 non-null  object 
 10  paint_color   119418 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 10.0+ MB


In [25]:
# Find all the non-numerical features
temp_features = dataset.columns.values.tolist()
features = []
for column_name in temp_features:
    if dataset[column_name].dtype == 'object':
        features.append(column_name)
print(features)

['model', 'condition', 'cylinders', 'fuel', 'transmission', 'drive', 'type', 'paint_color']


In [26]:
# https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/

# do label encoding
# # Converting from strings to numerics
# for column in features:
#     if dataset[column].dtype == 'object':
#         le = LabelEncoder()
#         le.fit(list(dataset[column].astype(str).values))
#         dataset[column] = le.transform(list(dataset[column].astype(str).values))

# do one hot encoding
for column in features:
    column = pd.get_dummies(dataset[column], drop_first=True)
    dataset = pd.concat([dataset,column],axis=1)
dataset = dataset.drop(columns = features)
        
dataset.head(5)

Unnamed: 0,price,year,odometer,1500,1500 4x4,1500 big horn,1500 crew cab,1500 quad cab,1500 slt,1500 sport 4x4 1/2 ton,200,2500,2500 4x4,2500 crew cab,2500 slt,3,3 series,3 series 328i convertible 2d,3-series,300,325i,328,328i,328xi,335i,3500,350z,370z coupe 2d,4 series 428i coupe 2d,4-runner,4500,4runner,4runner limited,4runner sr5,4runner sr5 4x4 gas suv,5 series,5-series,500,528i,535i,5500,6,7 series,HUMMER H2,HUMMER H3,International 4300,Isuzu NPR,Isuzu NPR HD,Porsche Cayenne,Scion tC,...,yaris,yukon,yukon denali,yukon slt,yukon xl,yukon xl denali,z4,fair,good,like new,new,salvage,12 cylinders,3 cylinders,4 cylinders,5 cylinders,6 cylinders,8 cylinders,other,electric,gas,hybrid,other.1,manual,other.2,fwd,rwd,bus,convertible,coupe,hatchback,mini-van,offroad,other.3,pickup,sedan,truck,van,wagon,blue,brown,custom,green,grey,orange,purple,red,silver,white,yellow
0,8500,2005.0,62800.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,24930,2017.0,32989.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,13499,2015.0,67257.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,3450,2000.0,198200.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,17998,2014.0,73864.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [27]:
# Shuffle dataframe
dataset = dataset.sample(frac=1, random_state=1)
dataset.head(5)

Unnamed: 0,price,year,odometer,1500,1500 4x4,1500 big horn,1500 crew cab,1500 quad cab,1500 slt,1500 sport 4x4 1/2 ton,200,2500,2500 4x4,2500 crew cab,2500 slt,3,3 series,3 series 328i convertible 2d,3-series,300,325i,328,328i,328xi,335i,3500,350z,370z coupe 2d,4 series 428i coupe 2d,4-runner,4500,4runner,4runner limited,4runner sr5,4runner sr5 4x4 gas suv,5 series,5-series,500,528i,535i,5500,6,7 series,HUMMER H2,HUMMER H3,International 4300,Isuzu NPR,Isuzu NPR HD,Porsche Cayenne,Scion tC,...,yaris,yukon,yukon denali,yukon slt,yukon xl,yukon xl denali,z4,fair,good,like new,new,salvage,12 cylinders,3 cylinders,4 cylinders,5 cylinders,6 cylinders,8 cylinders,other,electric,gas,hybrid,other.1,manual,other.2,fwd,rwd,bus,convertible,coupe,hatchback,mini-van,offroad,other.3,pickup,sedan,truck,van,wagon,blue,brown,custom,green,grey,orange,purple,red,silver,white,yellow
106417,20955,2017.0,114818.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21863,15995,2017.0,31909.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
59511,14588,2013.0,130247.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
1563,15995,2010.0,145521.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
68268,8950,2014.0,98033.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [28]:
# Separating price column from dataset
target_column_name = 'price'
y = dataset[target_column_name]
x = dataset.drop([target_column_name], axis=1)

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(95534, 606) (23884, 606) (95534,) (23884,)
