## Importing / Installing packages

In [2]:
import math #mathematical computation
import pandas as pd #dataframe
import numpy as np #mathematical computations
import matplotlib.pyplot as plt #visualization
import matplotlib
import joblib #saving the model
import seaborn as sns #visualization
import json #exporting columns
import pickle #saving the model
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression #Linear Regression
from sklearn.linear_model import Lasso #Lasso Regression
from sklearn.tree import DecisionTreeRegressor #Decision Tree Regression
from sklearn.ensemble import RandomForestRegressor #Random Forest Regression
from xgboost import XGBRegressor # XGBoost Regression
from sklearn.model_selection import train_test_split #Splitting the dataset into training and testing
from sklearn.model_selection import ShuffleSplit #Random shuffling
from sklearn.model_selection import cross_val_score #Score cross validation
from sklearn.model_selection import cross_validate #Score cross validation
from sklearn.model_selection import GridSearchCV #Hyper parameter tuning
from statistics import stdev
from warnings import simplefilter #Filtering warnings

In [3]:
# Relative path (resolved against the working directory) of the corrected input CSV.
dataset = (
    'Dataset/Initial Dataset (corrected)/corrected_data_bikroy_com.csv'
)

In [4]:
# Read the CSV at `dataset` into df1, the raw frame the following cells inspect.
df1 = pd.read_csv(filepath_or_buffer=dataset)

In [5]:
# Preview the first five rows of the raw frame.
df1.head(n=5)

Unnamed: 0,car_name,brand,car_model,model_year,transmission,body_type,fuel_type,engine_capacity,kilometers_run,price
0,Toyota Fielder 2011,Toyota,Fielder,2016,Automatic,Estate,CNG,1500,45852,1200000
1,Toyota Noah 2005,Toyota,Noah,2010,Automatic,Hatchback,CNG,1998,219630,1200000
2,Toyota LiteAce 2001,Toyota,LiteAce,2001,Manual,,CNG,1800,102000,165000
3,Maruti Suzuki Vitara Brezza 1994,Maruti Suzuki,Vitara Brezza,1994,Manual,,CNG,1600,113000,230000
4,Toyota Starlet good 1992,Toyota,Starlet,1992,Automatic,Saloon,CNG,1300,15000,330000


In [6]:
# List the column labels of the raw frame.
df1.columns

Index(['car_name', 'brand', 'car_model', 'model_year', 'transmission',
       'body_type', 'fuel_type', 'engine_capacity', 'kilometers_run', 'price'],
      dtype='object')

In [7]:
# Number of distinct values in each column of the raw frame.
df1.nunique(axis=0)

car_name           1108
brand                26
car_model           123
model_year           35
transmission          2
body_type             7
fuel_type            24
engine_capacity      51
kilometers_run      640
price               519
dtype: int64

## Exploratory Data Analysis

In [8]:
# Build df2 from df1 without the car_name column; every other column is kept.
df2 = df1.drop(columns='car_name')
df2.head(n=5)

Unnamed: 0,brand,car_model,model_year,transmission,body_type,fuel_type,engine_capacity,kilometers_run,price
0,Toyota,Fielder,2016,Automatic,Estate,CNG,1500,45852,1200000
1,Toyota,Noah,2010,Automatic,Hatchback,CNG,1998,219630,1200000
2,Toyota,LiteAce,2001,Manual,,CNG,1800,102000,165000
3,Maruti Suzuki,Vitara Brezza,1994,Manual,,CNG,1600,113000,230000
4,Toyota,Starlet,1992,Automatic,Saloon,CNG,1300,15000,330000


In [9]:
# Per column: True when at least one value is missing.
df2.isna().any()

brand              False
car_model          False
model_year         False
transmission       False
body_type           True
fuel_type          False
engine_capacity    False
kilometers_run     False
price              False
dtype: bool

In [10]:
# Per column: how many values are missing.
df2.isna().sum()

brand               0
car_model           0
model_year          0
transmission        0
body_type          18
fuel_type           0
engine_capacity     0
kilometers_run      0
price               0
dtype: int64

In [11]:
# Remove every row that has a missing value in any column (the 18 rows counted
# in the previous cell), then re-check that no missing values remain.
df2 = df2.dropna(axis='index', how='any')
df2.isna().sum()

brand              0
car_model          0
model_year         0
transmission       0
body_type          0
fuel_type          0
engine_capacity    0
kilometers_run     0
price              0
dtype: int64

In [12]:
# A notebook cell only echoes its final expression, so a bare `df2.shape`
# placed above `df2.head(20)` is evaluated and silently discarded. Print it
# explicitly so the row/column counts remaining after the NaN rows were
# dropped are actually shown alongside the preview.
print(df2.shape)
df2.head(20)

Unnamed: 0,brand,car_model,model_year,transmission,body_type,fuel_type,engine_capacity,kilometers_run,price
0,Toyota,Fielder,2016,Automatic,Estate,CNG,1500,45852,1200000
1,Toyota,Noah,2010,Automatic,Hatchback,CNG,1998,219630,1200000
4,Toyota,Starlet,1992,Automatic,Saloon,CNG,1300,15000,330000
5,Toyota,Carina,1998,Automatic,Saloon,CNG,1500,60720,200000
6,Toyota,Corolla,2009,Automatic,Saloon,CNG,1500,70000,950000
7,Mitsubishi,Pajero,2000,Manual,SUV / 4x4,CNG,2350,750000,960000
8,Toyota,Axio,2016,Automatic,Saloon,"CNG, Hybrid",1500,96000,1830000
9,Toyota,Other Model,1996,Automatic,Estate,"CNG, Octane",1331,120000,350000
10,Toyota,Fielder,2001,Automatic,Estate,"CNG, Octane",1500,75000,799000
11,Toyota,Fielder,2001,Automatic,Estate,"CNG, Octane",1500,79250,785000


### Checking for data consistency

In [13]:
# Number of distinct values per column after the rows with missing values were removed.
df2.nunique(axis=0)

brand               24
car_model          120
model_year          35
transmission         2
body_type            7
fuel_type           24
engine_capacity     50
kilometers_run     632
price              513
dtype: int64

In [14]:
# Distinct car_model values, in order of first appearance.
df2['car_model'].unique()

array(['Fielder', 'Noah', 'Starlet', 'Carina', 'Corolla', 'Pajero',
       'Axio', 'Other Model', 'Succeed', 'Probox', 'Spacio', 'Wish',
       'Raum', 'Starlet Soleil', 'CR-V', 'Alto', 'ist', 'Rush', 'LiteAce',
       'TownAce', 'Hiace', 'Sienta', 'l300', 'MPV', 'Dyna', 'Avanza',
       'Urvan', 'Sprinter', 'Mark II', 'Corsa', 'Corona', 'Vista',
       'Allion', 'Premio', 'Crown', 'Lancer', 'X Assista', 'Fit', 'Sunny',
       'WagonR', 'Pathfinder', 'Prado', 'V6', 'Cami', 'Carryboy',
       'Kluger', 'RAV4', 'CX-7', 'Hilux', 'Estima', 'l200', 'H1',
       'Coaster', '5 Series', 'Indigo Ecs', 'Terrano', 'X-Trail',
       'Land Cruiser', 'Eco Sport', 'Kyron', 'Harrier', 'Prius', 'Aqua',
       'Esquire', 'Grace', 'Axela', 'CR-Z', 'C-HR', 'Alphard', 'Camry',
       'Vezel', 'NX', 'Outlandar', 'S660', 'RX-8', 'Note', 'Vitz',
       'Alto 800', 'Passo', 'RunX', 'Tiida', 'Swift', 'Spark', 'Ractis',
       'HR-V', 'MR2', 'City', '320i', 'Civic', 'Bluebird', 'GLX',
       'Sonata', 'Accent', 

In [15]:
# Distinct transmission values, in order of first appearance.
df2['transmission'].unique()

array(['Automatic', 'Manual'], dtype=object)

In [16]:
# Distinct body_type values, in order of first appearance.
df2['body_type'].unique()

array(['Estate', 'Hatchback', 'Saloon', 'SUV / 4x4', 'MPV', 'Convertible',
       'Coupé/Sports'], dtype=object)

In [17]:
# Distinct fuel_type values, in order of first appearance.
df2['fuel_type'].unique()

array(['CNG', 'CNG, Hybrid', 'CNG, Octane', 'Diesel', 'Diesel, Petrol',
       'Hybrid', 'Hybrid, Octane', 'Hybrid, Octane, LPG', 'LPG', 'Octane',
       'Octane, LPG', 'Octane, Other fuel type', 'Petrol', 'Petrol, CNG',
       'Petrol, CNG, Octane', 'Petrol, CNG, Octane, LPG',
       'Petrol, Hybrid', 'Petrol, Hybrid, Octane',
       'Petrol, Hybrid, Octane, LPG', 'Petrol, LPG', 'Petrol, Octane',
       'Petrol, Octane, LPG', 'Petrol, Octane, Other fuel type',
       'Petrol, Other fuel type'], dtype=object)

In [18]:
# Distinct engine_capacity values, in order of first appearance.
df2['engine_capacity'].unique()

array([1500, 1998, 1300, 2350, 1331, 1800, 1000, 2000,  150, 2500, 1781,
       1996, 1990, 2400, 1600, 1490, 1900, 1830, 1586, 1580, 1984, 1061,
       1496, 3300, 3500, 2260, 4200, 2200, 3000, 2700, 2389, 2779, 3700,
       4500, 1797, 2015,  660, 2367,  800, 1329, 1100, 1590, 1200, 1320,
       1400, 1499, 3400, 2972, 2362, 2490])