## Data  Exploration 

In [23]:
import pandas as pd
import  numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from pandas_profiling import ProfileReport

import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection  import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute  import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from  sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn import set_config
import pickle
set_config(display='diagram')

In [2]:
car=pd.read_csv('./data/data.csv')

In [3]:
car.shape

(11914, 16)

In [4]:
car.sample(5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
3679,Mercedes-Benz,E-Class,2015,premium unleaded (required),329.0,6.0,AUTOMATIC,all wheel drive,4.0,"Luxury,Performance",Midsize,Sedan,28,20,617,64850
6639,Dodge,Magnum,2007,regular unleaded,250.0,6.0,AUTOMATIC,all wheel drive,4.0,,Large,Wagon,22,15,1851,29860
3554,Land Rover,Defender,1997,regular unleaded,182.0,8.0,AUTOMATIC,four wheel drive,2.0,Luxury,Compact,2dr SUV,14,12,258,39669
6412,Subaru,Legacy,2017,regular unleaded,175.0,4.0,AUTOMATIC,all wheel drive,4.0,,Midsize,Sedan,34,25,640,25995
7091,Ford,Mustang,2017,premium unleaded (recommended),435.0,8.0,MANUAL,rear wheel drive,2.0,Performance,Midsize,Coupe,25,15,5657,36645


In [5]:
car.isnull().sum()

Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              69
Engine Cylinders       30
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

In [6]:
car['mileage KML']=round(((car['highway MPG']+car['city mpg'])/2)/2.352,2)
car=car.drop(['Driven_Wheels','highway MPG','city mpg','Engine Cylinders','Engine Fuel Type'], axis=1)

In [7]:
X_train, X_test, y_train, y_test=train_test_split(car.drop(columns=['MSRP','Make','Model','Market Category']), car['MSRP'], test_size=0.2, random_state=42)

In [8]:
X_train

Unnamed: 0,Year,Engine HP,Transmission Type,Number of Doors,Vehicle Size,Vehicle Style,Popularity,mileage KML
3181,2016,265.0,AUTOMATIC,4.0,Large,Sedan,1624,11.27
5357,2017,449.0,AUTOMATIC,4.0,Large,4dr SUV,617,6.80
4874,2016,173.0,AUTOMATIC,2.0,Compact,Coupe,1720,12.54
8102,1993,180.0,MANUAL,2.0,Large,Regular Cab Pickup,1851,5.74
10400,2008,172.0,AUTOMATIC,2.0,Compact,2dr Hatchback,1439,8.72
...,...,...,...,...,...,...,...,...
11284,2014,181.0,AUTOMATIC,4.0,Midsize,Wagon,2031,9.78
5191,2009,219.0,AUTOMATIC,4.0,Midsize,Sedan,210,9.14
5390,2016,220.0,AUTOMATED_MANUAL,2.0,Compact,2dr Hatchback,873,12.33
860,2009,260.0,AUTOMATIC,4.0,Midsize,Wagon,376,9.35


In [9]:
trf1=ColumnTransformer([
    ('impute_engine_hp', SimpleImputer(),[1]),
    ('impute_doors', SimpleImputer(strategy='most_frequent'),[3]),
    ('impute_market_category', SimpleImputer(strategy='constant',fill_value='Data Not Available'),[4])  
], remainder='passthrough')

trf1

In [10]:
# car_mod1['Vehicle Size'].unique() 
# car_mod1['Transmission Type'].unique()
# car_mod1['Market Category'].unique()
# car_mod1['Vehicle Style'].unique() 


# One hot encoding
X_train.head()

Unnamed: 0,Year,Engine HP,Transmission Type,Number of Doors,Vehicle Size,Vehicle Style,Popularity,mileage KML
3181,2016,265.0,AUTOMATIC,4.0,Large,Sedan,1624,11.27
5357,2017,449.0,AUTOMATIC,4.0,Large,4dr SUV,617,6.8
4874,2016,173.0,AUTOMATIC,2.0,Compact,Coupe,1720,12.54
8102,1993,180.0,MANUAL,2.0,Large,Regular Cab Pickup,1851,5.74
10400,2008,172.0,AUTOMATIC,2.0,Compact,2dr Hatchback,1439,8.72


In [11]:
trf2=ColumnTransformer([
    ('ohe_tt_vs_vs',OneHotEncoder(sparse=False, drop='first'),[2,4,5])
], remainder='passthrough')

trf2

In [12]:
trf3=ColumnTransformer([
    ('Scale',MinMaxScaler(),slice(0,10))
], remainder='passthrough')

trf3

In [13]:
trf4=SelectKBest(score_func=chi2,k=8)

trf4

In [14]:
trf5=LinearRegression()

trf5

In [15]:
pipe=Pipeline([
    ('Simple_Imputing',trf1),
    ('One_Hot_Encoding',trf2),
    ('Scaling',trf3),
    ('Feature_selection', trf4),
    ('Training_the_model',trf5)
])

pipe

In [16]:
pipe.fit(X_train,y_train)

In [17]:
y_pred=pipe.predict(X_test)
y_pred

array([49766.92062921, 16932.32339002, 39099.45282666, ...,
        9938.23974438, 15010.58271064, 39251.23843304])

In [18]:
r2_score(y_pred, y_test)

0.4171510274400685

## Car details fetcher:

In [19]:
def car_details(car_name):
    return car[(car['Make']==car_name)]

In [20]:
car_details('Plymouth')

Unnamed: 0,Make,Model,Year,Engine HP,Transmission Type,Number of Doors,Market Category,Vehicle Size,Vehicle Style,Popularity,MSRP,mileage KML
1189,Plymouth,Acclaim,1993,100.0,MANUAL,4.0,,Compact,Sedan,535,2000,10.84
1190,Plymouth,Acclaim,1994,106.0,AUTOMATIC,4.0,,Compact,Sedan,535,2000,9.35
1191,Plymouth,Acclaim,1995,100.0,AUTOMATIC,4.0,,Compact,Sedan,535,2000,9.57
2018,Plymouth,Breeze,1998,132.0,MANUAL,4.0,,Midsize,Sedan,535,2000,11.90
2019,Plymouth,Breeze,1998,132.0,MANUAL,4.0,,Midsize,Sedan,535,2000,11.90
...,...,...,...,...,...,...,...,...,...,...,...,...
11422,Plymouth,Voyager,1999,158.0,AUTOMATIC,4.0,Flex Fuel,Midsize,Passenger Minivan,535,2166,8.08
11423,Plymouth,Voyager,1999,158.0,AUTOMATIC,4.0,Flex Fuel,Midsize,Passenger Minivan,535,2110,8.08
11424,Plymouth,Voyager,1999,150.0,AUTOMATIC,3.0,,Midsize,Passenger Minivan,535,2000,8.93
11425,Plymouth,Voyager,2000,158.0,AUTOMATIC,4.0,Flex Fuel,Midsize,Passenger Minivan,535,2317,8.29


In [21]:
pickle.dump(pipe,open('predict_pipe.pkl','wb'))
pickle.dump(car,open('car.pkl','wb'))

In [24]:
profile=ProfileReport(car,title='Car Data Analysis',explorative=True)

profile.to_file('analysis.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]