## Import Libraries

In [70]:
import pickle
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

# Own modules for data processing
from data_processor_module import DataProcess

In [71]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Data Processing
In the next code cells, I make the test dataset **have the same attributes** (columns) as the training dataset, so that the trained model can be applied to it.

In [72]:
# Read the raw test set; index_col=0 uses the CSV's first column as the row index
TEST_DATA_PATH = '../assets/data/test-data.csv'
test_df = pd.read_csv(TEST_DATA_PATH, index_col=0)

In [73]:
# Preview the first five raw rows
test_df.head(n=5)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,25.27 Lakh
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,


In [74]:
# Run the project's preprocessing on the raw test frame. DataProcess prints each
# step as it goes (brand extraction, Year -> Age, Mileage to kmpl, Engine/Power
# to float, Owner_Type to int, one-hot encoding, column drops).
df = DataProcess(test_df)
df.head(n=5)

Step 0: Take Brand from Name
Step 0 Done
Step 1: Convert Year to Age
Step 1 Done
Step 2: Convert Mileage to kmpl
Step 2 Done
Step 3: Convert Engine and Power to float
Step 3 Done
Step 4: Convert owner type to int
Step 4 Done
Step 5: One hot encoding
Step 5 Done
Step 6: Drop columns
Step 6 Done


Unnamed: 0,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,Age,Ahmedabad,Bangalore,Chennai,...,Mitsubishi,Nissan,OpelCorsa,Porsche,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo
0,40929,3,25.49,998.0,58.2,4.0,5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,54493,2,24.7,796.0,47.3,5.0,6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,34000,3,13.68,2393.0,147.8,7.0,2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,139000,3,23.59,1364.0,,5.0,7,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,29000,3,18.5,1197.0,82.85,5.0,5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
# Align the one-hot brand columns with the feature set the model expects.
# Brands missing from this test set get an all-zero column; a column that is
# already present is left alone so real one-hot values are never overwritten.
for brand in ['Lamborghini', 'Force', 'Smart', 'Ambassador']:
    if brand not in df.columns:
        df[brand] = 0

# Remove the brand columns that are not among the model's features.
# errors='ignore' keeps this cell safe to re-run after the columns are gone.
df = df.drop(columns=['OpelCorsa', 'Hindustan'], errors='ignore')

In [76]:
# Preview the first five rows after the brand columns were aligned
df.head(n=5)

Unnamed: 0,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,Age,Ahmedabad,Bangalore,Chennai,...,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo,Lamborghini,Force,Smart,Ambassador
0,40929,3,25.49,998.0,58.2,4.0,5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
1,54493,2,24.7,796.0,47.3,5.0,6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
2,34000,3,13.68,2393.0,147.8,7.0,2,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0,0
3,139000,3,23.59,1364.0,,5.0,7,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0,0
4,29000,3,18.5,1197.0,82.85,5.0,5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0


## Model

### Load model and take input features

In [77]:
# Load the trained model. The context manager closes the file handle as soon as
# unpickling finishes instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# that come from a trusted source.
with open('./model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [78]:
# Columns fed to the model, grouped by kind; concatenation keeps the exact
# column order of the original flat list.
# NOTE(review): both 'ISUZU' and 'Isuzu' are listed, presumably two spellings
# of the same brand in the training data; confirm before merging them.
numeric_cols = ['Kilometers_Driven', 'Owner_Type', 'Mileage', 'Engine', 'Power',
                'Seats', 'Age']
location_cols = ['Ahmedabad', 'Bangalore', 'Chennai', 'Coimbatore', 'Delhi',
                 'Hyderabad', 'Jaipur', 'Kochi', 'Kolkata', 'Mumbai', 'Pune']
fuel_cols = ['CNG', 'Diesel', 'LPG', 'Petrol']
transmission_cols = ['Automatic', 'Manual']
brand_cols = ['Ambassador', 'Audi', 'BMW', 'Bentley', 'Chevrolet', 'Datsun',
              'Fiat', 'Force', 'Ford', 'Honda', 'Hyundai', 'ISUZU', 'Isuzu',
              'Jaguar', 'Jeep', 'Lamborghini', 'Land', 'Mahindra', 'Maruti',
              'Mercedes-Benz', 'Mini', 'Mitsubishi', 'Nissan', 'Porsche',
              'Renault', 'Skoda', 'Smart', 'Tata', 'Toyota', 'Volkswagen',
              'Volvo']
features = [*numeric_cols, *location_cols, *fuel_cols, *transmission_cols,
            *brand_cols]

# Keep only the model's columns, in the model's order
df = df[features]

In [79]:
# Report each column that still has missing values, together with its NaN count
missing_per_column = df.isnull().sum()
for column in df.columns:
    n_missing = missing_per_column[column]
    if n_missing > 0:
        print(column, n_missing)

Engine 10
Power 32
Seats 11


In [80]:
# Replace NaN with the column mean.
# The filled Series is assigned back instead of calling fillna(inplace=True) on
# df[column]: that form works on a temporary selection, which pandas warns
# about and which leaves df unchanged when Copy-on-Write is enabled.
# NOTE(review): the means are computed from the test set itself; if training
# used training-set means for imputation, reuse those values here (confirm).
NaN_col = ['Engine', 'Power', 'Seats']
for column in NaN_col:
    df[column] = df[column].fillna(df[column].mean())

In [83]:
# Cast the whole frame to float so every column shares one numeric dtype
df = df.astype(np.float64)

### Predict prices for `test-data.csv`

In [84]:
# Score the prepared rows with the loaded model and display the predicted prices
model_input = df[features]
price = model.predict(model_input)
price

array([ 2.5333266,  3.3138285, 16.874125 , ...,  3.1113513,  4.948583 ,
       17.365046 ], dtype=float32)

Save the predicted prices to `submit.csv`.

In [85]:
# Write the predictions as a single 'Price' column. Building the frame from an
# inline mapping avoids binding the name `dict`, which would shadow the builtin
# for the rest of the session.
result = pd.DataFrame({'Price': price})
result.to_csv('../assets/data/submit.csv', index=False)