**Data Science Regression Project: Predicting Vehicle Price**

In [None]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import zipfile

import os
from sklearn.model_selection import train_test_split 

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

*Load our data*

In [None]:
# Read the raw listings and preview their dimensions and first two rows.
main_df = pd.read_csv('/kaggle/input/vehicles-price-2020-ukraine/vehicle_price(2020).csv')
raw_shape = main_df.shape
print(f'data shape is {raw_shape}')
main_df.head(2)

**Let's do something with features**

In [None]:
main_df['fuel'].value_counts()

In [None]:
main_df['brand'].nunique()

In [None]:
main_df['body'].nunique()

In [None]:
main_df.isnull().sum()

We replace NaN values in the transmission column with 'other'.

In [None]:
# Build a new frame (the raw one stays untouched) in which a missing
# transmission becomes its own 'other' category.
vehicle_df1 = main_df.assign(transmission=main_df['transmission'].fillna('other'))

In [None]:
vehicle_df1.isnull().sum()

The dataset contains Tesla vehicles, but a Tesla does not use gas, petrol or diesel, so its fuel value is missing. We replace that missing value with a placeholder instead of dropping those rows.

All other rows that still contain NaN values are dropped.

In [None]:
# Give Teslas an explicit *string* fuel category before dropping incomplete rows.
# An integer placeholder would turn into an integer-named column once the fuel
# column is one-hot encoded, mixing int and str feature names (which breaks
# string operations on the column labels and is rejected by recent scikit-learn).
vehicle_df2 = vehicle_df1.copy()
vehicle_df2.loc[vehicle_df2['brand'] == 'tesla', 'fuel'] = 'electric'
# Every row that still has a missing value in any column is removed.
vehicle_df2 = vehicle_df2.dropna()

In [None]:
vehicle_df2.isnull().sum()

In [None]:
print(vehicle_df2.shape)
vehicle_df2.head(2)  

Count how many listings each brand has.

In [None]:
# Number of listings per brand; value_counts sorts descending by default.
brand_count = vehicle_df2.brand.value_counts()

In [None]:
# Distribution of brand sizes: x is the number of listings a brand has,
# y is how many brands fall into that bin.
fig, ax = plt.subplots()
ax.hist(brand_count)
# The trailing semicolon suppresses the textual repr of the returned list.
ax.set(title='Listings per brand', xlabel='listings per brand', ylabel='number of brands');

Group every brand that has fewer than 50 listings into a single 'other' category.

In [None]:
# Brands with fewer than 50 listings (index = brand name, value = count).
brand_count_less_than_50 = brand_count.loc[lambda counts: counts < 50]
brand_count_less_than_50

In [None]:
len(vehicle_df2.brand.unique())

In [None]:
# Relabel every brand that appears in the rare-brand index as 'other',
# then show how many distinct brand labels remain.
is_rare_brand = vehicle_df2['brand'].isin(brand_count_less_than_50.index)
vehicle_df2['brand'] = vehicle_df2['brand'].mask(is_rare_brand, 'other')
vehicle_df2['brand'].nunique(dropna=False)

In [None]:
#plt.hist(vehicle_df2.brand)

In [None]:
# Brands that keep their own label: the rare-brand rule is `count < 50`,
# so the complement must include brands with exactly 50 listings.
brand_count[brand_count >= 50]

Check how many distinct models we have and how many listings each of them has.

In [None]:
# Listings per model (descending by default), followed by the number of distinct models.
model_count = vehicle_df2.model.value_counts()
model_count.shape[0]

In [None]:
# Distribution of model sizes: x is the number of listings a model has,
# y is how many models fall into that bin.
fig, ax = plt.subplots()
ax.hist(model_count)
# The trailing semicolon suppresses the textual repr of the returned list.
ax.set(title='Listings per model', xlabel='listings per model', ylabel='number of models');

**Use One Hot Encoding For Brand**


In [None]:
# One indicator column per brand label.
brand_df = pd.get_dummies(vehicle_df2['brand'])
print(brand_df.shape)
brand_df.head(2)

In [None]:
# Remove one indicator ('заз') so that brand acts as the reference category.
brand_df1 = brand_df.drop(columns=['заз'])
print(brand_df1.shape)
brand_df1.head(2)

In [None]:
# Append the brand indicators to the right of the cleaned listings.
vehicle_df3 = pd.concat((vehicle_df2, brand_df1), axis=1)
vehicle_df3.head(2)

In [None]:
# The textual brand column is now redundant with its indicator columns.
vehicle_df4 = vehicle_df3.drop(columns=['brand'])
print(vehicle_df4.shape)
vehicle_df4.head(2)

Use One Hot Encoding For Model

In [None]:
# One indicator column per vehicle model.
model_df1 = pd.get_dummies(vehicle_df4['model'])
print(model_df1.shape)
model_df1.head(2)

In [None]:
# Remove one indicator ('таврія-нова') so that model acts as the reference category.
model_df2 = model_df1.drop(columns=['таврія-нова'])
print(model_df2.shape)
model_df2.head(2)

In [None]:
# Swap the textual model column for its indicator columns.
listings_without_model = vehicle_df4.drop(columns=['model'])
vehicle_df5 = pd.concat((listings_without_model, model_df2), axis='columns')
print(vehicle_df5.shape)
vehicle_df5.head(2)

Dummies body

In [None]:
# One indicator column per body type.
body_df1 = pd.get_dummies(vehicle_df5['body'])
print(body_df1.shape)
body_df1.head(2)

In [None]:
# Remove one indicator ('station wagon') so that body type acts as the reference category.
body_df2 = body_df1.drop(columns=['station wagon'])
print(body_df2.shape)
body_df2.head(2)

In [None]:
# Swap the textual body column for its indicator columns.
listings_without_body = vehicle_df5.drop(columns=['body'])
vehicle_df6 = pd.concat((listings_without_body, body_df2), axis='columns')
print(vehicle_df6.shape)
vehicle_df6.head(2)

Dummies fuel

In [None]:
# One indicator column per fuel value.
fuel_df1 = pd.get_dummies(vehicle_df6['fuel'])
print(fuel_df1.shape)
fuel_df1.head(2)

In [None]:
# Remove one indicator ('petrol') so that fuel acts as the reference category.
fuel_df2 = fuel_df1.drop(columns=['petrol'])
print(fuel_df2.shape)
fuel_df2.head(2)

In [None]:
# Swap the textual fuel column for its indicator columns.
listings_without_fuel = vehicle_df6.drop(columns=['fuel'])
vehicle_df7 = pd.concat((listings_without_fuel, fuel_df2), axis='columns')
print(vehicle_df7.shape)
vehicle_df7.head(2)

Dummies transmission

In [None]:
# One indicator column per transmission value.
transmission_df1 = pd.get_dummies(vehicle_df7['transmission'])
print(transmission_df1.shape)
transmission_df1.head(2)

In [None]:
# Remove one indicator ('typtronik') so that transmission acts as the reference category.
transmission_df2 = transmission_df1.drop(columns=['typtronik'])
print(transmission_df2.shape)
transmission_df2.head(2)

In [None]:
# Swap the textual transmission column for its indicator columns.
listings_without_transmission = vehicle_df7.drop(columns=['transmission'])
vehicle_df8 = pd.concat((listings_without_transmission, transmission_df2), axis='columns')
print(vehicle_df8.shape)
vehicle_df8.head(2)

We have to convert the price column from text to a numeric type.

In [None]:
# Call describe() (the original referenced the bound method without calling it)
# to summarise the raw, still-textual price column.
vehicle_df8['price$'].describe()

In [None]:
# Remove spaces from the price text and turn 'договірна' (Ukrainian for
# "negotiable") into '0' so that every value parses as an integer.
price_text = vehicle_df8['price$'].map(lambda raw: raw.replace(' ', '').replace('договірна', '0'))
vehicle_df8['price'] = price_text.astype('int32')

In [None]:
print(vehicle_df8.shape)
vehicle_df8.head(2)

In [None]:
vehicle_df8[(vehicle_df8['price'] == 0)]

In [None]:
# Remove listings whose price parsed as 0 (the 'договірна' placeholder): they
# carry no target value. Filtering by condition instead of hard-coded index
# labels keeps the cell re-runnable and independent of the exact dataset rows.
vehicle_df8 = vehicle_df8[vehicle_df8['price'] != 0]

In [None]:
print(vehicle_df8.shape)
vehicle_df8.head(2)

In [None]:
# The textual price column is superseded by the numeric 'price' column.
vehicle_df9 = vehicle_df8.drop(columns=['price$'])
print(vehicle_df9.shape)
vehicle_df9.head(2)

**Split our data to x and y datasets**

In [None]:
# Feature matrix: every column except the target.
X = vehicle_df9.drop(columns=['price'])
X.shape

In [None]:
# Target vector: the numeric price.
y = vehicle_df9.loc[:, 'price']
y.shape

Split to train and test datasets

In [None]:
# Hold out 20% of the listings for testing. A fixed random_state makes the
# split (and every score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
print(f"X_train shape is {X_train.shape} and y_train shape is {y_train.shape}")
print(f"X_test shape is {X_test.shape} and y_test shape is {y_test.shape}")

**Let's build model, and find the best solution**

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

**Find best model using GridSearchCV**

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

def find_best_model_using_gridsearchcv(X, y, random_state=0):
    """Grid-search several regressors and report the best configuration of each.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    random_state : seed for the cross-validation splitter and for every
        estimator that has a random component, so repeated calls give the
        same table. Defaults to 0.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with the columns ``model``, ``best_score``
        (``GridSearchCV.best_score_``) and ``best_params``.

    Notes
    -----
    The grids use the parameter names of scikit-learn >= 1.0:
    ``LinearRegression(normalize=...)`` and the ``'mse'`` tree criterion were
    removed in scikit-learn 1.2, so the linear grid searches ``fit_intercept``
    and the tree grid uses ``'squared_error'`` instead.
    """
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                'fit_intercept': [True, False],
            },
        },
        'lasso': {
            # Seeded because selection='random' picks coefficients at random.
            'model': Lasso(random_state=random_state),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic'],
            },
        },
        'decision_tree': {
            # Seeded because splitter='random' draws random split points.
            'model': DecisionTreeRegressor(random_state=random_state),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random'],
            },
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=random_state),
            'params': {
                'n_estimators': list(range(10, 100, 20)),
                'max_depth': [5, 8, None],
            },
        },
    }

    # Five random 80/20 splits, shared by every algorithm so scores are comparable.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=random_state)

    scores = []
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_,
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

find_best_model_using_gridsearchcv(X_train,y_train)

The best result is RandomForestRegressor

In [None]:
# Final estimator: a 50-tree random forest. The fixed random_state makes the
# fitted trees (and therefore the test score and predictions) reproducible.
model = RandomForestRegressor(n_estimators=50, random_state=0)

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

Let's make some predictions.

In [None]:
def predict_price(year, car_mileage, power, brand, model_, body, fuel, transmission,
                  feature_columns=None, estimator=None):
    """Predict the price of one vehicle described by raw attribute values.

    A single feature row is built in the layout of the training matrix: the
    first three positions receive ``year``, ``car_mileage`` and ``power``, and
    the one-hot column whose label equals each categorical value is set to 1.
    A categorical value without a matching column leaves the row unchanged
    (it is then treated like the dropped reference category).

    Parameters
    ----------
    year, car_mileage, power : numeric values written to feature positions
        0, 1 and 2. NOTE(review): this assumes the first three feature
        columns hold these quantities in this order - confirm against the data.
    brand, model_, body, fuel, transmission : category labels looked up among
        the one-hot column labels.
    feature_columns : optional sequence of column labels; defaults to the
        notebook-level ``X.columns``.
    estimator : optional fitted object with a ``predict`` method; defaults to
        the notebook-level ``model``.

    Returns
    -------
    The first element of ``estimator.predict`` for the single row.

    Notes
    -----
    Labels are matched across all one-hot columns at once, so a label shared
    by several groups resolves to the first matching column.
    """
    columns = list(X.columns if feature_columns is None else feature_columns)
    regressor = model if estimator is None else estimator

    # Only positions after the numeric features are eligible for one-hot
    # matching; otherwise a category value equal to a numeric column label
    # would overwrite that numeric feature with 1.
    n_numeric = 3
    first_position = {}
    for position, label in enumerate(columns):
        if position >= n_numeric:
            first_position.setdefault(label, position)

    x = np.zeros(len(columns), dtype='float32')
    x[0] = year
    x[1] = car_mileage
    x[2] = power
    for value in (brand, model_, body, fuel, transmission):
        position = first_position.get(value)
        if position is not None:
            x[position] = 1

    # Pass a labelled frame so an estimator fitted on a DataFrame sees the
    # same feature names at prediction time.
    row = pd.DataFrame([x], columns=columns)
    return regressor.predict(row)[0]

In [None]:
predict_price(2002, 297, 1.8, 'skoda', 'octavia', 'liftback', 'gas', 'manual')

In [None]:
vehicle_df2.head(5)

**Export the tested model to a pickle file**

In [None]:
import pickle
# Serialize the estimator object `model` to 'vehicle_prices_model.pickle' in
# binary mode; the `with` block closes the file when the dump finishes.
with open('vehicle_prices_model.pickle','wb') as f:
    pickle.dump(model,f)

**Export brand and column information to a file that will be useful later**

In [None]:
import json
# Save the lower-cased feature labels, in training order, for later use.
# get_dummies uses the raw category values as column labels, so a label is not
# guaranteed to be a string; str() keeps the export from failing on such labels.
columns = {
    'data_columns': [str(col).lower() for col in X.columns]
}
with open("columns.json", "w") as f:
    json.dump(columns, f)