# **Price prediction using Random Forest**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the BMW pricing dataset from the Kaggle input directory (relative path).
df = pd.read_csv('../input/bmw-pricing-challenge/bmw_pricing_challenge.csv')
# Display the first rows as this cell's output.
df.head()

In [None]:
# Features: every column except the target ('price') and 'maker_key'.
x = df.drop(columns=['price', 'maker_key'])
# Target: the price column.
y = df['price']

> ***'maker_key' has only one category, so we can drop it.***

In [None]:
# Joint plot of engine power against price; kind='reg' requests the regression variant.
sns.jointplot(x=x['engine_power'], y=y, kind='reg', height=10)

> ***The 'feature_1' … 'feature_8' columns contain boolean values; we can replace them with 0 and 1.***

In [None]:
# Convert the eight feature flags to integer 0/1 columns.
# astype(int) is used instead of pd.get_dummies(..., drop_first=True) because
# get_dummies returns a DataFrame (not a column), yields bool dtype on recent
# pandas, and produces no column at all when a flag has one distinct value.
# NOTE(review): assumes the flags were parsed as booleans by read_csv — confirm.
feature_cols = ['feature_{}'.format(i) for i in range(1, 9)]
x[feature_cols] = x[feature_cols].astype(int)

In [None]:
# Correlation heatmap of the features and the target.
plt.figure(figsize=(10,10))
# x still contains text/date columns at this point; recent pandas raises on
# them in .corr(), so keep only numeric and boolean columns explicitly.
numeric_cols = pd.concat([x, y], axis=1).select_dtypes(include=['number', 'bool'])
corr = numeric_cols.corr()
sns.heatmap(corr, annot=True)

In [None]:
# Price distribution per car model, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(x=x['model_key'], y=y, ax=ax)

> ***"model_key" has many categories, so we can assign a unique number to each category.***

In [None]:
# Label-encode 'model_key': each distinct model gets an integer code in
# order of first appearance.
unique_models = x['model_key'].unique()
model_key = dict(zip(unique_models, range(len(unique_models))))
x['model_key'] = x['model_key'].map(model_key)

In [None]:
# Label-encode 'fuel': each distinct fuel type gets an integer code in
# order of first appearance.
unique_fuels = x['fuel'].unique()
fuel = dict(zip(unique_fuels, range(len(unique_fuels))))
x['fuel'] = x['fuel'].map(fuel)

In [None]:
# Preview the features after 'model_key' and 'fuel' were mapped to integer codes.
x.head()

In [None]:
# Side-by-side box plots of price by car type (left) and paint colour (right).
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
fig, (ax_car_type, ax_paint) = plt.subplots(1, 2, figsize=(18, 5))
sns.boxplot(x=x['car_type'], y=y, ax=ax_car_type)
sns.boxplot(x=x['paint_color'], y=y, ax=ax_paint)

> ***Here we can see that 'car_type' and 'paint_color' are not very important for our model, so we can drop them.***


In [None]:
# Remove the two categorical columns judged uninformative above.
x = x.drop(columns=['car_type', 'paint_color'])

In [None]:
# Preview the features after dropping 'car_type' and 'paint_color'.
x.head()

In [None]:
# Price distribution per sale date.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
plt.figure(figsize=(20,10))
sns.boxplot(x=x['sold_at'], y=y)

* > **'sold_at'** is also not important for our model, so we can drop it.
* > One last thing: 'registration_date' is a date variable, so we split it into year and month.

In [None]:
# Derive year and month from the registration date, then discard the raw
# date columns ('registration_date' and 'sold_at').
registered_on = pd.to_datetime(x['registration_date'])
x = x.assign(
    registration_year=registered_on.dt.year,
    registration_month=registered_on.dt.month,
).drop(columns=['registration_date', 'sold_at'])

In [None]:
# Preview the final feature table with 'registration_year' and 'registration_month'.
x.head()

## Now it's time to split the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Hold out 25% of the rows as a test set; random_state fixes the shuffle so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=5)

## Model selection

In [None]:
# Search space for the random-forest hyper-parameters.
# 'random_state' is deliberately not part of the space: the seed is not a
# hyper-parameter, and picking the "best" seed only selects on noise.
params = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30],
    'n_estimators': [1, 5, 10, 50, 100, 150, 200, 250, 300, 350, 400, 500, 1000, 2000],
    'max_leaf_nodes': [5, 10, 15, 20, 30, 40, 50, 55, 60, 70, 80, 85, 90, 95, 100],
}

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Randomized hyper-parameter search with 5-fold cross-validation.
# The search is fitted on the training split only, so the held-out test rows
# never influence model selection. Fixed seeds make the run reproducible.
model = RandomForestRegressor(random_state=5)
randomizedCV = RandomizedSearchCV(model, param_distributions=params, cv=5, verbose=3,
                                  random_state=5)
randomizedCV.fit(x_train, y_train)

> **Finding the best estimator for our model**

In [None]:
# Show the estimator refitted with the best parameter combination found by the search.
randomizedCV.best_estimator_

In [None]:
# Final model using the tuned hyper-parameters. Only non-default values are
# passed: the previous explicit defaults (criterion='mse', max_features='auto',
# min_impurity_split=None) are rejected by current scikit-learn releases.
model = RandomForestRegressor(max_depth=15, max_leaf_nodes=70,
                              n_estimators=1000, random_state=7)
# Fit on the training split only; fitting on all rows would leak the test
# rows into the model and inflate the score below.
model.fit(x_train, y_train)
# R^2 (coefficient of determination) on the held-out test set.
model.score(x_test, y_test)

> **Wow! Our model reaches an R² score of almost 0.9 (about 90%) on the test set.**

In [None]:
# Mean absolute error of the predictions on the held-out test set.
y_pred = model.predict(x_test)
mean_absolute_error(y_test, y_pred)