In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from statistics import median
from scipy.stats import iqr

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

### Problem statement - bmw car price prediction model (best possible features for good predictive model)
     - engine_capacity
     - types of fuel (diesel cars are costly; )
     - color /paint of the car
     - no. of seaters
     - mileage 
     - model type
     - safety features (low/medium/large)
     - date of selling (seasonality)
     - IsAutomative (yes/no)
     - 
     --- second hand cars
     - distance travelled 
     - mileage 
     - age of the car
     - warrenty flag
     - insurance flag
     - condition (poor/ok/good)

In [2]:
df = pd.read_csv('bmw_pricing_challenge.csv')
print(df.shape)
# df.info()

(4843, 18)


In [3]:
df.describe()

Unnamed: 0,mileage,engine_power,price
count,4843.0,4843.0,4843.0
mean,140962.8,128.98823,15828.081767
std,60196.74,38.99336,9220.285684
min,-64.0,0.0,100.0
25%,102913.5,100.0,10800.0
50%,141080.0,120.0,14200.0
75%,175195.5,135.0,18600.0
max,1000376.0,423.0,178500.0


In [4]:
df.head()

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at
0,BMW,118,140411,100,01-02-2012,diesel,black,convertible,True,True,False,False,True,True,True,False,11300,01-01-2018
1,BMW,M4,13929,317,01-04-2016,petrol,grey,convertible,True,True,False,False,False,True,True,True,69700,01-02-2018
2,BMW,320,183297,120,01-04-2012,diesel,white,convertible,False,False,False,False,True,False,True,False,10200,01-02-2018
3,BMW,420,128035,135,01-07-2014,diesel,red,convertible,True,True,False,False,True,True,True,True,25100,01-02-2018
4,BMW,425,97097,160,01-12-2014,diesel,silver,convertible,True,True,False,False,False,True,True,True,33400,01-04-2018


### EDA

In [5]:
## duplicate check
df.duplicated().sum()

0

In [6]:
df.duplicated(['model_key', 'mileage', 'engine_power',
       'registration_date', 'fuel', 'paint_color', 'car_type', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7', 'feature_8', 'price', 'sold_at']).sum()

0

In [7]:
### missing values handling
# df.isnull().sum()

    - if missing values, should we drop them or handle them?
    - column wise
        - if drop? if a column has >= 70% of the values missing - drop the column
        - if a column has missing values < 70 % 
            - missing ~ (5-10%) 
            - missing ~ (40 - 50%)
            - missing ~ (60 - 70%) 
             - filling with central tendancy metrics(mean/median/mode), predicitve imputation, correlation imputation, fill with random values between +- 2 standard deviations from mean/median

    - row wise
        - if a column has 5% missing values, don't drop column but drop all those rows
        - 100 rows (classes 0/1 where (0) 95 rows, (1) 5 rows) (Imbalanced dataset -> minority class and majority class)
            index  col    target
            0       NA     1
            1       50     0
            2       NA     0
            3       NA     1
            4       69     1
            5       70     0

In [8]:
# # predicitve imputation
# 10 columns 100 rows
# 9 columns + 10th column (missing values)
# out of 100 rows (75 rows have certain values and 25 rows have missing values)
# build a model on (75 rows+9 columns, target-> 10th columns  )

## Encoding 
 - OHE (usage: nominal)
 - Label Encoding (usage: ordinal, range, intervals )   

In [9]:
bool_f = df.select_dtypes('boolean').columns

# for col in bool_f:
#     print(col, df[col].unique())

In [10]:
for col in bool_f:
    df[col] = df[col].apply(lambda x: 1 if x==True else 0)

In [11]:
# df[bool_f]

In [12]:
# categorical_f = df.select_dtypes('object').columns

In [13]:
# categorical_f = [ 'model_key', 'fuel', 'paint_color','car_type']

In [14]:
# df[categorical_f]

In [15]:
# distinct values for each categorical features
# for col in categorical_f:
#     print(col, df[col].unique(), '=>', df[col].nunique())

In [16]:
## if assigns (1,2,3,4) for encoding fuel ; which level should be assigned with what value?
# fuel ['diesel' 'petrol' 'hybrid_petrol' 'electro'] [3,2,4,1]

#### Encoding 'fuel' types

In [17]:
# ### plot frequency of each of the fuel types and five point summary on price for each fuel type
# fig, ax = plt.subplots(1,2,figsize = (15,4))
# sns.countplot(df.fuel, ax = ax[0])
# sns.boxplot(x ='price',y = 'fuel', data = df, ax = ax[1])
# plt.show()


# ### get plot normalised metrices on price for each fuel type
# fuel_PU = []
# for each in df.fuel.unique():
#     x = df[df.fuel==each]
#     fuel_PU.append(median(x['price'])) # getting median

# fig, ax = plt.subplots(1,2,figsize = (15,4))
# sns.barplot(fuel_PU, df.fuel.unique(), ax = ax[0])
# ax[0].set_xlabel('Median Price')
# # ax[1].set_ylabel('Fuel Type')

# fuel_PU = []
# for each in df.fuel.unique():
#     x = df[df.fuel==each]
#     fuel_PU.append(np.mean(x['price']))

# sns.barplot(fuel_PU, df.fuel.unique(), ax = ax[1])
# ax[1].set_xlabel('Avg. Price')
# # ax[2].set_ylabel('Fuel Type')

# plt.show()

- Quick observations:
    - Median price of hybrid-petrol cars are highest amongst all types, while petrol cars have lowest median price. If median is to be considered as deciding metric then, order of weightage for each fuel types follows:
        - {'diesel':2,'petrol':1,'hybrid_petrol':4, 'electro':3}
    - Average price for each fuel type suggest weightage as: 
        - {'diesel':2,'petrol':1,'hybrid_petrol':4, 'electro':3}
    - Negligible outliers impact on price by fuel type
    - Hence, fuel can be lacoded with the weightage assignment as follows:
     - {'diesel':2,'petrol':1,'hybrid_petrol':4, 'electro':3}

In [18]:
df['fuel_E'] = df.fuel.map({'diesel':2,'petrol':1,'hybrid_petrol':4, 'electro':3})

In [19]:
# ### plot frequency of each of the paint_color and five point summary on price for each paint_color  type
# fig, ax = plt.subplots(1,2,figsize = (15,4))
# sns.countplot(df.paint_color , ax = ax[0]).set_title('Frequency of car paint color')
# sns.boxplot(x ='price',y = 'paint_color', data = df, ax = ax[1])
# plt.show()


# ### get plot normalised metrices on price for each paint_color  type
# paint_color_PU = []
# for each in df.paint_color .unique():
#     x = df[df.paint_color ==each]
#     paint_color_PU.append(median(x['price'])) # getting median

# fig, ax = plt.subplots(1,2,figsize = (15,4)) 
# sns.barplot(sorted(paint_color_PU), df.paint_color.unique()[np.argsort(paint_color_PU)], ax = ax[0])
# ax[0].set_xlabel('Median Price')
# # ax[1].set_ylabel('paint_color  Type')

# paint_color_PU = []
# for each in df.paint_color .unique():
#     x = df[df.paint_color ==each]
#     paint_color_PU.append(np.mean(x['price']))

# sns.barplot(sorted(paint_color_PU), df.paint_color.unique()[np.argsort(paint_color_PU)], ax = ax[1])
# ax[1].set_xlabel('Avg. Price')
# # ax[2].set_ylabel('paint_color  Type')

# plt.show()

In [20]:
# paint_col_each= [np.mean(df[df.paint_color==each]['price']) for each in df.paint_color.unique()]
# # dummy dataframe
# x = pd.DataFrame(df.paint_color.unique(), columns=['paint_color'])
# x['total_avgPrice']  = df['price'].mean()
# x['avgPrice_by_color'] = paint_col_each
# x['diff_avgPrice'] = (x.total_avgPrice - x.avgPrice_by_color)/x.total_avgPrice*100
# x.sort_values('diff_avgPrice')

- Quick observations:
    - Orange cars have almost same median and average price and highest amost all.
    - Green cars have least median and average price.
    - Other colors of the cars median and average prices are changing in order.
    - All the cars paint colors except orange, white, silver and green, average price is within 5% of the total price.
 - following the observations below weightage can be used for label encoding
        - orange -> 5
        - white -> 4
        - remaing -> 3
        - silver -> 2
        - green -> 1


In [21]:
df['paint_color_E'] = df.paint_color.apply(lambda x : 5 if x == 'orange' else (
4 if x == 'white' else (
2 if x == 'silver' else (
1 if x == 'green' else 3))))

In [22]:
# model_key_PU = []
# for each in df.model_key.unique():
#     x = df[df.model_key==each]
#     model_key_PU.append(np.mean(x['price']))
    
# len(model_key_PU)    
# plt.figure(figsize = (15,20))
# x = pd.concat([pd.DataFrame(df.model_key.unique(), columns=['Key']), pd.DataFrame(model_key_PU,columns=['value'])], axis = 1).sort_values('value')
# sns.barplot(x='value',y='Key' ,data = x)
# plt.xlabel('Avg. Price')
# plt.ylabel('Model Keys')
# plt.show()

In [23]:
df['model_key_E'] = df.model_key.apply(lambda x : 5 if x == 'i8' else (
5 if x == 'M4' else (
4 if x == 'X6 M' else (
1 if x == 'X5 M50' else (
1 if x in ('735', '216', '523', '650', '123', 'Z4', '118', '116', '316',
       '630', '318', '114', '220 Active Tourer', '320', '120', '125',
       '216 Active Tourer', 'X1') else (
2 if x in ( '325', '318 Gran Turismo', '525',
       '218 Active Tourer', '520', '218 Gran Tourer', '518', '328', '330',
       '216 Gran Tourer', '218', '320 Gran Turismo', '214 Gran Tourer',
       'X3', '225', '225 Active Tourer', '635', '530', '520 Gran Turismo',
       '528', '325 Gran Turismo', '418 Gran Coupé', '530 Gran Turismo',
       'ActiveHybrid 5', 'i3', '335', '135') else (
3)))))))

In [24]:
# car_type = []
# for each in df.car_type.unique():
#     x = df[df.car_type==each]
#     car_type.append(np.mean(x['price']))
    
# x = pd.concat([pd.DataFrame(df.car_type.unique(), columns=['Key']), pd.DataFrame(car_type,columns=['value'])], axis = 1).sort_values('value', ascending= False)
# plt.figure(figsize = (10,4))
# sns.barplot(x='value',y='Key' ,data = x)
# plt.xlabel('Price')
# plt.ylabel('Car Types')
# plt.show()

In [25]:
df['car_type_E'] = df.car_type.apply(lambda x : 1 if x == 'subcompact' else (
2 if x == 'estate' else (
3 if x == 'hatchback' else (
4 if x == 'van' else (
5 if x == 'sedan' else (
6 if x == 'convertible' else (
7 if x == 'suv' else 8)))))))

#### Capturing interraction effects between features

In [26]:
# ### relation between model_key and mileage
# mileage_N = []
# for each in df.model_key.unique():
#     x = df[df.model_key==each]
#     mileage_N.append(sum(x['mileage'])/len(x))
    
# x = pd.concat([pd.DataFrame(df.model_key.unique(), columns=['Key']), pd.DataFrame(mileage_N,columns=['value'])], axis = 1).sort_values('value', ascending= False)
# plt.figure(figsize = (10,15))
# sns.barplot(x='value',y='Key' ,data = x)
# plt.xlabel('Mileage')
# plt.ylabel('Model Key')

In [27]:
df['model_key_mileage_rel'] = df.model_key.apply(lambda x : 5 if x == '523' else (
4 if x in ('530 Gran Turismo', '735', '635', '525', '335', '530') else (
3 if x in ('535', '325 Gran Turismo', '518', '520', '320',
       '220 Active Tourer', '730', '318', 'M5', '630', '318 Gran Turismo',
       '316', 'M550', '520 Gran Turismo', '335 Gran Turismo', '118', 'X3',
       '120', '330', '320 Gran Turismo', 'X5') else (
2 if x in ('X1', '740', '325', '328',
       'X6', '528', '640 Gran Coupé', '123', '116', '135', 'M3', 'X5 M',
       '750', '640', '418 Gran Coupé', '216 Active Tourer',
       '430 Gran Coupé', '125', 'Z4') else (
1)))))

In [28]:
# fig, ax = plt.subplots(2,2,figsize = (20,8))
# sns.distplot(df.engine_power, ax = ax[0,0])
# sns.distplot(df.mileage, ax = ax[0,1], color = 'green')
# sns.scatterplot(df.mileage, df.price , ax = ax[1,1],color = 'green')
# sns.scatterplot(df.engine_power, df.price , ax = ax[1,0])

In [29]:
# M1 (April) - 80
# M2 (April) - 77

# M1 (May) - 79
# M2 (May) - 79.1

In [30]:
# ### bucketing price in range of 100
# df['EP_bucket'] = df.engine_power.apply(lambda x: '[100)' if x < 100 else ('[100-200)'  if x < 200 else ('[200-300)' if x < 300 else ('[300-400)]' if x < 400 else '[400)') )))
# x = df.groupby('EP_bucket')['price'].mean().reset_index()
# sns.barplot(x = x.EP_bucket, y = x.price).set_ylabel('Avg. price')
# plt.show()
# x = df.groupby('EP_bucket')['price'].count().reset_index()
# sns.barplot(x = x.EP_bucket, y = x.price).set_ylabel('Avg. price')
# plt.show()
# df.groupby('EP_bucket')['price'].describe()

- Note: engine_power is already numerical form and can be feed as a feature in raw form. 
    - more analysis can be done to find any peculiar point from the data 

### target analysis

In [31]:
# fig, ax = plt.subplots(1,3,figsize = (15,4))
# sns.distplot(df.price, color = 'green', ax = ax[0])
# sns.distplot(np.log(df.price), color = 'blue', ax = ax[1])
# sns.distplot(np.sqrt(df.price), color = 'c', ax = ax[2])
# plt.show()

In [32]:
## correlaation 
df[['mileage', 'engine_power','price']].corr()

Unnamed: 0,mileage,engine_power,price
mileage,1.0,-0.050116,-0.409564
engine_power,-0.050116,1.0,0.638989
price,-0.409564,0.638989,1.0


### date related features analysis

In [33]:
df.registration_date = pd.to_datetime(df.registration_date)
df.sold_at = pd.to_datetime(df.sold_at)
df['gap_reg_sold_year'] =  df.sold_at.dt.year - df.registration_date.dt.year 
df['reg_month'] = df.registration_date.dt.month
df['gap_reg_sold_days'] =  df.sold_at - df.registration_date 
df['gap_reg_sold_days'] = df.gap_reg_sold_days.apply(lambda x: int(str(x).split()[0]))

In [34]:
df.head()

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at,fuel_E,paint_color_E,model_key_E,car_type_E,model_key_mileage_rel,gap_reg_sold_year,reg_month,gap_reg_sold_days
0,BMW,118,140411,100,2012-01-02,diesel,black,convertible,1,1,0,0,1,1,1,0,11300,2018-01-01,2,3,1,6,3,6,1,2191
1,BMW,M4,13929,317,2016-01-04,petrol,grey,convertible,1,1,0,0,0,1,1,1,69700,2018-01-02,1,3,5,6,1,2,1,729
2,BMW,320,183297,120,2012-01-04,diesel,white,convertible,0,0,0,0,1,0,1,0,10200,2018-01-02,2,4,1,6,3,6,1,2190
3,BMW,420,128035,135,2014-01-07,diesel,red,convertible,1,1,0,0,1,1,1,1,25100,2018-01-02,2,3,3,6,1,4,1,1456
4,BMW,425,97097,160,2014-01-12,diesel,silver,convertible,1,1,0,0,0,1,1,1,33400,2018-01-04,2,2,3,6,1,4,1,1453


In [35]:
# ### hypothesis: How does the estimated value of a car change over time?
# x = df.groupby('gap_reg_sold_year')['price'].mean().reset_index()
# sns.lineplot(x.gap_reg_sold_year, x.price ,color = 'green').set_xlabel('year diff bet/ registration and sold date')
# plt.show()

--- Practice analysis (perform analysis to answer following questions)

Additional time analysis of price
- price pattern wrt. month
- price pattern weekdays
- price pattern by gap_reg_sold_days

- correlation check 
        - pearson correlation coefficient (degree linear relationship between two continuous variables)
                - V1 V2   Type of correlation
                - C C    -> pearson correlation coefficient
                - C D    -> ANOVA test / spermans correlation coefficient
                - D C    -> ANOVA test
                - D D    -> chi-square test

In [36]:
## correlaation 
df[['mileage', 'engine_power','price','gap_reg_sold_year']].corr()

Unnamed: 0,mileage,engine_power,price,gap_reg_sold_year
mileage,1.0,-0.050116,-0.409564,0.507261
engine_power,-0.050116,1.0,0.638989,-0.082577
price,-0.409564,0.638989,1.0,-0.449878
gap_reg_sold_year,0.507261,-0.082577,-0.449878,1.0


In [37]:
### drop records where mileage is negative and engine_power is 0
df.drop(df[df.mileage < 0].index, axis = 0, inplace = True)
df.drop(df[df.engine_power == 0].index, axis = 0, inplace = True)

In [38]:
df.head()

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at,fuel_E,paint_color_E,model_key_E,car_type_E,model_key_mileage_rel,gap_reg_sold_year,reg_month,gap_reg_sold_days
0,BMW,118,140411,100,2012-01-02,diesel,black,convertible,1,1,0,0,1,1,1,0,11300,2018-01-01,2,3,1,6,3,6,1,2191
1,BMW,M4,13929,317,2016-01-04,petrol,grey,convertible,1,1,0,0,0,1,1,1,69700,2018-01-02,1,3,5,6,1,2,1,729
2,BMW,320,183297,120,2012-01-04,diesel,white,convertible,0,0,0,0,1,0,1,0,10200,2018-01-02,2,4,1,6,3,6,1,2190
3,BMW,420,128035,135,2014-01-07,diesel,red,convertible,1,1,0,0,1,1,1,1,25100,2018-01-02,2,3,3,6,1,4,1,1456
4,BMW,425,97097,160,2014-01-12,diesel,silver,convertible,1,1,0,0,0,1,1,1,33400,2018-01-04,2,2,3,6,1,4,1,1453


## Modelling



In [39]:
# import packages
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_log_error

In [40]:
features = ['mileage', 'engine_power', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7', 'feature_8', 'fuel_E', 'paint_color_E',
       'model_key_E', 'car_type_E', 'model_key_mileage_rel','gap_reg_sold_year']

target = 'price'

In [41]:
## splitting data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size = 0.2, random_state = 42 )

In [42]:
### Model with default parameters; features scaling has not happened
RFModel = RandomForestRegressor().fit(X_train,y_train) # training
## 
print('Train R^2 score {}'.format(r2_score(y_train, RFModel.predict(X_train))))
print('Test R^2 score {}'.format(r2_score(y_test, RFModel.predict(X_test))))

print('Train mean_squared_log_error {}'.format(mean_squared_log_error(y_train, RFModel.predict(X_train))))
print('Test mean_squared_log_error {}'.format(mean_squared_log_error(y_test, RFModel.predict(X_test))))

Train R^2 score 0.9699792070413173
Test R^2 score 0.8300708348034823
Train mean_squared_log_error 0.037747102763997745
Test mean_squared_log_error 0.15259793846632977


In [43]:
# - LinearRegression => assumptions (linear rel., dataset size should small, not too many categorical features, no multi-correlationary, normality, homoscedacity, etc)
# - RandomForestRegressor => adv. (handles non-linearity, to big data, lot of categorical features) ; dis(prone to overfit, highly unstable)
# - SVR => adv. (kernal); dis (computationally expensive)

In [44]:
## scaling features
scaler = StandardScaler().fit(X_train,y_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [45]:
# AIC, BIC

In [46]:
## Get r-suqare score with each models
for model in [LinearRegression(), Ridge(), RandomForestRegressor(),SVR()]:
    scores = cross_val_score(model, X_train, y_train, cv= 5, scoring='r2',n_jobs=-1)
    print('Model =>',str(model))
    print('Avg. R^2_score: {}, std:{}, scores:{}'.format(np.mean(scores),np.std(scores), scores))

Model => LinearRegression()
Avg. R^2_score: 0.7214168743899274, std:0.12561179225406155, scores:[0.77475499 0.47120839 0.77058777 0.80291597 0.78761725]
Model => Ridge()
Avg. R^2_score: 0.7214193140339191, std:0.12561003132796467, scores:[0.77476382 0.4712138  0.77058516 0.80290741 0.78762638]
Model => RandomForestRegressor()
Avg. R^2_score: 0.8097154363845716, std:0.12982224968807457, scores:[0.85497366 0.5514235  0.87046327 0.89635263 0.87536413]
Model => SVR()
Avg. R^2_score: -0.015708698951961475, std:0.010982894758966452, scores:[-0.0117084  -0.00742686 -0.01022372 -0.01173519 -0.03744933]


In [47]:
# ### Model with default parameters; features are scaled
# RFModel = RandomForestRegressor().fit(X_train,y_train) # training
# ## 
# print('Test R^2 score {}'.format(r2_score(y_test, RFModel.predict(X_test))))
# print('Test mean_squared_log_error {}'.format(mean_squared_log_error(y_test, RFModel.predict(X_test))))

In [48]:
### Model with default parameters; features are scaled
RFModel = RandomForestRegressor(random_state=42).fit(X_train,y_train) # training
## 
print('Test R^2 score {}'.format(r2_score(y_test, RFModel.predict(X_test))))
print('Test mean_squared_log_error {}'.format(mean_squared_log_error(y_test, RFModel.predict(X_test))))

Test R^2 score 0.8047317402215458
Test mean_squared_log_error 0.1516159195455402


In [49]:
### Model with default parameters; features are scaled
RFModel = RandomForestRegressor(oob_score=True,random_state=42).fit(X_train,y_train) # training
## 
# print('Test R^2 score {}'.format(r2_score(y_train, RFModel.predict(X_train))))
print('Test R^2 score {}'.format(r2_score(y_test, RFModel.predict(X_test))))
print('Test mean_squared_log_error {}'.format(mean_squared_log_error(y_test, RFModel.predict(X_test))))

Test R^2 score 0.8047317402215458
Test mean_squared_log_error 0.1516159195455402


## 
- hyper parameter tunning
- feature engineering

In [50]:
RandomForestRegressor

sklearn.ensemble._forest.RandomForestRegressor

- Is the model overfitted?
- can we reply on only R^2 score?
- Adjusted R^2 score

In [51]:
# no. of trees -> 550 
# (10 - 1000) -> 
# (400 - 700)
# 100, 1000 => 100
# parameter value : scores

#### Hyper parameters tunning
    - GridSearchCV() => adv(we cover whole feature space) ; dis(computationally expensive)
    - RandomSearchCV() => adv(might miss few feature space) ; adv(computationally inexpensive)
    -  Normal Loop
    
 Hyper parameters of randomforest
- no. of trees
- no. samples 
- depth
- max features

In [52]:
# n_estimators=100,
#     *,
#     criterion='mse',
#     max_depth=None,
#     min_samples_split=2,
#     min_samples_leaf=1,
#     min_weight_fraction_leaf=0.0,
#     max_features='auto',
#     max_leaf_nodes=None,
#     min_impurity_decrease=0.0,
#     min_impurity_split=None,
#     bootstrap=True,
#     oob_score=False,
#     n_jobs=None,

In [53]:
param = {'n_estimators':range(50,200,50),
        'max_depth': [2,3,4,5,6,7]}

grid = GridSearchCV(estimator = RFModel, param_grid=param, cv=3, scoring = 'r2', n_jobs = -1)
grid = grid.fit(X_train,y_train)

In [54]:
grid.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=3,
             estimator=RandomForestRegressor(oob_score=True, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7],
                         'n_estimators': range(50, 200, 50)},
             scoring='r2')>

In [55]:
grid.best_estimator_

RandomForestRegressor(max_depth=7, n_estimators=150, oob_score=True,
                      random_state=42)

In [56]:
grid.best_score_

0.7669781755922026

In [57]:
pd.DataFrame(grid.cv_results_).sort_values(by = 'mean_test_score', ascending = False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
17,1.204308,0.148679,0.030318,0.008987,7,150,"{'max_depth': 7, 'n_estimators': 150}",0.617916,0.824859,0.85816,0.766978,0.106276,1
16,1.006422,0.026229,0.028317,0.002624,7,100,"{'max_depth': 7, 'n_estimators': 100}",0.61833,0.824893,0.855936,0.766386,0.105456,2
15,0.559012,0.045611,0.015324,0.004712,7,50,"{'max_depth': 7, 'n_estimators': 50}",0.619444,0.809148,0.855003,0.761198,0.101969,3
14,1.463494,0.076382,0.037979,0.007114,6,150,"{'max_depth': 6, 'n_estimators': 150}",0.606748,0.818227,0.839618,0.754864,0.105097,4
13,0.913809,0.016407,0.032648,0.002356,6,100,"{'max_depth': 6, 'n_estimators': 100}",0.606773,0.817206,0.834961,0.75298,0.103638,5


The current scores can be further improved by selecting optimal feature space. Kindly work on them as discussed.

##### Advanced Model
    - Xgboost (boosting)
    - RandomForest (bagging)