In [1]:
import pandas as pd
import numpy as np

In [59]:
# Both splits live in the same workbook, one sheet each.
dataset_path = '../data/DS_ML Coding Challenge Dataset.xlsx'
train_dataset, test_dataset = (
    pd.read_excel(dataset_path, sheet_name=sheet_name)
    for sheet_name in ('Training Dataset', 'Test Dataset')
)

In [60]:
# Keep pristine copies of both splits so they can be restored without
# re-reading the workbook.
a, b = train_dataset.copy(), test_dataset.copy()

In [61]:
# Reset the working frames from the backups `a` (train) and `b` (test).
train_dataset, test_dataset = a.copy(), b.copy()

In [62]:
def preprocess_data(dataset):
    '''
    Returns X and y after converting categorical variables to one-hot encoding and creating time features.

    The input frame is never modified: renaming, feature creation and encoding
    all happen on a copy, so calling this function repeatedly on the same frame
    always gives the same result.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw data. Once 'ProductType' has been renamed to 'ProductName' and
        spaces have been removed from every column name, the frame must contain
        'MonthofSourcing', 'SourcingCost', 'ProductName', 'Manufacturer',
        'AreaCode', 'SourcingChannel', 'ProductSize' and 'ProductType'.

    Returns
    -------
    X : numpy.ndarray
        Every column except 'MonthofSourcing' and 'SourcingCost', i.e. 'Year',
        'Month' and the one-hot columns (first level of each categorical dropped).
    y : numpy.ndarray
        The 'SourcingCost' column.

    Notes
    -----
    The one-hot columns are derived from the categories present in `dataset`
    itself. Frames encoded separately (e.g. train and test) only share a column
    layout if they contain the same set of categories.
    '''
    categorical_columns = ['ProductName', 'Manufacturer', 'AreaCode',
                           'SourcingChannel', 'ProductSize', 'ProductType']

    # Renaming column and removing spaces. `rename` without inplace returns a
    # new frame, so the caller's data is left untouched. The rename must happen
    # BEFORE the space removal: the raw 'ProductType' becomes 'ProductName',
    # and only then can a column whose name contains a space collapse to
    # 'ProductType' without clashing.
    frame = dataset.rename(columns={'ProductType': 'ProductName'})
    frame.columns = [column_name.replace(' ', '') for column_name in frame.columns]

    # Creating time features (the date column is parsed once and reused)
    sourcing_month = pd.DatetimeIndex(frame['MonthofSourcing'])
    frame['Year'] = sourcing_month.year
    frame['Month'] = sourcing_month.month

    # Creating one-hot-encoding for categorical variables; each column name is
    # used as the prefix of its dummy columns (the get_dummies default).
    frame = pd.get_dummies(frame, columns=categorical_columns, drop_first=True)

    # Creating X and y
    X = frame.drop(['MonthofSourcing', 'SourcingCost'], axis=1).values
    y = frame['SourcingCost'].values

    return X, y

In [63]:
X_train, y_train = preprocess_data(train_dataset)
X_test, y_test = preprocess_data(test_dataset)

# The two splits are one-hot encoded independently, so a category present in
# only one of them changes the number of feature columns. Fail here with a
# clear message rather than later inside an estimator's predict().
# NOTE: equal widths are necessary but not sufficient -- this does not prove
# that the columns line up one-to-one.
assert X_train.shape[1] == X_test.shape[1], (
    'Feature count mismatch: train has {} columns, test has {}'.format(
        X_train.shape[1], X_test.shape[1]
    )
)

### Tree-Based Models

In [70]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [74]:
# Fixed seed: these estimators draw random numbers while fitting, so without
# it the metrics below change on every run and cannot be reproduced.
RANDOM_STATE = 42

regressors = [
    DecisionTreeRegressor(random_state=RANDOM_STATE),
    RandomForestRegressor(random_state=RANDOM_STATE),
    ExtraTreesRegressor(random_state=RANDOM_STATE),
    AdaBoostRegressor(random_state=RANDOM_STATE),
    GradientBoostingRegressor(random_state=RANDOM_STATE),
]

# Maps estimator class name -> [RMSE, R^2], both measured on the test split.
model_metrics = {}
for reg in regressors:
    model_name = reg.__class__.__name__
    print('Started Training', model_name)
    trained_model = reg.fit(X_train, y_train)
    y_pred = trained_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    model_metrics[model_name] = [rmse, r2]
    print('Ended Training', model_name)

In [75]:
# Show the collected metrics: estimator class name -> [RMSE, R^2] on the test split.
model_metrics

{'DecisionTreeRegressor': [32.771920642592434, 0.6041226046886132],
 'RandomForestRegressor': [32.67310545889444, 0.6065063356788155],
 'ExtraTreesRegressor': [33.026025248685436, 0.5979597519786245],
 'AdaBoostRegressor': [42.09754582155006, 0.3467634571434871],
 'GradientBoostingRegressor': [34.22646368488119, 0.5682016540127037]}

### Voting Regressor

In [93]:
# (name, estimator) pairs for the ensemble: every regressor except the first
# entry of `regressors` (the single decision tree). Slicing already yields a
# new list, so `regressors` itself is not altered.
k = [(type(member).__name__, member) for member in regressors[1:]]

In [95]:
# Fit the voting ensemble on the (name, estimator) pairs in `k`, then report
# RMSE and R^2 on the test split.
vr = VotingRegressor(k).fit(X_train, y_train)
y_pred = vr.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(type(vr).__name__, ':', rmse, r2)

VotingRegressor : 29.34400767608154 0.6826081503469329


### LightGBM

In [101]:
from lightgbm import LGBMRegressor

# LightGBM with its default hyperparameters; report RMSE and R^2 on the test split.
lgbm = LGBMRegressor().fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(type(lgbm).__name__, ':', rmse, r2)

LGBMRegressor : 31.832869673619847 0.6264846128415488


### XGBoost

In [102]:
from xgboost import XGBRegressor

# XGBoost with its default hyperparameters; report RMSE and R^2 on the test split.
xgbm = XGBRegressor().fit(X_train, y_train)
y_pred = xgbm.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(type(xgbm).__name__, ':', rmse, r2)

XGBRegressor : 31.84472169871585 0.6262064263065128
