# Advanced Machine Learning Application - Assignment 2
### Rohan Rocky Britto - Student ID: 24610990

## Data Import and Preparation

Importing required packages

In [1]:
import pandas as pd
from joblib import dump, load
from sklearn.pipeline import Pipeline

Import the `evaluate_model` helper from the project's `src` folder; it prints the MAE and RMSE for the training and validation sets

In [2]:
import sys
sys.path.append('../../src')
from functions import evaluate_model

Read the training and validation files

In [3]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning that the
# default chunked parser raises for these files.
df_train = pd.read_csv('../../data/processed/train_processed.csv', low_memory=False)
df_validation = pd.read_csv('../../data/processed/validation_processed.csv', low_memory=False)

  df_train = pd.read_csv('../../data/processed/train_processed.csv')
  df_validation = pd.read_csv('../../data/processed/validation_processed.csv')


Storing the target values in a separate variable

In [4]:
# The regression target is the revenue of each sale record; pull it out of both
# splits in one go so the two variables are always built the same way.
train_target, validation_target = (
    frame['sale_revenue'] for frame in (df_train, df_validation)
)

Load the preprocessor pipeline

In [5]:
preprocessor_pipe = load('../../src/preprocessor_pipeline.joblib')

## Model Building

### Decision Tree

Build a decision tree model and evaluate its performance

In [6]:
from sklearn.tree import DecisionTreeRegressor

In [7]:
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the preprocessor. Pipeline keeps a
# reference to the step objects it is given, so passing `preprocessor_pipe`
# directly would make every pipeline built from it share one preprocessor, and
# fitting any of them would refit the preprocessor inside all the others.
dtr_pipe = Pipeline(
    steps=[
        ('preprocessor', clone(preprocessor_pipe)),
        ('dtr', DecisionTreeRegressor(random_state=8))
    ]
)

In [8]:
%%time
dtr_pipe.fit(df_train, train_target)

CPU times: total: 2min 57s
Wall time: 4min 31s


In [9]:
train_preds = dtr_pipe.predict(df_train)

In [10]:
validation_preds = dtr_pipe.predict(df_validation)

In [11]:
evaluate_model(train_target, train_preds, validation_target, validation_preds)

The Mean Absolute Error for training set is  3.391042623025225
The Mean Absolute Error for validation set is  6.88938695980446
The Root Mean Squared Error for training set is  7.974908065841837
The Root Mean Squared Error for validation set is  14.266072918946087


In [12]:
dump(dtr_pipe, '../../models/predictive/dtr_pipe.joblib', compress=3)

['../../models/predictive/dtr_pipe.joblib']

In [13]:
# Keep the decision tree predictions next to the actual values for later inspection.
# NOTE(review): df_train / df_validation are passed to fit/predict again for the
# next models, so this extra column travels with them. Confirm the preprocessor
# ignores columns it was not configured for.
for frame, preds in ((df_train, train_preds), (df_validation, validation_preds)):
    frame['dtr_preds'] = preds

### Adaboost

Build an AdaBoost model and evaluate its performance

In [14]:
from sklearn.ensemble import AdaBoostRegressor

In [15]:
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the preprocessor. Pipeline keeps a
# reference to the step objects it is given, so passing `preprocessor_pipe`
# directly would make every pipeline built from it share one preprocessor, and
# fitting any of them would refit the preprocessor inside all the others.
adb_pipe = Pipeline(
    steps=[
        ('preprocessor', clone(preprocessor_pipe)),
        ('adb', AdaBoostRegressor(random_state=8, n_estimators=100))
    ]
)

In [16]:
%%time
adb_pipe.fit(df_train, train_target)

CPU times: total: 4min 43s
Wall time: 9min 9s


In [17]:
train_preds = adb_pipe.predict(df_train)

In [18]:
validation_preds = adb_pipe.predict(df_validation)

In [19]:
evaluate_model(train_target, train_preds, validation_target, validation_preds)

The Mean Absolute Error for training set is  7.084637991693634
The Mean Absolute Error for validation set is  7.153253882854609
The Root Mean Squared Error for training set is  12.403508894278552
The Root Mean Squared Error for validation set is  13.298755768992525


In [20]:
dump(adb_pipe, '../../models/predictive/adb_pipe.joblib', compress=3)

['../../models/predictive/adb_pipe.joblib']

In [21]:
# Keep the AdaBoost predictions next to the actual values for later inspection.
# NOTE(review): df_train / df_validation are passed to fit/predict again for the
# random forest, so this extra column travels with them. Confirm the
# preprocessor ignores columns it was not configured for.
for frame, preds in ((df_train, train_preds), (df_validation, validation_preds)):
    frame['adb_preds'] = preds

### Random Forest

Build a random forest model and evaluate its performance

In [22]:
from sklearn.ensemble import RandomForestRegressor

In [23]:
from sklearn.base import clone

# Give this pipeline its own unfitted copy of the preprocessor. Pipeline keeps a
# reference to the step objects it is given, so passing `preprocessor_pipe`
# directly would make every pipeline built from it share one preprocessor, and
# fitting any of them would refit the preprocessor inside all the others.
rf_pipe = Pipeline(
    steps=[
        ('preprocessor', clone(preprocessor_pipe)),
        ('rf', RandomForestRegressor(random_state=8, n_estimators=25, max_depth=15, min_samples_leaf=3))
    ]
)

In [24]:
%%time
rf_pipe.fit(df_train, train_target)

CPU times: total: 4min 19s
Wall time: 16min 24s


In [25]:
train_preds = rf_pipe.predict(df_train)

In [26]:
validation_preds = rf_pipe.predict(df_validation)

In [27]:
evaluate_model(train_target, train_preds, validation_target, validation_preds)

The Mean Absolute Error for training set is  4.643331423752109
The Mean Absolute Error for validation set is  5.07869084682644
The Root Mean Squared Error for training set is  8.218036165037551
The Root Mean Squared Error for validation set is  10.353961009838791


In [28]:
dump(rf_pipe, '../../models/predictive/rf_pipe.joblib', compress=3)

['../../models/predictive/rf_pipe.joblib']

In [29]:
# Keep the random forest predictions next to the actual values so that all three
# models' predictions can be compared side by side in the sampled rows.
for frame, preds in ((df_train, train_preds), (df_validation, validation_preds)):
    frame['rf_preds'] = preds

Let us have a quick look at a few randomly sampled rows, with the predictions from all three models next to the actual `sale_revenue`, to help decide which one performs best

In [30]:
df_train.sample(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,items_sold,date,wm_yr_wk,event_name,event_type,sell_price,num_date,day_of_week,sale_revenue,dtr_preds,adb_preds,rf_preds
1421169,FOODS_2_289_TX_1_evaluation,FOODS_2_289,FOODS_2,FOODS,TX_1,TX,d_216,2,2011-09-01,11131,,,2.58,20110901,3,5.16,2.58,9.910228,5.710301
8287095,HOUSEHOLD_1_029_WI_3_evaluation,HOUSEHOLD_1_029,HOUSEHOLD_1,HOUSEHOLD,WI_3,WI,d_1007,2,2013-10-31,11340,Halloween,Cultural,6.48,20131031,3,12.96,6.48,17.304033,14.188916
714576,FOODS_3_328_TX_2_evaluation,FOODS_3_328,FOODS_3,FOODS,TX_2,TX,d_110,1,2011-05-18,11116,,,3.5,20110518,2,3.5,3.5,9.910228,6.04176
8768105,HOUSEHOLD_1_078_WI_2_evaluation,HOUSEHOLD_1_078,HOUSEHOLD_1,HOUSEHOLD,WI_2,WI,d_1056,1,2013-12-19,11347,,,2.84,20131219,3,2.84,10.41,9.910228,4.756225
2897194,HOUSEHOLD_2_437_CA_3_evaluation,HOUSEHOLD_2_437,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,d_417,4,2012-03-20,11208,,,7.67,20120320,1,30.68,17.94,17.304033,22.58393
8348546,FOODS_3_592_WI_1_evaluation,FOODS_3_592,FOODS_3,FOODS,WI_1,WI,d_1013,2,2013-11-06,11341,,,7.48,20131106,2,14.96,7.48,19.743875,16.043417
213964,FOODS_3_317_TX_2_evaluation,FOODS_3_317,FOODS_3,FOODS,TX_2,TX,d_34,3,2011-03-03,11105,,,1.18,20110303,3,3.54,3.54,8.840304,4.724436
6560377,HOUSEHOLD_1_177_TX_1_evaluation,HOUSEHOLD_1_177,HOUSEHOLD_1,HOUSEHOLD,TX_1,TX,d_835,2,2013-05-12,11316,Mother's day,Cultural,7.5,20130512,6,15.0,15.0,24.516894,17.980739
5485299,FOODS_2_074_CA_1_evaluation,FOODS_2_074,FOODS_2,FOODS,CA_1,CA,d_722,7,2013-01-19,11252,,,2.5,20130119,5,17.5,17.5,9.910228,8.754015
3929096,HOUSEHOLD_1_177_CA_4_evaluation,HOUSEHOLD_1_177,HOUSEHOLD_1,HOUSEHOLD,CA_4,CA,d_542,4,2012-07-23,11226,,,7.97,20120723,0,31.88,23.91,24.516894,15.377464


In [31]:
df_validation.sample(10)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,items_sold,date,wm_yr_wk,event_name,event_type,sell_price,num_date,day_of_week,sale_revenue,dtr_preds,adb_preds,rf_preds
345715,FOODS_3_676_WI_2_evaluation,FOODS_3_676,FOODS_3,FOODS,WI_2,WI,d_1106,1,2014-02-07,11401,,,7.48,20140207,4,7.48,7.78,17.304033,13.872792
2095017,FOODS_3_717_TX_3_evaluation,FOODS_3_717,FOODS_3,FOODS,TX_3,TX,d_1269,1,2014-07-20,11425,,,1.98,20140720,6,1.98,10.0,8.840304,3.016835
1001108,HOUSEHOLD_2_225_CA_3_evaluation,HOUSEHOLD_2_225,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,d_1169,5,2014-04-11,11410,,,0.97,20140411,4,4.85,0.97,9.910228,1.281313
4297035,FOODS_3_406_CA_2_evaluation,FOODS_3_406,FOODS_3,FOODS,CA_2,CA,d_1471,7,2015-02-07,11502,,,0.8,20150207,5,5.6,8.8,9.910228,6.798206
656147,FOODS_3_271_TX_1_evaluation,FOODS_3_271,FOODS_3,FOODS,TX_1,TX,d_1136,1,2014-03-09,11406,,,4.48,20140309,6,4.48,8.96,11.869787,9.475766
63456,FOODS_3_144_TX_1_evaluation,FOODS_3_144,FOODS_3,FOODS,TX_1,TX,d_1078,1,2014-01-10,11350,,,1.88,20140110,4,1.88,1.88,8.840304,3.91621
2263428,FOODS_3_110_TX_1_evaluation,FOODS_3_110,FOODS_3,FOODS,TX_1,TX,d_1285,2,2014-08-05,11427,,,9.78,20140805,1,19.56,8.96,12.052176,14.537419
1739089,FOODS_3_441_WI_1_evaluation,FOODS_3_441,FOODS_3,FOODS,WI_1,WI,d_1236,6,2014-06-17,11420,,,6.48,20140617,1,38.88,6.48,33.879565,22.867141
2969802,FOODS_3_011_CA_2_evaluation,FOODS_3_011,FOODS_3,FOODS,CA_2,CA,d_1349,2,2014-10-08,11436,,,1.98,20141008,2,3.96,1.98,8.840304,4.670149
1091024,FOODS_3_785_TX_1_evaluation,FOODS_3_785,FOODS_3,FOODS,TX_1,TX,d_1177,4,2014-04-19,11412,,,3.0,20140419,5,12.0,13.5,24.516894,18.542888


## Evaluate the models on the test data

Fetch and preprocess test data

In [32]:
def fetch_test_data(raw_dir='../../data/raw'):
    """Build the long-format test set with calendar, event, price and revenue columns.

    Reads the raw sales, calendar, event and weekly sell-price CSV files from
    ``raw_dir``, attaches the item/store key columns of the training file to the
    wide test sales table, unpivots it to one row per item, store and day, drops
    the rows where nothing was sold, and joins calendar, event and price
    information onto every remaining row.

    Parameters
    ----------
    raw_dir : str or path-like, default '../../data/raw'
        Directory containing ``sales_train.csv``, ``sales_test.csv``,
        ``calendar.csv``, ``calendar_events.csv`` and
        ``items_weekly_sell_prices.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per item, store and day with at least one item sold. On top of
        the key columns it carries ``d``, ``items_sold``, the calendar and event
        columns, ``sell_price``, ``num_date`` (date as a YYYYMMDD integer),
        ``day_of_week`` (Monday=0) and ``sale_revenue``
        (``items_sold * sell_price``).
    """
    from pathlib import Path

    raw_dir = Path(raw_dir)
    key_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # Read files. Only the key columns of the training file are used, so skip
    # parsing its daily sales columns; re-index with key_cols because `usecols`
    # returns columns in file order, not in the order requested.
    df_keys = pd.read_csv(raw_dir / 'sales_train.csv', usecols=key_cols)[key_cols]
    df_test = pd.read_csv(raw_dir / 'sales_test.csv')
    df_calendar = pd.read_csv(raw_dir / 'calendar.csv')
    df_events = pd.read_csv(raw_dir / 'calendar_events.csv')
    df_sell_prices = pd.read_csv(raw_dir / 'items_weekly_sell_prices.csv')

    # Concat the key fields from the training dataset to the test data. Both
    # frames come straight from read_csv, so they are aligned on the default
    # row index.
    df_test = pd.concat([df_keys, df_test], axis=1)
    # Unpivot data: one row per (item, store, day)
    df_test = df_test.melt(id_vars=key_cols, var_name='d', value_name='items_sold')
    # Keep only the days on which the item actually sold
    df_test = df_test[df_test['items_sold'] != 0].reset_index(drop=True)
    # Join dates with events; days without an event get the literal string 'None'.
    # NOTE(review): depending on the pandas version, read_csv may parse the text
    # 'None' back as NaN, so the CSV-loaded training frame may hold NaN here
    # while this in-memory frame holds 'None' - confirm the preprocessor treats
    # both the same way.
    df_calendar = df_calendar.join(df_events.set_index('date'), on='date').fillna('None')
    df_test = df_test.join(df_calendar.set_index('d'), on='d')
    # Join weekly sell prices with the items in the store
    price_keys = ['store_id', 'item_id', 'wm_yr_wk']
    df_test = df_test.join(df_sell_prices.set_index(price_keys), on=price_keys)
    # Fetch date features
    df_test['date'] = pd.to_datetime(df_test['date'])
    # Store num_date as an integer rather than a string, so the in-memory test
    # frame has the dtype the column gets when a processed CSV is read back.
    # Assumes the training CSV's num_date parses as an integer - TODO confirm.
    df_test['num_date'] = df_test['date'].dt.strftime('%Y%m%d').astype(int)
    df_test['day_of_week'] = df_test['date'].dt.dayofweek
    df_test['sale_revenue'] = df_test['items_sold'] * df_test['sell_price']

    return df_test


In [33]:
df_test = fetch_test_data()

In [34]:
df_test.to_csv('../../data/processed/test_processed.csv', index=False)

In [35]:
df_test.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,items_sold,date,wm_yr_wk,event_name,event_type,sell_price,num_date,day_of_week,sale_revenue
0,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1542,4,2015-04-19,11512,,,4.64,20150419,6,18.56
1,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1542,3,2015-04-19,11512,,,2.88,20150419,6,8.64
2,HOBBIES_1_006_CA_1_evaluation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,d_1542,2,2015-04-19,11512,,,1.0,20150419,6,2.0
3,HOBBIES_1_007_CA_1_evaluation,HOBBIES_1_007,HOBBIES_1,HOBBIES,CA_1,CA,d_1542,1,2015-04-19,11512,,,7.88,20150419,6,7.88
4,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_1542,4,2015-04-19,11512,,,0.48,20150419,6,1.92


Store the target values in a separate variable

In [36]:
test_target = df_test['sale_revenue']

Use each model to predict on the test dataframe

In [37]:
test_preds_dtr = dtr_pipe.predict(df_test)



In [38]:
test_preds_adb = adb_pipe.predict(df_test)



In [39]:
test_preds_rf = rf_pipe.predict(df_test)



In [40]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Report MAE and RMSE on the test set for every model, in the order they were built.
test_predictions = {
    'Decision Tree': test_preds_dtr,
    'Adaboost': test_preds_adb,
    'Random Forest': test_preds_rf,
}

for model_name, model_preds in test_predictions.items():
    mae = mean_absolute_error(test_target, model_preds)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE gives the RMSE on every version.
    rmse = mean_squared_error(test_target, model_preds) ** 0.5
    print(f'The Mean Absolute Error for {model_name} on testing set is ', mae)
    print(f'The Root Mean Squared Error for {model_name} on testing set is ', rmse)
    # Blank line between models keeps the captured output easy to scan.
    if model_name != 'Random Forest':
        print(end='')

The Mean Absolute Error for Decision Tree on testing set is  7.2873548439964235
The Root Mean Squared Error for Decision Tree on testing set is  16.34762653997853
The Mean Absolute Error for Adaboost on testing set is  7.288600171923203
The Root Mean Squared Error for Adaboost on testing set is  14.614817292419465
The Mean Absolute Error for Random Forest on testing set is  5.395832585166629
The Root Mean Squared Error for Random Forest on testing set is  12.853976589538565


**Conclusion:** Looking at the mean absolute error, root mean squared error and some sample predictions, Random Forest performs best of the three models: it has the lowest MAE and RMSE on both the validation set (MAE 5.08, RMSE 10.35) and the test set (MAE 5.40, RMSE 12.85). We will therefore use it to make predictions.