In [3]:
import numpy as np
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [5]:
# Read the three input files: labelled training rows, unlabelled test rows
# and the submission template.
train, xtest, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Features are every column except the sales column; the modelling target is
# sales divided by the item's MRP.
xtrain = train.drop(columns = 'Item_Outlet_Sales')
ytrain = (train['Item_Outlet_Sales'] / train['Item_MRP']).values

print('Train shape', train.shape)
print('Test shape', xtest.shape)


Train shape (8523, 12)
Test shape (5681, 11)


In [6]:
def target_encoding(train, test, target, cat_features):
    """Add target statistics per category level to ``train`` and ``test``.

    For every column ``col`` in ``cat_features`` five columns are created:
    ``col_min_sales``, ``col_max_sales``, ``col_mean_sales``,
    ``col_25perc_sales`` and ``col_75perc_sales``.

    * For ``train`` the statistics are out-of-fold: the rows of each of the
      5 shuffled folds receive values computed from the other 4 folds only.
    * For ``test`` the statistics are computed from all of ``train``.

    Category levels that are absent from the rows used to compute a statistic
    map to NaN.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features. Modified in place (new columns are appended).
    test : pandas.DataFrame
        Test features. Modified in place (new columns are appended).
    target : array-like with a ``copy`` method
        Target values, one per row of ``train``.
    cat_features : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    tuple
        ``(train, test)`` -- the same objects that were passed in.
    """
    # (column suffix, reducer applied to a grouped target) in output order.
    stat_funcs = (
        ('_min_sales', lambda grouped: grouped.min()),
        ('_max_sales', lambda grouped: grouped.max()),
        ('_mean_sales', lambda grouped: grouped.mean()),
        ('_25perc_sales', lambda grouped: grouped.quantile(.25)),
        ('_75perc_sales', lambda grouped: grouped.quantile(.75)),
    )

    # Temporary column so the target can be grouped together with the features.
    train['target'] = target.copy()
    kf = KFold(n_splits = 5, shuffle = True, random_state = 21)

    # One NaN-initialised buffer per new column, filled fold by fold.
    # KFold yields *positional* indices, so the buffers are indexed by
    # position; this stays correct when train.index is not 0..n-1, whereas
    # label-based `.loc[val_index, ...]` would not.
    oof = {col + suffix: np.full(len(train), np.nan)
           for col in cat_features
           for suffix, _ in stat_funcs}

    for train_index, val_index in kf.split(train):
        train_fold, val_fold = train.iloc[train_index], train.iloc[val_index]

        for col in cat_features:
            # Build the groupby once per fold/column and reuse it for all stats.
            fold_target = train_fold.groupby(col)['target']
            for suffix, stat in stat_funcs:
                oof[col + suffix][val_index] = val_fold[col].map(stat(fold_target)).to_numpy()

    for new_col, values in oof.items():
        train[new_col] = values

    # Test rows use statistics computed from the full training set.
    for col in cat_features:
        full_target = train.groupby(col)['target']
        for suffix, stat in stat_funcs:
            test[col + suffix] = test[col].map(stat(full_target))

    # Remove the temporary column before handing the frame back.
    train.drop('target', axis = 1, inplace=True)
    return train, test

In [7]:
print('data cleaning')
# Collapse the spelling variants of the fat-content label into two values.
mapping = {'Low Fat' : 'Low Fat',
           'LF' : 'Low Fat',
           'low fat' : 'Low Fat',
           'Regular' : 'Regular',
           'reg' : 'Regular'}

for frame in (xtrain, xtest):
    frame['Item_Fat_Content'] = frame['Item_Fat_Content'].map(mapping)

# Missing value impute
print('Impute missing value')
# Computed once from the training rows and reused for both frames, so the
# test set is filled with a training-only statistic.
item_weight_mean = xtrain['Item_Weight'].mean()

for frame in (xtrain, xtest):
    frame['Item_Weight'] = frame['Item_Weight'].fillna(item_weight_mean)
    frame['Outlet_Size'] = frame['Outlet_Size'].fillna('unknown')

# Split Item_Identifier into three new columns: the first two characters,
# the third character, and everything after it.
for frame in (xtrain, xtest):
    frame['item_identifier_1'] = frame['Item_Identifier'].str[:2]
    frame['item_identifier_2'] = frame['Item_Identifier'].str[2:3]
    frame['item_identifier_3'] = frame['Item_Identifier'].str[3:]


data cleaning
Impute missing value


In [8]:
# Only the outlet identifier receives target-statistic features.
cat_features = ['Outlet_Identifier']
xtrain, xtest = target_encoding(
    train = xtrain,
    test = xtest,
    target = ytrain,
    cat_features = cat_features,
)

In [9]:
# Columns that are converted to integer codes by the label-encoding step.
lbl_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Outlet_Establishment_Year',
    'item_identifier_1',
    'item_identifier_2',
    'item_identifier_3',
]

In [10]:
# Integer-encode every column in lbl_cols. Each encoder is fitted on the
# combined train + test values so both frames share one code table and no
# test value is unknown at transform time.
for col in lbl_cols:
    le = LabelEncoder()
    le.fit(xtrain[col].values.tolist() + xtest[col].values.tolist())
    # Plain column assignment replaces the column, so it takes the integer
    # dtype of the codes. `frame.loc[:, col] = codes` writes into the existing
    # column on recent pandas versions, which leaves string columns as
    # object dtype.
    xtrain[col] = le.transform(xtrain[col].values.tolist())
    xtest[col] = le.transform(xtest[col].values.tolist())


In [12]:
print('Drop unused columns')
# These columns are removed from both feature frames before modelling.
drop_cols = ['Item_Identifier','Outlet_Identifier','Item_Weight','Item_Visibility']
for frame in (xtrain, xtest):
    frame.drop(columns = drop_cols, inplace = True)


Drop unused columns


In [15]:
# bagged model

# Hyperparameters used below. No cell in this notebook defines them, so a
# fresh "Restart & Run All" failed with NameError at this point.
# NOTE(review): the original values are not recoverable from the notebook.
# These are placeholders -- scikit-learn's defaults for n_estimators and
# max_depth, and the seed already used for the KFold split. Tune as needed.
trees = 100
depth = None
seed = 21

model = RandomForestRegressor(
                              n_estimators = trees,
                              max_depth = depth,
                              n_jobs = -1,
                              max_features = 0.8,
                              random_state = seed
                             )
# ytrain is sales divided by Item_MRP, so predictions are on that ratio scale.
model.fit(xtrain, ytrain)
test_pred = model.predict(xtest)



In [16]:
# The model predicts sales divided by Item_MRP, so multiply by Item_MRP to
# get sales back. `test_pred` is the prediction array produced by the model
# cell; the previously referenced `bagged_test_pred` is not defined anywhere.
# The product is taken on plain arrays so rows are matched by position and a
# row-count mismatch with the submission template raises instead of
# producing NaN through index alignment.
sub['Item_Outlet_Sales'] = (test_pred * xtest['Item_MRP'].to_numpy())
sub.to_csv('bagged_rf_final1.csv', index = False)

In [17]:
# Display the training feature frame as it stands after encoding and the
# column drops (bare last expression -> rich DataFrame output).
xtrain

Unnamed: 0,Item_Fat_Content,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,item_identifier_1,item_identifier_2,item_identifier_3,Outlet_Identifier_min_sales,Outlet_Identifier_max_sales,Outlet_Identifier_mean_sales,Outlet_Identifier_25perc_sales,Outlet_Identifier_75perc_sales
0,0,4,249.8092,4,1,0,1,1,0,14,1.986383,45.287406,16.671270,11.139082,21.787185
1,1,14,48.2692,8,1,2,2,0,2,0,1.938413,38.346433,13.797269,9.071936,17.856234
2,0,10,141.6180,4,1,0,1,1,13,14,1.986383,45.287406,16.609480,11.807123,21.147440
3,1,6,182.0950,3,3,2,0,1,23,6,0.964288,7.760892,2.392195,1.012140,3.004921
4,0,9,53.8614,1,0,2,1,2,3,18,1.953669,42.957726,15.870822,11.033828,20.169713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,0,13,214.5218,1,0,2,1,1,5,21,1.953669,49.159356,16.091263,11.028217,20.800337
8519,1,0,108.1570,5,3,1,1,1,18,35,1.979496,39.893479,15.662639,10.691699,19.986370
8520,0,8,85.1224,6,2,1,1,2,9,28,1.969270,44.000000,16.962392,11.871228,21.799540
8521,1,13,103.1332,8,1,2,2,1,13,45,1.938413,38.346433,13.764208,9.087345,17.859242
