# M5 Forecasting - Rolling Prediction

Xiao Song

<https://xsong.ltd/en>     
[Kaggle profile](https://www.kaggle.com/rikdifos/)

This notebook is forked from [M5 First Public Notebook Under 0.50](https://www.kaggle.com/kneroma/m5-first-public-notebook-under-0-50), thanks to the author [kkiller](https://www.kaggle.com/kneroma)!

I modified it in several places to make it more general for predicting the test set.

This notebook will output a 'submission4.csv' file.


In [1]:
import warnings
warnings.filterwarnings('ignore')  # set first so warnings raised while importing the libraries below are hidden too

import gc
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import lightgbm as lgb

# pandas must be imported before its display options can be set
# (calling pd.set_option above the import raises NameError on a fresh kernel).
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [2]:
# Column dtypes passed to pd.read_csv when loading calendar.csv.
CAL_DTYPES = {
    "event_name_1": "category",
    "event_name_2": "category",
    "event_type_1": "category",
    "event_type_2": "category",
    "weekday": "category",
    "wm_yr_wk": "int16",
    "wday": "int16",
    "month": "int16",
    "year": "int16",
    "snap_CA": "float32",
    "snap_TX": "float32",
    "snap_WI": "float32",
}

# Column dtypes passed to pd.read_csv when loading sell_prices.csv.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [4]:
# Presumably the forecast horizon in days (the submission has columns F1..F28) — confirm.
h = 28
# Days of history kept before each predicted day; create_dt(is_train=False) loads
# d_(tr_last - max_lags) onwards and the prediction loop slices the same window.
max_lags = 57
# Index of the last d_<n> column read from sales_train_evaluation.csv.
tr_last = 1941
# First day to predict; the prediction loop walks forward from here one day at a time.
fday = datetime(year=2016, month=5, day=23)
fday

datetime.datetime(2016, 5, 23, 0, 0)

In [5]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that covers their range.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    touched; every other column (object, datetime, int8, ...) is left as is.

    * Integer columns become the first of int8, int16, int32, int64 whose
      [min, max] range contains the column's minimum and maximum.
    * Float columns become float16 or float32 on the same range test, otherwise
      float64 (this is also what an all-NaN column gets, since comparisons
      against NaN are False). Only the *range* is tested, so values may be
      rounded when a float column is narrowed.

    The range test is inclusive: a value exactly equal to a dtype's limit
    (e.g. 127 or -128 for int8) still fits that dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.
    verbose : bool, default True
        Print the resulting size in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # No fallback for ints: if nothing matches (e.g. empty column -> NaN min/max) keep the dtype.
            limits, candidates, fallback = np.iinfo, (np.int8, np.int16, np.int32, np.int64), None
        else:
            limits, candidates, fallback = np.finfo, (np.float16, np.float32), np.float64
        target = fallback
        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive comparison: the limits themselves are representable.
            if c_min >= bounds.min and c_max <= bounds.max:
                target = candidate
                break
        if target is not None:
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a zero-sized frame cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [6]:
def _encode_categories(frame, cols):
    """Replace each categorical column in ``cols`` by zero-based int16 codes, in place."""
    for col in cols:
        codes = frame[col].cat.codes.astype("int16")
        # Missing values have code -1; subtracting the minimum shifts them to 0.
        frame[col] = codes - codes.min()


def create_dt(is_train = True, nrows = None, first_day = 1200):
    """Build the long-format sales table joined with calendar and price data.

    Reads sell_prices.csv, calendar.csv and sales_train_evaluation.csv, turns
    every categorical column into integer codes, melts the wide ``d_<n>`` sales
    columns into one row per (id, d), merges the calendar on ``d`` and the
    prices on (store_id, item_id, wm_yr_wk) using pandas' default merge, and
    finally downcasts the result with ``reduce_mem_usage``.

    Parameters
    ----------
    is_train : bool, default True
        True: load days ``max(1, first_day)`` .. ``tr_last``.
        False: load days ``max(tr_last - max_lags, first_day)`` .. ``tr_last``
        and append ``h`` future days whose ``sales`` are NaN.
    nrows : int or None
        Forwarded to ``pd.read_csv`` for the sales file only.
        NOTE(review): category codes are derived per file; with ``nrows`` set
        the sales file may contain fewer item/store categories than
        sell_prices.csv, so the codes used as merge keys may no longer line
        up — verify before relying on ``nrows``.
    first_day : int, default 1200
        Lower bound for the first ``d_<n>`` column that is read.

    Returns
    -------
    pandas.DataFrame
        One row per (id, day) with sales, calendar and price columns.
    """
    prices = pd.read_csv("../input/m5-forecasting-accuracy/sell_prices.csv", dtype = PRICE_DTYPES)
    _encode_categories(prices, [col for col, col_dtype in PRICE_DTYPES.items() if col_dtype == "category"])

    cal = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    _encode_categories(cal, [col for col, col_dtype in CAL_DTYPES.items() if col_dtype == "category"])

    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    # "id" stays a plain string column; it is the series key, not a feature.
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv("../input/m5-forecasting-accuracy/sales_train_evaluation.csv",
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    _encode_categories(dt, [col for col in catcols if col != "id"])

    if not is_train:
        # Placeholder columns for the days to forecast; the horizon comes from
        # the notebook-level constant ``h`` rather than a hard-coded 28.
        for day in range(tr_last+1, tr_last + h + 1):
            dt[f"d_{day}"] = np.nan

    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    dt = reduce_mem_usage(dt)
    return dt

In [7]:
def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    Columns added (or re-cast) on ``dt``:

    * ``lag_7``, ``lag_28`` — ``sales`` shifted by 7 / 28 rows within each ``id``.
    * ``rmean_<lag>_<win>`` — rolling mean over ``win`` rows (7 and 28) of the
      matching lag column, again per ``id``.
    * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday`` — cast to
      int16 when the column already exists, otherwise derived from ``date``.

    Shifts and rolling windows follow the current row order inside each
    ``id``, so rows are expected to be in chronological order per ``id``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain ``id``, ``sales`` and a datetime ``date`` column.

    Returns
    -------
    None
        ``dt`` is modified in place.
    """
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x: x.rolling(win).mean())

    # feature column name -> attribute of the ``.dt`` accessor used to derive it
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear" and hasattr(dt["date"].dt, "isocalendar"):
            # ``Series.dt.weekofyear`` is gone in pandas 2.x; isocalendar().week
            # yields the same ISO week number on every pandas that provides it.
            dt[date_feat_name] = dt["date"].dt.isocalendar().week.astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [8]:
FIRST_DAY = 350 # First d_<n> day loaded for training (passed to create_dt as first_day). Set it to 1 to load the whole history --> high risk of running out of memory!

In [9]:
%%time

df = create_dt(is_train=True, first_day= FIRST_DAY)
df = reduce_mem_usage(df)
df.shape

Mem. usage decreased to 2339.12 Mb (26.2% reduction)
Mem. usage decreased to 2339.12 Mb (0.0% reduction)
CPU times: user 47.6 s, sys: 14.7 s, total: 1min 2s
Wall time: 1min 2s


(41571939, 22)

In [10]:
df

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.970703
1,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.339844
2,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.480469
3,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.500000
4,HOBBIES_1_009_CA_1_evaluation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.769531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41571934,FOODS_3_825_WI_3_evaluation,3046,6,9,2,2,d_1941,2.0,2016-05-22,11617,3,2,5,2016,0,0,0,0,0.0,0.0,0.0,3.980469
41571935,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1940,1.0,2016-05-21,11617,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.280273
41571936,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1941,0.0,2016-05-22,11617,3,2,5,2016,0,0,0,0,0.0,0.0,0.0,1.280273
41571937,FOODS_3_827_WI_3_evaluation,3048,6,9,2,2,d_1940,5.0,2016-05-21,11617,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.000000


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41571939 entries, 0 to 41571938
Data columns (total 22 columns):
id              object
item_id         int16
dept_id         int8
store_id        int8
cat_id          int8
state_id        int8
d               object
sales           float16
date            datetime64[ns]
wm_yr_wk        int16
weekday         int8
wday            int8
month           int8
year            int16
event_name_1    int8
event_type_1    int8
event_name_2    int8
event_type_2    int8
snap_CA         float16
snap_TX         float16
snap_WI         float16
sell_price      float16
dtypes: datetime64[ns](1), float16(5), int16(3), int8(11), object(2)
memory usage: 2.3+ GB


In [12]:
%%time

create_fea(df)
df.shape

CPU times: user 3min 31s, sys: 14.9 s, total: 3min 45s
Wall time: 3min 45s


(41571939, 31)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41571939 entries, 0 to 41571938
Data columns (total 31 columns):
id              object
item_id         int16
dept_id         int8
store_id        int8
cat_id          int8
state_id        int8
d               object
sales           float16
date            datetime64[ns]
wm_yr_wk        int16
weekday         int8
wday            int16
month           int16
year            int16
event_name_1    int8
event_type_1    int8
event_name_2    int8
event_type_2    int8
snap_CA         float16
snap_TX         float16
snap_WI         float16
sell_price      float16
lag_7           float16
lag_28          float16
rmean_7_7       float16
rmean_28_7      float16
rmean_7_28      float16
rmean_28_28     float16
week            int16
quarter         int16
mday            int16
dtypes: datetime64[ns](1), float16(11), int16(8), int8(9), object(2)
memory usage: 3.1+ GB


In [14]:
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.970703,,,,,,,2,1,13
1,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.339844,,,,,,,2,1,13
2,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.480469,,,,,,,2,1,13
3,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.5,,,,,,,2,1,13
4,HOBBIES_1_009_CA_1_evaluation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.769531,,,,,,,2,1,13


In [15]:
#df.dropna(inplace = True)
df.shape

(41571939, 31)

In [16]:
df

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_002_CA_1_evaluation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.970703,,,,,,,2,1,13
1,HOBBIES_1_004_CA_1_evaluation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.339844,,,,,,,2,1,13
2,HOBBIES_1_005_CA_1_evaluation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.480469,,,,,,,2,1,13
3,HOBBIES_1_008_CA_1_evaluation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.500000,,,,,,,2,1,13
4,HOBBIES_1_009_CA_1_evaluation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.769531,,,,,,,2,1,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41571934,FOODS_3_825_WI_3_evaluation,3046,6,9,2,2,d_1941,2.0,2016-05-22,11617,3,2,5,2016,0,0,0,0,0.0,0.0,0.0,3.980469,1.0,0.0,1.142578,0.571289,0.714355,0.893066,20,2,22
41571935,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1940,1.0,2016-05-21,11617,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.280273,1.0,1.0,1.000000,0.856934,1.142578,0.964355,20,2,21
41571936,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1941,0.0,2016-05-22,11617,3,2,5,2016,0,0,0,0,0.0,0.0,0.0,1.280273,1.0,3.0,1.000000,1.142578,1.142578,0.928711,20,2,22
41571937,FOODS_3_827_WI_3_evaluation,3048,6,9,2,2,d_1940,5.0,2016-05-21,11617,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.000000,2.0,0.0,0.714355,0.000000,0.571289,1.250000,20,2,21


In [17]:
# Integer-coded columns that are declared to LightGBM as categorical features.
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
# Columns excluded from the model inputs (series key, target, raw calendar keys).
useless_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]
train_cols = df.columns[~ df.columns.isin(useless_cols)]  # columns for training.

# Training rows: everything up to and including 2016-04-24.
x_train = df.loc[df['date'] <= '2016-04-24']
y_train = x_train['sales']

# Validation rows: 2016-04-25 through 2016-05-22.
x_val = df.loc[(df['date'] > '2016-04-24') & (df['date'] <= '2016-05-22')]
y_val = x_val['sales']

# Keep only the feature columns, then release the full frame.
x_train, x_val = x_train[train_cols], x_val[train_cols]
del df

In [18]:
x_train

Unnamed: 0,item_id,dept_id,store_id,cat_id,state_id,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
0,1,0,0,0,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.970703,,,,,,,2,1,13
1,3,0,0,0,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.339844,,,,,,,2,1,13
2,4,0,0,0,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.480469,,,,,,,2,1,13
3,7,0,0,0,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.500000,,,,,,,2,1,13
4,8,0,0,0,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.769531,,,,,,,2,1,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40870649,3046,6,9,2,2,2,4,2016,0,0,0,0,0.0,0.0,0.0,3.980469,0.0,1.0,1.000000,0.714355,0.928711,1.250000,16,2,24
40870655,3047,6,9,2,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.280273,0.0,2.0,0.856934,1.142578,1.036133,1.107422,16,2,23
40870656,3047,6,9,2,2,2,4,2016,0,0,0,0,0.0,0.0,0.0,1.280273,1.0,4.0,0.714355,1.571289,1.036133,1.250000,16,2,24
40870662,3048,6,9,2,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.000000,0.0,0.0,0.000000,2.285156,1.821289,1.786133,16,2,23


In [19]:
x_val

Unnamed: 0,item_id,dept_id,store_id,cat_id,state_id,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,mday
40657241,0,0,0,0,0,3,4,2016,0,0,0,0,0.0,0.0,0.0,8.382812,1.0,1.0,0.856934,1.142578,0.964355,1.036133,17,2,25
40657242,0,0,0,0,0,4,4,2016,0,0,0,0,0.0,0.0,0.0,8.382812,1.0,0.0,1.000000,0.856934,0.928711,0.964355,17,2,26
40657243,0,0,0,0,0,5,4,2016,0,0,0,0,0.0,0.0,0.0,8.382812,1.0,0.0,1.142578,0.571289,0.893066,0.964355,17,2,27
40657244,0,0,0,0,0,6,4,2016,0,0,0,0,0.0,0.0,0.0,8.382812,3.0,0.0,1.428711,0.571289,1.000000,0.928711,17,2,28
40657245,0,0,0,0,0,7,4,2016,0,0,0,0,0.0,0.0,0.0,8.382812,0.0,0.0,1.286133,0.428467,0.964355,0.893066,17,2,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41571934,3046,6,9,2,2,2,5,2016,0,0,0,0,0.0,0.0,0.0,3.980469,1.0,0.0,1.142578,0.571289,0.714355,0.893066,20,2,22
41571935,3047,6,9,2,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.280273,1.0,1.0,1.000000,0.856934,1.142578,0.964355,20,2,21
41571936,3047,6,9,2,2,2,5,2016,0,0,0,0,0.0,0.0,0.0,1.280273,1.0,3.0,1.000000,1.142578,1.142578,0.928711,20,2,22
41571937,3048,6,9,2,2,1,5,2016,0,0,0,0,0.0,0.0,0.0,1.000000,2.0,0.0,0.714355,0.000000,0.571289,1.250000,20,2,21


In [None]:
'''
train_data = lgb.Dataset(X_train, label = y_train, categorical_feature=cat_feats, free_raw_data=False)
fake_valid_inds = np.random.choice(len(X_train), 1000000, replace = False)
fake_valid_data = lgb.Dataset(X_train.iloc[fake_valid_inds], label = y_train.iloc[fake_valid_inds],         categorical_feature=cat_feats,free_raw_data=False)
# This is just a subsample of the training set, not a real validation set !
'''

In [21]:
%%time
train_set = lgb.Dataset(x_train, y_train, categorical_feature = cat_feats)
val_set = lgb.Dataset(x_val, y_val, categorical_feature = cat_feats)

CPU times: user 33 µs, sys: 4 µs, total: 37 µs
Wall time: 40.5 µs


In [22]:
del x_train, y_train, x_val, y_val
gc.collect()

0

In [23]:
# LightGBM training parameters.
params = {
    # objective / evaluation
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'poisson',
    'seed': 225,
    'learning_rate': 0.02,

    # regularisation
    'lambda': 0.4,     # L2 regularisation
    'reg_alpha': 0.4,  # L1 regularisation

    # tree shape
    'max_depth': 5,    # maximum tree depth (note that LightGBM grows trees leaf-wise)
    'num_leaves': 64,  # number of leaves per tree

    # sub-sampling
    'bagging_fraction': 0.7,  # row (data) sampling
    'bagging_freq' : 1,
    'colsample_bytree': 0.7,  # column (feature) sampling
}

In [24]:
%%time

# Train for at most 2500 boosting rounds, evaluating on val_set and logging every 20 rounds;
# training stops early once the validation score has not improved for 50 rounds.
# NOTE(review): newer LightGBM releases appear to expect early stopping / logging as callbacks
# instead of these keyword arguments — confirm against the installed version before upgrading.
m_lgb = lgb.train(params, train_set, num_boost_round = 2500 , early_stopping_rounds = 50, valid_sets = [val_set], verbose_eval = 20) 



Training until validation scores don't improve for 50 rounds
[20]	valid_0's rmse: 3.21894
[40]	valid_0's rmse: 2.90438
[60]	valid_0's rmse: 2.6708
[80]	valid_0's rmse: 2.5001
[100]	valid_0's rmse: 2.37751
[120]	valid_0's rmse: 2.28942
[140]	valid_0's rmse: 2.22704
[160]	valid_0's rmse: 2.18357
[180]	valid_0's rmse: 2.15244
[200]	valid_0's rmse: 2.12998
[220]	valid_0's rmse: 2.11422
[240]	valid_0's rmse: 2.10269
[260]	valid_0's rmse: 2.09416
[280]	valid_0's rmse: 2.08749
[300]	valid_0's rmse: 2.08278
[320]	valid_0's rmse: 2.07919
[340]	valid_0's rmse: 2.07641
[360]	valid_0's rmse: 2.07441
[380]	valid_0's rmse: 2.07233
[400]	valid_0's rmse: 2.07049
[420]	valid_0's rmse: 2.06896
[440]	valid_0's rmse: 2.06753
[460]	valid_0's rmse: 2.06645
[480]	valid_0's rmse: 2.06539
[500]	valid_0's rmse: 2.06471
[520]	valid_0's rmse: 2.0638
[540]	valid_0's rmse: 2.0631
[560]	valid_0's rmse: 2.06218
[580]	valid_0's rmse: 2.0611
[600]	valid_0's rmse: 2.06044
[620]	valid_0's rmse: 2.05989
[640]	valid_0's rm

In [25]:
#m_lgb.save_model("model.lgb")

<lightgbm.basic.Booster at 0x7f3dd4445080>

In [26]:
%%time

alphas = [1.028, 1.023, 1.018]
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    te = create_dt(False)
    cols = [f"F{i}" for i in range(1,29)]

    for tdelta in range(0, 28):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te.loc[te.date == day, "sales"] = alpha * m_lgb.predict(tst) # magic multiplier by kyakovlev



    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()
#     te_sub.loc[te.date >= fday+ timedelta(days=h), "id"] = te_sub.loc[te.date >= fday+timedelta(days=h), 
#                                                                           "id"].str.replace("validation$", "evaluation")
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)


sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("evaluation", "validation")
sub = pd.concat([sub, sub2], axis = 0, sort = False)
sub.to_csv("submission4.csv",index=False)

Mem. usage decreased to 147.54 Mb (29.8% reduction)
0 2016-05-23 00:00:00
1 2016-05-24 00:00:00
2 2016-05-25 00:00:00
3 2016-05-26 00:00:00
4 2016-05-27 00:00:00
5 2016-05-28 00:00:00
6 2016-05-29 00:00:00
7 2016-05-30 00:00:00
8 2016-05-31 00:00:00
9 2016-06-01 00:00:00
10 2016-06-02 00:00:00
11 2016-06-03 00:00:00
12 2016-06-04 00:00:00
13 2016-06-05 00:00:00
14 2016-06-06 00:00:00
15 2016-06-07 00:00:00
16 2016-06-08 00:00:00
17 2016-06-09 00:00:00
18 2016-06-10 00:00:00
19 2016-06-11 00:00:00
20 2016-06-12 00:00:00
21 2016-06-13 00:00:00
22 2016-06-14 00:00:00
23 2016-06-15 00:00:00
24 2016-06-16 00:00:00
25 2016-06-17 00:00:00
26 2016-06-18 00:00:00
27 2016-06-19 00:00:00
0 1.028 0.3333333333333333
Mem. usage decreased to 147.54 Mb (29.8% reduction)
0 2016-05-23 00:00:00
1 2016-05-24 00:00:00
2 2016-05-25 00:00:00
3 2016-05-26 00:00:00
4 2016-05-27 00:00:00
5 2016-05-28 00:00:00
6 2016-05-29 00:00:00
7 2016-05-30 00:00:00
8 2016-05-31 00:00:00
9 2016-06-01 00:00:00
10 2016-06-02 0

In [27]:
sub.head(10)

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_evaluation,0.875,0.797852,0.774414,0.830566,0.837402,0.963867,0.893555,0.813477,0.787598,0.805176,0.842773,1.007812,1.146484,1.128906,0.887695,0.76416,0.733398,0.758301,0.805664,1.018555,1.066406,0.796875,0.737305,0.758789,0.756836,0.805664,0.992676,0.961426
1,FOODS_1_001_CA_2_evaluation,0.813965,0.800781,0.742188,0.666016,0.802246,1.132812,0.920898,0.836914,0.703125,0.727539,0.772949,0.90332,1.058594,1.185547,0.811035,0.753906,0.744141,0.775879,0.884766,1.125977,1.019531,0.774414,0.724121,0.713867,0.676758,0.775879,1.003906,0.916992
2,FOODS_1_001_CA_3_evaluation,0.739258,0.715332,0.762695,0.736328,1.021484,1.166992,1.210938,1.064453,0.977539,1.037109,1.114258,1.019531,1.110352,1.085938,0.936035,0.926758,0.941895,1.005859,1.040039,1.235352,1.110352,0.932617,0.928711,0.939941,0.929199,1.004883,1.186523,1.145508
3,FOODS_1_001_CA_4_evaluation,0.273926,0.280762,0.327637,0.351318,0.326904,0.337158,0.325684,0.323486,0.322754,0.316406,0.300781,0.356934,0.369629,0.36084,0.337646,0.33374,0.329102,0.340576,0.369629,0.388184,0.40918,0.343262,0.336426,0.36377,0.356934,0.35791,0.395264,0.380371
4,FOODS_1_001_TX_1_evaluation,0.577148,0.952148,0.626953,0.775879,0.909668,0.88916,1.095703,0.921387,0.780273,0.833496,0.826172,0.884766,1.013672,0.994141,0.819824,0.790527,0.759766,0.805664,0.863281,1.024414,1.021484,0.842773,0.869629,0.882812,0.862793,0.946777,1.107422,1.083984
5,FOODS_1_001_TX_2_evaluation,0.368652,0.356445,0.371582,0.31665,0.342285,0.394531,0.451172,0.415527,0.397949,0.416504,0.416504,0.49585,0.52002,0.514648,0.439453,0.425293,0.440918,0.454346,0.486572,0.557129,0.551758,0.44751,0.446777,0.418701,0.39917,0.435791,0.525879,0.484131
6,FOODS_1_001_TX_3_evaluation,0.477539,0.44458,0.44458,0.407715,0.485352,0.520996,0.503906,0.452637,0.462402,0.451904,0.470215,0.494629,0.539062,0.539062,0.485107,0.458984,0.487061,0.482666,0.517578,0.59668,0.564941,0.477539,0.476318,0.448242,0.433594,0.464111,0.552734,0.533203
7,FOODS_1_001_WI_1_evaluation,0.520996,0.477539,0.48584,0.476318,0.538086,0.52002,0.51416,0.448975,0.42627,0.427246,0.461426,0.54248,0.599609,0.595703,0.46582,0.454102,0.471191,0.478271,0.540039,0.622559,0.605469,0.479492,0.470947,0.477051,0.48291,0.555664,0.616211,0.566406
8,FOODS_1_001_WI_2_evaluation,0.490723,0.510742,0.886719,0.704102,0.768555,0.775879,0.874023,0.662109,0.723633,0.736328,0.840332,0.964355,0.984863,0.984863,0.856445,0.791016,0.803711,0.827148,0.833496,0.899414,0.908203,0.750977,0.790527,0.849121,0.786133,0.853516,0.904297,0.894531
9,FOODS_1_001_WI_3_evaluation,0.337891,0.335449,0.33667,0.361572,0.375977,0.32666,0.305176,0.284424,0.313721,0.348145,0.341553,0.431641,0.401855,0.384277,0.345703,0.333252,0.335449,0.362305,0.38208,0.435791,0.426025,0.362305,0.379395,0.389893,0.367432,0.40918,0.399902,0.377197


In [28]:
sub.id.nunique(), sub["id"].str.contains("validation$").sum()

(60980, 30490)

In [29]:
sub.shape

(60980, 29)