In [1]:
import pandas as pd
import numpy as np

In [2]:
# read data
calendar_df = pd.read_csv('../data/raw/calendar.csv')
sell_prices_df = pd.read_csv('../data/raw/sell_prices.csv')
sales_train_val_df = pd.read_csv('../data/raw/sales_train_validation.csv')
#submission_df = pd.read_csv('../data/raw/sample_submission.csv')

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

In [None]:
calendar_df['event_name_1'].value_counts(dropna = False)

In [4]:
calendar_df.shape

(1969, 14)

In [5]:
sell_prices_df.shape

(6841121, 4)

In [6]:
sales_train_val_df.shape

(30490, 1919)

In [7]:
#submission_df.shape

In [8]:
60980 /2

30490.0

In [9]:
sales_train_val_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [10]:
id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
sales_train_val_df = sales_train_val_df.melt(id_vars= id_cols,
                               var_name= 'days',
                               value_name = 'units_sold')
sales_train_val_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,days,units_sold
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


In [11]:
sales_train_val_df["days"] = sales_train_val_df["days"].apply(lambda x: int(x.split("_")[1]))

In [12]:
# creating id column in sell_prices
sell_prices_df["id"] = sell_prices_df["item_id"] + "_" + sell_prices_df["store_id"] + "_validation"

In [13]:
# 1969 - 28 # no use in may, test set for june

# 1941 - 28 # test set for may, validation for june

# 1913 - 28 # validation for may

In [14]:
reduce_mem_usage(sales_train_val_df)

Mem. usage decreased to 2892.52 Mb (18.7% reduction)


In [15]:
sales_train_val_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,days,units_sold
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0


In [16]:
sell_prices_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
0,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation
1,CA_1,HOBBIES_1_001,11326,9.58,HOBBIES_1_001_CA_1_validation
2,CA_1,HOBBIES_1_001,11327,8.26,HOBBIES_1_001_CA_1_validation
3,CA_1,HOBBIES_1_001,11328,8.26,HOBBIES_1_001_CA_1_validation
4,CA_1,HOBBIES_1_001,11329,8.26,HOBBIES_1_001_CA_1_validation


In [17]:
calendar_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [18]:
calendar_df["days"] = calendar_df["d"].apply(lambda x: int(x.split("_")[1]))

In [19]:
calendar_sell_prices_merged = sell_prices_df.merge(calendar_df, 
                                                   on = ['wm_yr_wk'], how = 'inner')

In [20]:
calendar_sell_prices_merged.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id,date,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,days
0,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation,2013-07-13,Saturday,1,7,2013,d_897,,,,,0,1,0,897
1,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation,2013-07-14,Sunday,2,7,2013,d_898,,,,,0,0,1,898
2,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation,2013-07-15,Monday,3,7,2013,d_899,,,,,0,1,1,899
3,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation,2013-07-16,Tuesday,4,7,2013,d_900,,,,,0,0,0,900
4,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation,2013-07-17,Wednesday,5,7,2013,d_901,,,,,0,0,0,901


In [21]:
calendar_sell_prices_merged["days"] = calendar_sell_prices_merged["d"].apply(lambda x: int(x.split("_")[1]))

In [22]:
sales_train_sell_prices_merged = sales_train_val_df.merge(calendar_sell_prices_merged[["id", "wm_yr_wk"]], 
                                                          on=, how='inner')

KeyError: ['id', 'wm_yr_wk']