In [1]:
import utils2
sales_util = utils2.SalesUtils('')

import pandas as pd 
import numpy as np
import time
import gc
import os
import pickle

from itertools import product

from collections import Counter
import re
from operator import itemgetter
from sklearn.preprocessing import LabelEncoder



In [2]:
%%time
# Locations of the raw competition files and of the intermediate outputs.
input_path = '../data/'
working_path = '../working/'

# Parser for the 'dd.mm.YYYY' dates used in sales_train.csv. Kept as a name in
# case other cells use it; it no longer relies on the removed `pd.datetime`
# alias and accepts either a single string or a whole array/Series.
d_parser = lambda x: pd.to_datetime(x, format='%d.%m.%Y')

# Read the date column as text and convert it once with a vectorised
# pd.to_datetime call. The previous `date_parser=` route is deprecated in
# read_csv and called strptime once per value.
df_sales           = pd.read_csv(os.path.join(input_path, 'sales_train.csv'))
df_sales['date']   = pd.to_datetime(df_sales['date'], format='%d.%m.%Y')
df_items           = pd.read_csv(os.path.join(input_path, 'items.csv'))
df_item_categories = pd.read_csv(os.path.join(input_path, 'item_categories.csv'))
df_shops           = pd.read_csv(os.path.join(input_path, 'shops.csv'))
df_test            = pd.read_csv(os.path.join(input_path, 'test.csv'))

Wall time: 1min 44s


In [3]:
def down_cast_dataframe(df_matrix):
    """Shrink the memory footprint of the feature matrix by narrowing dtypes.

    The frame is modified in place and also returned, so the call can be used
    either as a statement or in an assignment.

    Parameters
    ----------
    df_matrix : pandas.DataFrame
        Must contain every column listed in ``fixed_dtypes`` below; a missing
        column raises ``KeyError``. Any column whose name contains ``'mean'``
        or ``'revenue'`` is additionally cast (see below).

    Returns
    -------
    pandas.DataFrame
        The same ``df_matrix`` object, with the narrowed dtypes.
    """
    # Columns that are always present, paired with their target dtype.
    # The order matches the order in which the casts are applied.
    fixed_dtypes = (
        ('date_block_num', 'int8'),
        ('shop_id', 'int8'),
        ('item_id', 'int16'),
        ('target', 'float16'),
        ('target_shop', 'float16'),
        ('target_item', 'float16'),
        ('month', 'int8'),
        ('item_category_id', 'int8'),
        ('parent_cat_id', 'int8'),
        ('city_id', 'int8'),
        ('num_days', 'int8'),
        ('num_sun', 'int8'),
        ('num_sat', 'int8'),
        ('name_2', 'int16'),
        ('name_3', 'int16'),
    )
    for column, dtype in fixed_dtypes:
        df_matrix[column] = df_matrix[column].astype(dtype)

    # Columns selected by a substring of their name: first every '*mean*'
    # column goes to float16, then every '*revenue*' column goes to float32.
    # A column matching both substrings therefore ends up as float32.
    for marker, dtype in (('mean', 'float16'), ('revenue', 'float32')):
        matching = [name for name in df_matrix.columns if marker in str(name)]
        for column in matching:
            df_matrix[column] = df_matrix[column].astype(dtype)

    return df_matrix

# Train Set Pipeline-1
Cleaning, imputation, outlier removal, merging, feature creation, etc.

In [4]:
%%time
############ DELETING OUTLIERS
# Keep only the rows whose daily quantity is at most 1001. The explicit copy
# makes df_sales an independent frame, so the .loc assignment below does not
# raise pandas' SettingWithCopyWarning.
df_sales = df_sales[df_sales['item_cnt_day'] <= 1001].copy()
# Any item_cnt_day below 1 (which includes negative counts) is set to 0.
df_sales.loc[df_sales['item_cnt_day'] < 1, 'item_cnt_day'] = 0

# Prices above 55k are treated as outliers and prices <= 0 as invalid;
# rows with either are dropped.
price_is_valid = (df_sales['item_price'] > 0) & (df_sales['item_price'] <= 55000)
df_sales = df_sales[price_is_valid].copy()

############ ADDING DATE ATTRIBUTES
# Adding the date time attributes (like week day, month number, etc.)
df_sales = sales_util.add_date_attributes(df_sales)

############ REPLACING DUPLICATE SHOPS FROM SALES
# Map the duplicate shop ids onto the id that is kept. The result is assigned
# back to the column: calling replace(inplace=True) on `df_sales['shop_id']`
# operates on a temporary selection, which is deprecated and leaves df_sales
# unchanged when pandas Copy-on-Write is active.
df_sales['shop_id'] = df_sales['shop_id'].replace({0: 57, 1: 58, 11: 10, 40: 39})

############ REMOVING OUTDATED SHOPS FROM SALES
# NOTE: ids 0, 1, 11 and 40 were already remapped above, so listing them here
# has no further effect; they are kept to leave the list unchanged.
outdated_shops = [0, 1, 8, 11, 13, 17, 23, 29, 30, 32, 33, 40, 43, 54]
df_sales = df_sales[~df_sales['shop_id'].isin(outdated_shops)]

############ MERGING WITH THE SALES AND SHOPS AND ITEMS/ITEM_CATEGORIES
df_sales = sales_util.merge_items_sales_n_shops(df_sales)

############ CREATING THE REVENUE
# Revenue of each sales row: quantity sold times unit price.
df_sales['revenue'] = df_sales['item_cnt_day'] * df_sales['item_price']

Wall time: 10.7 s


# Test Set Pipeline-1
Cleaning, imputation, outlier removal, merging, feature creation, etc.

In [5]:
%%time
#1. test data frame, adding date_block_num and month
# Every test row is stamped with date block 34 and month 11.
df_test['date_block_num'] = 34
df_test['month'] = 11

############ REPLACING DUPLICATE SHOPS IN THE TEST SET
# Apply the same duplicate-shop mapping that the train pipeline applies to
# df_sales. This step previously targeted df_sales again (a copy-paste slip),
# so the test frame was never remapped. The result is assigned back because
# replace(inplace=True) on a column selection is unreliable (it is a no-op
# under pandas Copy-on-Write).
df_test['shop_id'] = df_test['shop_id'].replace({0: 57, 1: 58, 11: 10, 40: 39})

#2. merging, just like the sales
df_test = sales_util.merge_items_sales_n_shops(df_test)

############ REPLACING DUPLICATE CATEGORY
# Map duplicate category ids onto the id that is kept.
# NOTE(review): no equivalent replacement is visible for the train frame in
# this notebook - confirm whether merge_items_sales_n_shops already handles it.
df_test['item_category_id'] = df_test['item_category_id'].replace({8: 80, 27: 74, 75: 76})

Wall time: 993 ms


# Train Set Pipeline-2
Advanced feature generation, monthly grouping, and merging with the test set.

In [6]:
%%time

############ DAYS IN A MONTH BY DATE_BLOCK_NUM
# getting the number of days, and sundays by date_block_num
days_counts = sales_util.get_days_count()

############ MATRIX CONVERSION BY MONTH SHOP_ID X ITEM_ID for each DATE_BLOCK_NUM
df_matrix = sales_util.get_matrix_by_block(df_sales)

############ MERGING WITH THE SALES AND SHOPS AND ITEMS/ITEM_CATEGORIES
# This needs to be performed again, since many of the columns are ignored
# while converting to the matrix.
df_matrix = sales_util.merge_items_sales_n_shops(df_matrix)

############ DELETE THE UNWANTED COLUMNS ONCE
unwanted_cols = ['item_category_name','item_cat_en','parent_cat','city_name']
df_matrix = df_matrix.drop(columns=unwanted_cols)
# df_test comes from an earlier cell. Rebinding it (instead of an in-place
# drop) and tolerating already-removed columns keeps this cell re-runnable:
# a second execution used to fail with KeyError because the columns were gone.
df_test = df_test.drop(columns=unwanted_cols, errors='ignore')

############ CONCATING BOTH TEST AND TRAIN(SALES)
df_matrix = pd.concat([df_matrix, df_test], axis=0)
df_matrix = df_matrix.drop(columns = ['ID'])
# Columns that exist on only one side of the concat are NaN on the other
# side; they are filled with 0 here.
df_matrix = df_matrix.fillna(0)

############ JOIN THE NUM_DAYS
# No explicit key is given, so pandas joins on the columns the two frames
# have in common; with how='inner', rows without a match are dropped.
df_matrix = df_matrix.merge(days_counts, how='inner')

############ DOWN CASTING 
df_matrix = down_cast_dataframe(df_matrix)

############ ADDING THE MEANS
# Mean attributes, identified by number:
#
#  1: expanding mean by shop id
#  2: shop/item target mean
#  3: item id target mean
#  4: month target mean
#  5: parent cat target mean
#  6: item category target mean
#  7: shop id target mean
#  8: city id target mean
#  9: shop_city target mean
# 10: date_block_num target mean
#
# NOTE(review): these columns are set aside before add_lags and merged back
# afterwards, so they are not lagged here - confirm that add_mean_features
# does not leak the current month's target into its features.
means_to_be_used = [1,2,3,4,5,6,7,8,9,10]
df_matrix = sales_util.add_mean_features(df_matrix, means_to_be_used)
df_matrix = df_matrix.fillna(0)
gc.collect()

############ DOWN CASTING 
# Second pass: the newly added '*mean*' columns are cast to float16 here.
df_matrix = down_cast_dataframe(df_matrix)
gc.collect()


############ ADDING LAGS
mean_enc_cols = [col for col in df_matrix.columns if 'mean' in str(col)]
# removing the mean cols temporarily so that memory usage doesn't get high
dftmp = df_matrix[mean_enc_cols + ['shop_id','item_id','date_block_num']]
df_matrix = df_matrix.drop(columns=mean_enc_cols)
shift_range = [1,2,3,4,12]
# additional columns, not to be counted when calculating lags
except_cols = ['num_days','num_sat','num_sun','name_2','name_3','city_id','parent_cat_id','item_category_id','month'] 
df_matrix = sales_util.add_lags(df_matrix, shift_range, except_cols)
# adding back the mean cols again
# (assumes (shop_id, item_id, date_block_num) is unique per row - TODO confirm;
# duplicate keys would multiply rows in this merge)
df_matrix = df_matrix.merge(dftmp, how='inner', on=['shop_id','item_id','date_block_num'])

############ REMOVING THE EARLIEST DATE BLOCKS
# Keeps only date_block_num > 3, i.e. drops blocks 0 to 3.
# NOTE(review): shift_range includes a lag of 12, which presumably has no
# history for blocks below 12 - confirm that a cut-off of 3 is intended.
df_matrix = df_matrix[df_matrix.date_block_num > 3]

############ SAVING THE DATA IN WORKING DIRECTORY
df_matrix.to_csv(os.path.join(working_path, 'df_main_with_test.csv'), index=False)

HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Wall time: 17min 47s
