# Future Sales Prediction (0p98)

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline 

from itertools import product
import gc

from sklearn import preprocessing
import sklearn
import lightgbm as lgb

In [None]:
def downcast_dtypes(df):
    """Shrink the 64-bit numeric columns of ``df`` to 32 bits.

    Columns whose dtype is ``float64`` are converted to ``float32`` and
    columns whose dtype is ``int64`` are converted to ``int32``. Columns of
    any other dtype are left untouched.

    The columns are reassigned on ``df`` itself, and the same frame is
    returned for convenience.
    """
    # (wide dtype name, narrow replacement); floats are handled before ints
    narrowing = (("float64", np.float32), ("int64", np.int32))

    for wide_name, narrow_type in narrowing:
        wide_cols = [col for col in df if df[col].dtype == wide_name]
        df[wide_cols] = df[wide_cols].astype(narrow_type)

    return df

Data files are loaded

In [None]:
# Raw competition files, read in this order: sales history, shops, items,
# item categories and the shop/item pairs to predict
sales, shops, items, item_cats, test = (
    pd.read_csv(csvPath)
    for csvPath in (
        '../input/competitive-data-science-predict-future-sales/sales_train.csv',
        '../input/competitive-data-science-predict-future-sales/shops.csv',
        '../input/competitive-data-science-predict-future-sales/items.csv',
        '../input/competitive-data-science-predict-future-sales/item_categories.csv',
        '../input/competitive-data-science-predict-future-sales/test.csv',
    )
)

## EDA

### Test
The test file lists the "shop_id" / "item_id" pairs for which the number of items sold in month 34 ("date_block_num" = 34) has to be predicted

In [None]:
# Preview the first rows of the test set
test.head(5)

### Sales

In [None]:
# Preview the first rows of the raw sales history
sales.head(5)

1. The test data (with the target set to 0) has been appended to the sales history, and the combined file is loaded
2. Negative "item_cnt_day" values correspond to returns, so these rows are deleted
3. A negative price is a wrong product entry, so these rows are deleted
4. The predictions are monthly and concern only the number of items sold. Therefore date and price are dropped.

In [None]:
# 1. Sales history with the test rows (target set to 0) already appended
sales = pd.read_csv('../input/salesprediction/sales_train_test.csv')
# 2. Negative counts are returned items and are discarded.
#    `~(x < 0)` is used instead of `x >= 0` so that rows with a missing value are kept
sales = sales.loc[~(sales['item_cnt_day'] < 0)]
# 3. Negative prices are entry errors and are discarded,
# 4. then the columns that are not needed for monthly counts are removed
sales = sales.loc[~(sales['item_price'] < 0)].drop(columns=['date', 'item_price'])

Month is added as a feature to capture seasonal trend

In [None]:
# Calendar month (1-12) derived from the running month index
sales['month'] = (sales['date_block_num'] % 12 + 1).to_numpy()

### Items
No further action is performed with the items information

In [None]:
# Report the number of rows of the items table and preview it
print('There are ' + str(len(items)) + ' different items')
items.head(5)

### Item categories

In [None]:
# Report the number of rows of the item-categories table and preview it.
# The message names categories: it counts `item_cats`, not `items`.
print('There are ' + str(len(item_cats)) + ' different item categories')
item_cats.head(5)

There are 84 different item categories. Each item belongs to an item category. 
With the help of Google Translate, the names have been translated to English and 3 more features have been added:
 - cat_1 and cat_2: subcategories of the item category
 - cat_digital: if the item category is digital has a value of 1 and, if not, a value of 0

In [None]:
# Replace the raw categories with the enriched table (';'-separated file) and preview it
item_cats = pd.read_csv('../input/salesprediction/item_categories_extra.csv', sep=';')
item_cats.head(5)

### Shops

In [None]:
# Report the number of rows of the shops table and preview it
print('There are ' + str(len(shops)) + ' different shops')
shops.head(5)

As a first step, the total number of items sold per month is plotted for each shop

In [None]:
# Units sold by every shop in every month
shopsSales = (
    sales
    .groupby(['shop_id', 'date_block_num'], as_index=False)
    .agg(shop_month_sales=('item_cnt_day', 'sum'))
)

# One line per shop, drawn in the order in which shops first appear in `sales`
fig, ax = plt.subplots()
for shopId in sales['shop_id'].unique():
    shopSales = shopsSales.loc[shopsSales['shop_id'] == shopId]
    ax.plot(shopSales['date_block_num'], shopSales['shop_month_sales'])
ax.set_xlabel('date_block_num')
ax.set_ylabel('shop_month_sales')

As shown in the previous figure, not all shops are active in all months. Therefore, all shops that do not sell any item in the last month will not be considered, as it is assumed they are closed

In [None]:
# A shop whose most recent active month is earlier than this one is treated as closed
minMonth = 33
closedShopIds = [
    shopId
    for shopId in sales['shop_id'].unique()
    if shopsSales.loc[shopsSales['shop_id'] == shopId, 'date_block_num'].max() < minMonth
]

print('There are ' + str(len(closedShopIds)) + ' closed shops at the last month')

Similarly to the item categories, the shop names have been translated to English with the help of Google Translate.
From the names, some information is extracted and two features are added:
- shop_type: type of shop according to the name. For example shopping center, online, ...
- shop_zip: in the shop name, the city is included. Instead of the city name, the city zip code is considered as feature because near cities have similar zip codes

Apart from that, there are 3 duplicated shops. Therefore the sales dataframe is updated

In [None]:
# Duplicated shops are merged
sales.loc[sales.shop_id == 0, 'shop_id'] = 57
sales.loc[sales.shop_id == 1, 'shop_id'] = 58
sales.loc[sales.shop_id == 11, 'shop_id'] = 10

shops = pd.read_csv('../input/salesprediction/shops_extra.csv', sep=';')
shops.head(5)

### Feature Matrix
A feature matrix is created with, for each shop, every combination of the items sold in that shop and the months. All additional features will be added to this matrix

In [None]:
# First month considered to calculate the previous month sells
firstMonth = 20 
# Initial features columns names
featCols = ['shop_id', 'item_id', 'date_block_num']
# Shop ids to be kept
shopIds = list(np.setdiff1d(sales['shop_id'].unique(), closedShopIds))
# All months are considered because the solds in the previous months are necessary
shopMonths = sales['date_block_num'].unique() 

# Feature matrix is initialized
featMat = []

for shopId in shopIds:
    shopItems = sales[(sales['date_block_num'] >= firstMonth) & (sales['shop_id'] == shopId)]['item_id'].unique()
    featMat.append(np.array(list(product(*[[shopId], shopItems, shopMonths])),dtype='int32'))
# Turn the grid into a dataframe
featMat = pd.DataFrame(np.vstack(featMat), columns = featCols,dtype=np.int32)    
featMat.head(3)

#### Target
The target is, for each item, shop and month, the total number of articles sold. Target is calculated

In [None]:
# Sold items for each shop and month are added. It is the target column
targetMat = sales.groupby(['shop_id', 'date_block_num', 'item_id'], as_index=False).agg(target = ('item_cnt_day', 'sum'))
featMat = pd.merge(featMat, targetMat, how='left', on=featCols).fillna(0)
fig, ax = plt.subplots(figsize=(15, 6), nrows=1, ncols=2)
ax[0].hist(featMat['target'], 100)
ax[0].set_ylabel('occurrencies')
ax[0].set_xlabel('target')
ax[1].hist(featMat['target'], 100)
ax[1].set_ylabel('occurrencies')
ax[1].set_xlabel('target')
ax[1].set_ylim([0, 1000])

As shown, for most of the items only a few articles are sold, and only in a few cases are many articles sold. This means there is a low probability that many articles of an item are sold. Therefore, instead of scaling the target between 0 and 20, the target is clipped between these values (see https://www.kaggle.com/code/zhixx018/coursera-final-0-98-lgbm/notebook). This way the few occurrences with many articles sold will not impact the predictions.

In [None]:
# Solds are clipped
# Monthly sales are limited to the range [0, 20] and stored as integers
clippedTarget = featMat['target'].clip(lower=0, upper=20)
featMat['target'] = clippedTarget.astype(np.int32)

#### Previously described features are added

In [None]:
# Month is added as a feature to capture seasonal trend
featMat['month'] = (featMat['date_block_num'].values % 12 + 1).astype(np.int8)

# Item_category id is added as a feature
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()
featMat = pd.merge(featMat, item_category_mapping, how='left', on='item_id').astype(np.int32)

# Item category features are added
# Categorical features cat_1 and cat_2 are encoded
le = preprocessing.LabelEncoder()
cat_1_2 = np.concatenate((item_cats['cat_1'], item_cats['cat_2']))
le.fit(cat_1_2)
item_cats['cat_1'] = le.transform(item_cats['cat_1']).astype(np.int32)
item_cats['cat_2'] = le.transform(item_cats['cat_2']).astype(np.int32)
item_cats.drop(columns=['item_category_name1', 'item_category_name2'], inplace=True)
featMat = pd.merge(featMat, item_cats, how='left', on='item_category_id')

# Shop features are added
shops['shop_type'] = le.fit_transform(shops['shop_type']).astype(np.int32)
shops.drop(columns=['shop_name', 'City'], inplace=True)
featMat = pd.merge(featMat, shops, how='left', on='shop_id')

del item_cats
del items
del shops
del sales

#### Additional features
Additional features are added:
- itemMonth: total number of units of each item sold per month
- shopMonth: total number of units sold by each shop per month

In [None]:
# Item solds per month are defined as feature
itemMat = featMat.groupby(['date_block_num', 'item_id'], as_index = False).agg(itemMonth = ('target', 'sum')).astype(np.int32)
# Total solds per shop each month is added as feature
shopMat = featMat.groupby(['date_block_num', 'shop_id'], as_index = False).agg(shopMonth = ('target', 'sum')).astype(np.int32)

# New features are merged with the feature matrix
featMat = pd.merge(featMat, itemMat, how='left', on=['date_block_num', 'item_id']).fillna(0)
featMat = pd.merge(featMat, shopMat, how='left', on=['date_block_num', 'shop_id']).fillna(0)

# Downcasting and cleaning is performed
featMat = downcast_dtypes(featMat)

#### Lagged features
For some features, their values in previous months are added as new features

In [None]:
# Previous months are considered
prevCols = ['target', 'shopMonth', 'itemMonth']
lastMonth = np.amax(featMat['date_block_num'].unique())
# Number of previous months to be considered
prevMonths = range(1, 13, 1)

# New columns are added
for prevCol in prevCols:
    for prevMonth in prevMonths:
        featMat[prevCol + '_' + str(prevMonth)] = np.zeros(len(featMat))

for month in range(firstMonth, lastMonth + 1, 1):
    for prevCol in prevCols:
        for prevMonth in prevMonths:        
            featMat.loc[featMat['date_block_num'] == month, prevCol + '_' + str(prevMonth)] = \
            featMat.loc[featMat['date_block_num'] == month - prevMonth, prevCol].values

# Once featMat is filled, not necessary months are droped
featMat.drop(featMat.loc[featMat['date_block_num'] < firstMonth].index, inplace=True)

# Downcasting and cleaning is performed
featMat = downcast_dtypes(featMat)
gc.collect();

featMat.head(15)

#### Feature matrix is saved / loaded

In [None]:
# Persist the finished feature matrix so that the steps above can be skipped later
featMat.to_csv('feature_matrix.csv', index = False)
# Uncomment to reload a previously saved matrix instead of rebuilding it
# NOTE(review): read_csv re-infers dtypes, so the downcast types are presumably lost on reload - confirm
# featMat = pd.read_csv('feature_matrix.csv')

## Predictions calculation

### Train / Test split
For the sake of the programming assignment, let's artificially split the data into train and test. We will treat the last month's data as the test set.

In [None]:
# Month index of every row of the feature matrix
dates = featMat['date_block_num']

# The most recent month is the one treated as the test month
last_block = dates.max()
print('Test `date_block_num` is %d' % last_block)

Features describing the current month's sales are removed because, for the month to be predicted, they are unknown (their value is 0)

In [None]:
# Row selectors: months before the last one are used for training, the last one is predicted
train_rows = dates < last_block
test_rows = dates == last_block

dates_train = dates[train_rows]
dates_test = dates[test_rows]

# Columns kept out of the model inputs: the target itself, the month index
# and the totals of the current month
to_drop_cols_mod = ['target', 'date_block_num', 'itemMonth', 'shopMonth']
X_train = featMat.loc[train_rows].drop(columns=to_drop_cols_mod)
X_test = featMat.loc[test_rows].drop(columns=to_drop_cols_mod)

y_train = featMat.loc[train_rows, 'target'].to_numpy()
y_test = featMat.loc[test_rows, 'target'].to_numpy()

### Model
lightGBM

In [None]:
lgb_params = {
    # Task
    'objective': 'rmse',
    'metric': 'rmse',
    # Tree shape and learning speed
    'num_leaves': 80,
    'min_data_in_leaf': 2**7,
    'learning_rate': 0.05,
    # Row / column subsampling
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'bagging_seed': 2**7,
    # Regularisation
    'lambda_l1': 0.05,
    'lambda_l2': 0.05,
    # Runtime
    'nthread': 1,
    'verbose': 0,
}

# 100 boosting rounds on the training months
train_set = lgb.Dataset(X_train, label=y_train)
model = lgb.train(lgb_params, train_set, num_boost_round=100)
pred_lgb = model.predict(X_test)

# Predictions are limited to the same [0, 20] range as the target
pred_lgb_scaled = np.clip(pred_lgb, a_min=0, a_max=20)

### Predictions file

In [None]:
subFileName = './submission.csv'
# The example submission file is loaded; its prediction column is replaced below
subFile = pd.read_csv('../input/competitive-data-science-predict-future-sales/sample_submission.csv')

# Rows of the month that was predicted (selector computed once and reused)
lastBlockRows = featMat['date_block_num'] == last_block

# `target` is an integer column while the predictions are floats. It is made
# float explicitly, because writing floats into an integer column relies on an
# implicit upcast that recent pandas versions deprecate.
featMat['target'] = featMat['target'].astype(np.float64)
featMat.loc[lastBlockRows, 'target'] = pred_lgb_scaled

# Predictions are looked up for every shop/item pair of the test set
test = pd.merge(test, featMat[lastBlockRows], how='left', on=['shop_id', 'item_id'])

# A pair missing from the feature matrix would yield NaN and an invalid
# submission; such pairs get 0, the count used for combinations without sales.
# The rows of `test` are assumed to be in the same order as those of the example file.
subFile['item_cnt_month'] = test['target'].fillna(0).values
subFile.to_csv(subFileName, index=False)