In [1]:
# Show the interpreter version string so readers know which Python produced the outputs.
import sys

python_version = sys.version
print(python_version)

3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 11:07:29) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [2]:
import datetime
import sqlite3
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns

from scipy.stats import variation

# Let pandas render up to 100 columns so wide frames are not truncated when displayed.
pd.options.display.max_columns = 100

### IMPORT ALL DATASETS 

In [52]:
# Read the sales training CSV into `sales` (path is relative to the working directory).
sales_train_path = './Data/competitive-data-science-predict-future-sales/sales_train.csv'
sales = pd.read_csv(filepath_or_buffer=sales_train_path)

### PERFORM INITIAL DATA CLEANING 

In [53]:
# Convert the `date` column from 'DD.MM.YYYY' strings to datetime values.
# pd.to_datetime with an explicit format parses the whole column in one vectorized call
# (no per-row Python strptime), and re-running this cell on an already-converted column
# leaves it unchanged instead of raising.
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')

In [54]:
# Discard rows that are exact copies of another row (all columns equal); the first copy is kept.
sales = sales.drop_duplicates()

In [55]:
# Collect every row whose (shop, item, date) key occurs more than once.
# keep=False flags all members of a repeated key, not only the later occurrences.
key_columns = ['shop_id', 'item_id', 'date']
repeated_key_mask = sales.duplicated(subset=key_columns, keep=False)
dupes = sales[repeated_key_mask]
dupes.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
6959,2013-01-29,0,25,12133,889.0,1.0
6960,2013-01-29,0,25,12133,1389.0,1.0
70686,2013-01-25,0,31,14050,349.0,1.0
70718,2013-01-25,0,31,14050,248.0,1.0
103461,2013-01-27,0,38,15702,549.0,1.0


In [56]:
# Among the repeated keys, list the rows where the quantity sold is anything other than 1
dupes.loc[dupes['item_cnt_day'].ne(1.0)]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
103463,2013-01-27,0,38,15702,149.0,-1.0
1154620,2013-12-29,11,27,12133,669.0,2.0
2272100,2014-12-26,23,17,3424,999.0,2.0
2456815,2015-02-17,25,5,21619,332.66,-1.0


In [57]:
# Inspect the repeated rows for shop 38 / item 15702, one of the keys that has a negative quantity
dupes.query('shop_id == 38 and item_id == 15702')

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
103461,2013-01-27,0,38,15702,549.0,1.0
103463,2013-01-27,0,38,15702,149.0,-1.0


In [58]:
# Inspect the repeated rows for shop 5 / item 21619, the other key that has a negative quantity
dupes.query('shop_id == 5 and item_id == 21619')

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2456813,2015-02-17,25,5,21619,499.0,1.0
2456815,2015-02-17,25,5,21619,332.66,-1.0


In [59]:
# Remove the two shop-item-date pairs inspected above from `dupes`. In each pair a +1 row
# is offset by a -1 row, so the net quantity is zero and a quantity-weighted average price
# cannot be computed for them.
#
# The match includes the date so that only these exact shop-item-date keys are removed;
# matching on shop and item alone would also discard same-day price splits for those
# shop/item combinations on any other date.
is_shop38_pair = (
    (dupes['shop_id'] == 38)
    & (dupes['item_id'] == 15702)
    & (dupes['date'] == pd.Timestamp('2013-01-27'))
)
is_shop5_pair = (
    (dupes['shop_id'] == 5)
    & (dupes['item_id'] == 21619)
    & (dupes['date'] == pd.Timestamp('2015-02-17'))
)
dupes = dupes[~(is_shop38_pair | is_shop5_pair)]

In [60]:
# Collapse the remaining shop-item-date-price rows into one row per shop-item-date:
# quantity is summed and price becomes the quantity-weighted average,
# i.e. sum(price * quantity) / sum(quantity) within each group.
#
# The weighted average is built from vectorized group sums instead of a per-group Python
# callback that looks weights up in the global `dupes` (which this cell rebinds).
group_keys = ['shop_id', 'item_id', 'date', 'date_block_num']

group_totals = (dupes.assign(revenue=dupes['item_price'] * dupes['item_cnt_day'])
                .groupby(group_keys)[['item_cnt_day', 'revenue']]
                .sum())

# A weighted average is undefined when the quantities in a group cancel out to zero.
assert (group_totals['item_cnt_day'] != 0).all(), 'shop-item-date group with zero net quantity'

group_totals['item_price'] = group_totals['revenue'] / group_totals['item_cnt_day']

# Back to flat columns: the group keys, then item_cnt_day and item_price.
dupes = group_totals.drop(columns='revenue').reset_index()

In [61]:
# Preview the first five collapsed shop-item-date rows
dupes.head(n=5)

Unnamed: 0,shop_id,item_id,date,date_block_num,item_cnt_day,item_price
0,12,21619,2015-10-07,33,2.0,449.0
1,16,12133,2013-03-02,2,2.0,1139.0
2,16,15702,2013-02-18,1,2.0,349.0
3,17,3424,2014-12-26,23,3.0,1065.666667
4,25,12133,2013-01-29,0,2.0,1139.0


In [62]:
# Drop from `sales` every row whose (shop, item, date) key is repeated; keep=False removes
# all members of a repeated key rather than leaving one behind.
sales = sales.drop_duplicates(subset=['shop_id', 'item_id', 'date'], keep=False)

# Append the rows held in `dupes`. sort=True orders the columns alphabetically and
# ignore_index=True renumbers the rows 0..n-1.
sales = pd.concat([sales, dupes], axis=0, sort=True, ignore_index=True)

In [65]:
# Keep only rows with a strictly positive price (this drops negative, zero and missing prices)
has_positive_price = sales['item_price'].gt(0.)
sales = sales.loc[has_positive_price]

In [66]:
# Display (row count, column count) of the cleaned frame
n_rows, n_cols = sales.shape
(n_rows, n_cols)

(2935818, 6)

### SHOP-LEVEL FEATURES 

### ITEM-LEVEL FEATURES

In [67]:
# Per-item coefficient of variation of price, computed with scipy.stats.variation.
# The result is a two-column frame: item_id, coef_var_price.
price_by_item = sales.groupby('item_id')['item_price']
item_level_features = (price_by_item.agg(variation)
                       .rename('coef_var_price')
                       .reset_index())

In [70]:
# Mean absolute deviation of quantity sold for each item: mean(|x - mean(x)|) over that
# item's rows.
#
# GroupBy.mad() was deprecated in pandas 1.5 and removed in pandas 2.0, so the statistic is
# computed explicitly with operations that work on both old and new pandas versions.
item_mean_quantity = sales.groupby('item_id')['item_cnt_day'].transform('mean')
abs_deviation = (sales['item_cnt_day'] - item_mean_quantity).abs()
mad_by_item = abs_deviation.groupby(sales['item_id']).mean()

# Align on item_id instead of relying on both frames happening to share the same row order.
item_level_features['quant_mean_abs_dev'] = item_level_features['item_id'].map(mad_by_item)

### DATE-LEVEL FEATURES

### ITEM-DATE-LEVEL FEATURES

### SHOP-DATE-LEVEL FEATURES

### SHOP-ITEM-LEVEL FEATURES 

### SHOP-ITEM-DATE-LEVEL FEATURES