In [None]:
# load data
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Plot styling, applied in this order: seaborn "ticks" with larger tick marks,
# then matplotlib's ggplot style sheet.
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
plt.style.use('ggplot')

# Read the five input tables; the unpacking order matches the path order.
items, train, test, item_category, shops = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/items.csv',
        '../input/sales_train.csv',
        '../input/test.csv',
        '../input/item_categories.csv',
        '../input/shops.csv',
    )
)

# Set to True to make the helper functions below print their diagnostics.
DEBUG = False


def eda(data):
    """Print a quick textual overview of a DataFrame.

    Sections, in order: first five rows, ``info()`` report, dtypes,
    per-column missing-value counts (printed twice, once via ``isnull`` and
    once via its alias ``isna``), and the shape.

    Args:
        data: the pandas DataFrame to describe.

    Returns:
        None. Everything is written to standard output.
    """
    print("----------Top-5- Record----------")
    print(data.head(5))
    print("-----------Information-----------")
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() -- that would append a stray "None" line.
    data.info()
    print("-----------Data Types-----------")
    print(data.dtypes)
    print("----------Missing value-----------")
    print(data.isnull().sum())
    # isna() is an alias of isnull(); this section repeats the counts above.
    print("----------Null value-----------")
    print(data.isna().sum())
    print("----------Shape of Data----------")
    print(data.shape)

def graph_insight(data):
    """Draw one histogram per float64/int64 column of ``data``.

    When the module-level ``DEBUG`` flag is set, the set of dtypes present in
    ``data`` is printed first.

    Args:
        data: the pandas DataFrame whose numeric columns are plotted.
    """
    if DEBUG:
        dtypes_present = set(data.dtypes.tolist())
        print(dtypes_present)

    numeric_columns = data.select_dtypes(include=['float64', 'int64'])
    numeric_columns.hist(
        figsize=(16, 16),
        bins=50,
        xlabelsize=8,
        ylabelsize=8,
    )
    
def remove_duplicate(data, subset):
    """Drop duplicate rows from ``data`` in place and renumber its index.

    Rows that agree on every column in ``subset`` count as duplicates; the
    first occurrence is kept. When the module-level ``DEBUG`` flag is set, the
    shape before and after and the number of removed rows are printed.

    Args:
        data: the pandas DataFrame to de-duplicate (modified in place).
        subset: list of column names to compare when looking for duplicates.

    Returns:
        None.
    """
    if DEBUG:
        print('Before drop shape:', data.shape)
    rows_before = data.shape[0]

    data.drop_duplicates(subset=subset, keep='first', inplace=True)
    # Renumber 0..n-1 so the index has no gaps where rows were removed.
    data.reset_index(drop=True, inplace=True)

    rows_after = data.shape[0]
    if DEBUG:
        print('After drop shape:', data.shape)
        print('Total Duplicate:', rows_before - rows_after)

    
def remove_unreasonable_data(data, max_price=300000):
    """Drop rows with an implausible ``item_price`` from ``data`` in place.

    A row is removed when its ``item_price`` is missing, is zero or negative,
    or exceeds ``max_price``. When the module-level ``DEBUG`` flag is set, the
    50 largest and smallest ``item_price`` and ``item_cnt_day`` rows are
    printed before anything is removed.

    Args:
        data: pandas DataFrame with ``item_price`` and ``item_cnt_day``
            columns (modified in place).
        max_price: upper bound for a valid price; rows above it are dropped.

    Returns:
        None.
    """
    size = 50

    if DEBUG:
        print("Max of item price records:",data.nlargest(size, 'item_price'))
        print("Min of item price records:",data.nsmallest(size, 'item_price'))

        print("Max of item cnt day records:",data.nlargest(size, 'item_cnt_day'))
        print("Min of item cnt day records:",data.nsmallest(size, 'item_cnt_day'))

    # item_cnt_day < 0 means customers return the sold goods, so those rows
    # are deliberately kept; only the price is validated here.
    price = data.item_price
    # Missing prices need isna(): comparing with `== None` is evaluated
    # element-wise and is False for every row, so it never matches anything.
    invalid_price = price.isna() | (price <= 0) | (price > max_price)
    data.drop(data[invalid_price].index, inplace=True)

print("Rows to train")
print(train.count())
# A notebook cell only renders its last expression, so the category preview
# has to be shown explicitly; a bare `item_category.head(5)` here is discarded.
display(item_category.head(5))
train.head(5)

In [None]:
# Preview the first rows of every input table. display() is used because a
# cell only renders its final expression; bare head() calls on earlier lines
# would produce no output.
display(items.head(5), train.head(5), item_category.head(5), shops.head(5))

In [None]:
# First five rows of the items table.
items.head(5)

In [None]:
# First five rows of the item-category table.
item_category.head(5)

# 1. Sales Train Data Cleaning

In [None]:
if DEBUG:
    eda(train)

# Drop Duplicate Data: rows that agree on all of these columns are duplicates.
subset = ['date', 'date_block_num', 'shop_id', 'item_id', 'item_cnt_day', 'item_price']

remove_duplicate(train, subset=subset)
remove_unreasonable_data(train)

# Attach the item attributes (everything except the name) to each sales row.
# - how='left' keeps exactly the sales rows. An outer join would also add one
#   all-NaN row for every item that was never sold; those NaNs upcast integer
#   columns such as shop_id and date_block_num to float, which later breaks
#   the string-keyed join against the test ids ('59.0' vs '59').
# - The guard makes the cell safe to re-run: merging a second time would
#   produce suffixed duplicates (item_category_id_x / item_category_id_y).
if 'item_category_id' not in train.columns:
    train = train.merge(items.drop(columns=['item_name']), how='left', on='item_id')

train.to_csv('result.csv')
train.head(4)

# 2. Sales Per Month Count

In [None]:
# Not rendered: only a cell's last expression is displayed, and this cell ends
# with an assignment.
train.head(20)

# Total item_cnt_day per (date_block_num, shop_id, item_id), with the keys
# kept as ordinary columns.
test_df = (
    train
    .groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False)['item_cnt_day']
    .sum()
)

# Distribution Checking

In [None]:
# Compare the raw item_price distribution with its log1p transform.
#
# log1p() and expm1():
#   log1p(x) = log(x + 1), so it is defined at x = 0.
#   The transform compresses the long right tail, making the values smoother
#   and closer to a Gaussian shape, which helps later modelling.
#   expm1() is the inverse; apply it to get back to the original price scale.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 6))

# Histogram the prices themselves. A histogram of value_counts() would show
# how often each distinct price repeats, not the distribution of prices that
# the axis label and title describe.
train.item_price.hist(ax=ax_raw)
ax_raw.set_xlabel('Item Price', fontsize=20)
ax_raw.set_title('Original Distiribution')

log_price = np.log1p(train.item_price)
log_price.hist(ax=ax_log)
ax_log.set_xlabel('Item Price')
ax_log.set_title('log1p Transformation')

# NOTE: this overwrites item_price with its log1p value, so running the cell a
# second time transforms the column again. Restart and run all before trusting
# the column's scale.
train.loc[:, 'item_price'] = log_price

In [None]:
# Distribution of the sales records over month index, shop and item.
# Each panel histograms the column itself so that the x axis really is the
# quantity named in its label (a histogram of value_counts() would instead
# show how many ids share a given record count).
fig = plt.figure(figsize=(20, 5))
panels = [
    ('date_block_num', 'Date Block'),
    ('shop_id', 'shop_id'),
    ('item_id', 'item_id'),
]
for position, (column, label) in enumerate(panels, start=1):
    # Same 2x2 grid slots as before: 221, 222, 223 (the fourth stays empty).
    ax = fig.add_subplot(2, 2, position)
    train[column].hist(ax=ax)
    ax.set_xlabel(label)
    ax.set_title('Original Distiribution')



# Map Item Categories to Coarse Groups

In [None]:
# Coarse group for each run of row positions in item_category, written as
# (first position, end position exclusive, group label). Positions outside
# every run (0, 8, 9, 25 and 79 onwards) keep their original category name.
category_groups = (
    (1, 8, 'Access'),
    (10, 18, 'Consoles'),
    (18, 25, 'Console Games'),
    (26, 28, 'Mobile games'),
    (28, 32, 'CD games'),
    (32, 37, 'Card'),
    (37, 43, 'Movie'),
    (43, 55, 'Books'),
    (55, 61, 'Music'),
    (61, 73, 'Gifts'),
    (73, 79, 'Soft'),
)

l = list(item_category.item_category_name)
l_cat = l  # same list object as `l`, not a copy

for first, end, group_label in category_groups:
    for ind in range(first, end):
        l_cat[ind] = group_label

item_category['cats'] = l_cat
item_category.head()

# Convert the Date Column from Object (String) to Datetime

In [None]:
# Parse the day.month.year strings in the date column into datetime values.
train['date'] = pd.to_datetime(
    train['date'],
    format="%d.%m.%Y",
)
train.head()

In [None]:
# Pivot table: rearrange and aggregate the sales data into one row per
# (shop_id, item_id) and one column per date_block_num, holding the summed
# item_cnt_day; combinations with no sales are filled with 0.0.
p_df = (
    train
    .pivot_table(
        values='item_cnt_day',
        index=['shop_id', 'item_id'],
        columns='date_block_num',
        aggfunc='sum',
    )
    .fillna(0.0)
)
p_df.head(50)

In [None]:
## Rebuild the index: turn (shop_id, item_id) back into ordinary columns and
## convert both ids from int to str, so they can be joined with the string ids
## used for the category lookup here and for the test set later.
from sklearn import preprocessing

train_cleaned_df = p_df.reset_index()
# Go through int first so the key is '59' rather than '59.0' if an id column
# was upcast to float upstream (e.g. by NaNs introduced in a merge); the test
# ids are converted with a plain astype('str') and have to match exactly.
train_cleaned_df['shop_id'] = train_cleaned_df.shop_id.astype(int).astype('str')
train_cleaned_df['item_id'] = train_cleaned_df.item_id.astype(int).astype('str')

item_to_cat_df = items.merge(item_category[['item_category_id','cats']], how="inner", on="item_category_id")[['item_id','cats']]
# Single-label assignment: `df[['col']] = <1-D values>` treats the right-hand
# side as one value per listed column and is rejected by recent pandas.
item_to_cat_df['item_id'] = item_to_cat_df.item_id.astype('str')

train_cleaned_df = train_cleaned_df.merge(item_to_cat_df, how="inner", on="item_id")

# Encode Categories: replace the group label with an integer code.
number = preprocessing.LabelEncoder()
train_cleaned_df['cats'] = number.fit_transform(train_cleaned_df.cats)
# There are 34 months of data in total, hence list(range(34)) for the month columns.
train_cleaned_df = train_cleaned_df[['shop_id', 'item_id', 'cats'] + list(range(34))]
train_cleaned_df.head()

# Model Building

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Booster parameters only. The number of boosting rounds is an argument of
# xgb.train(), not a booster parameter: left inside this dict it is ignored
# and training silently falls back to the default of 10 rounds.
param = {'max_depth':10,
         'subsample':1,
         'min_child_weight':0.5,
         'eta':0.3,
         'seed':1,
         'silent':0,
         'eval_metric':'rmse'}
num_round = 1000

# Column 33 (the last month) is the target; every other column is a feature.
is_target = train_cleaned_df.columns == 33
train_features = train_cleaned_df.iloc[:, ~is_target].values
train_target = train_cleaned_df.iloc[:, is_target].values

progress = dict()  # filled by xgb.train with the per-round evaluation history
xgbtrain = xgb.DMatrix(train_features, train_target)
watchlist  = [(xgbtrain,'train-rmse')]

bst = xgb.train(
    param,
    xgbtrain,
    num_boost_round=num_round,
    evals=watchlist,
    evals_result=progress,
    verbose_eval=50,  # log every 50th round instead of all of them
)

# NOTE: there is no held-out set, so this RMSE is the error on the training
# rows themselves and says little about generalisation.
preds = bst.predict(xgb.DMatrix(train_features))
rmse = np.sqrt(mean_squared_error(train_target, preds))
print(rmse)

In [None]:
# Plot the feature importance of the trained booster.
# NOTE(review): the training DMatrix was built from a bare `.values` array, so
# features are presumably labelled by position rather than by column name.
xgb.plot_importance(bst)

In [None]:
# Build the frame to predict on: the test rows with string ids (matching the
# string ids in train_cleaned_df) joined to each pair's category and monthly
# counts. assign() returns a new frame, so `test` itself is left unmodified;
# a plain `apply_df = test` would only create an alias and the casts would
# silently change `test` as well.
apply_df = test.assign(
    shop_id=test.shop_id.astype('str'),
    item_id=test.item_id.astype('str'),
)

# Left join keeps every test row; pairs never seen in training get 0.0 for
# every joined column (including `cats`).
apply_df = apply_df.merge(train_cleaned_df, how = "left", on = ["shop_id", "item_id"]).fillna(0.0)
apply_df.head()

In [None]:
# Move to one month front: relabel every month column m as m - 1, so the most
# recent month takes the place of the second most recent one, and so on.
# Columns from position 4 onward are the month columns.
# NOTE(review): this assumes the first four columns are ID, shop_id, item_id
# and cats -- confirm against the test table's columns.
#
# The guard keeps the cell idempotent: once shifted, a column labelled -1
# exists, and re-running must not shift the labels a second time.
if -1 not in apply_df.columns:
    d = {month: month - 1 for month in apply_df.columns[4:]}
    apply_df = apply_df.rename(d, axis = 1)

print('========')
# Printed explicitly: only the last expression of a cell is rendered.
print(train.count())
apply_df.count()

In [None]:
# Predict for the test rows using every column except the row ID and the
# column labelled -1.
feature_mask = (apply_df.columns != 'ID') & (apply_df.columns != -1)
apply_matrix = xgb.DMatrix(apply_df.iloc[:, feature_mask].values)
preds = bst.predict(apply_matrix)

In [None]:
# Normalize prediction to [0-20]: values below 0 become 0, values above 20
# become 20. np.clip does this in one vectorised call on the whole array.
preds = np.clip(preds, 0, 20)
sub_df = pd.DataFrame({'ID':apply_df.ID, 'item_cnt_month': preds, 'shop_id': apply_df.shop_id, 'item_id': apply_df.item_id })
sub_df.describe()

In [None]:
# Write the predictions to CSV without the DataFrame index.
# NOTE(review): sub_df also carries shop_id and item_id next to ID and
# item_cnt_month -- confirm the consumer of this file accepts extra columns.
sub_df.to_csv('Submission_Predict Sales.csv',index=False)