In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import gc

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# load data
basicPath = r'../input/competitive-data-science-predict-future-sales/'
outputPath = ''

# sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.
#     date           - sale date, formatted dd.MM.yyyy
#     date_block_num - a consecutive month number, used for convenience.
#                      January 2013 is 0, February 2013 is 1,..., October 2015 is 33
#     shop_id
#     item_id
#     item_price     - price column (non-positive values are filtered out later in this notebook)
#     item_cnt_day   - number of products sold. You are predicting a monthly amount of this measure
df_train = pd.read_csv(os.path.join(basicPath, 'sales_train.csv'))
# The dates are day-first (dd.MM.yyyy). Without an explicit format pandas guesses,
# and an ambiguous value such as 02.01.2013 is read month-first (1 February
# instead of 2 January) or, on newer pandas versions, the parse may fail once a
# day > 12 is met. Spelling the format out makes the parse exact and faster.
df_train['date'] = pd.to_datetime(df_train['date'], format='%d.%m.%Y')

# test.csv - the test set. You need to forecast the sales for these shops and products for November 2015 (monthly!)
df_pred = pd.read_csv(os.path.join(basicPath, 'test.csv'))

# items.csv - supplemental information about the items/products.
df_items = pd.read_csv(os.path.join(basicPath, 'items.csv'))
# item_categories.csv - supplemental information about the items categories.
df_categ = pd.read_csv(os.path.join(basicPath, 'item_categories.csv'))
# shops.csv - supplemental information about the shops.
df_shops = pd.read_csv(os.path.join(basicPath, 'shops.csv'))

In [None]:
# Quick look at the raw tables: summary statistics and column dtypes.
for summary in (
    df_train.describe(),
    df_items.describe(),
    df_train.dtypes,
    df_pred.describe(),
):
    print(summary)

Notes:
1. Some rows have an invalid item_price (<= 0).
2. Some rows have an invalid item_cnt_day (<= 0).
3. The prediction must be monthly, but the input data is daily.
4. The prediction data contains only shop_id and item_id (no price), so price cannot be used as a training feature.
5. We can look up additional data by shop_id and item_id.

In [None]:
# Keep only rows whose price and daily count are both strictly positive.
has_valid_price = df_train['item_price'] > 0
has_valid_count = df_train['item_cnt_day'] > 0
df_train = df_train[has_valid_price & has_valid_count]

In [None]:
# validation: show the rows whose key columns repeat an earlier row
for frame, key_columns in (
    (df_items, ['item_id']),
    (df_categ, ['item_category_id']),
    (df_shops, ['shop_id']),
    (df_train, ['date', 'shop_id', 'item_id']),
):
    duplicates = frame[frame.duplicated(key_columns)]
    print(duplicates)

# delete duplicates: keep the last row of every (date, shop_id, item_id) key,
# then confirm that nothing is left
sale_key = ['date', 'shop_id', 'item_id']
df_train = df_train.drop_duplicates(subset=sale_key, keep='last')
duplicates = df_train[df_train.duplicated(sale_key)]
print(duplicates)

In [None]:
# Re-inspect the training data after cleaning.
for summary in (df_train.describe(), df_train.dtypes):
    print(summary)

In [None]:
# Scatter every sale's item_price against its date to spot outliers.
var = 'item_price'
data = df_train[['date', var]]
date_limits = (df_train['date'].min(), df_train['date'].max())
data.plot.scatter(x=var, y='date', ylim=date_limits);

In [None]:
# Drop sales priced at or above the cut-off, then redraw the price scatter.
maxPriceValue = 100000
df_train = df_train.loc[df_train['item_price'].lt(maxPriceValue)]

var = 'item_price'
data = df_train[['date', var]]
date_limits = (df_train['date'].min(), df_train['date'].max())
data.plot.scatter(x=var, y='date', ylim=date_limits);

In [None]:
# Scatter every sale's daily count against its date to spot outliers.
var = 'item_cnt_day'
data = df_train[['date', var]]
date_limits = (df_train['date'].min(), df_train['date'].max())
data.plot.scatter(x=var, y='date', ylim=date_limits);

In [None]:
# Drop rows whose daily count is at or above the cut-off, then redraw the scatter.
maxCountValue = 1000
df_train = df_train.loc[df_train['item_cnt_day'].lt(maxCountValue)]

var = 'item_cnt_day'
data = df_train[['date', var]]
date_limits = (df_train['date'].min(), df_train['date'].max())
data.plot.scatter(x=var, y='date', ylim=date_limits);

In [None]:
# Number of sales rows recorded for each shop.
rows_per_shop = df_train['shop_id'].value_counts()
print(rows_per_shop)

In [None]:
# fix shop id data
def fixShopId(df):
    """Return a copy of ``df`` whose ``shop_id`` values are merged into canonical ids.

    The mapping below is the fully resolved form of the original step-by-step
    rewrites (for example 0 -> 57 -> 103 is stored directly as 0 -> 103), so a
    single pass produces the same final ids. Ids that are not listed are left
    untouched.

    The input frame is NOT modified. Earlier versions wrote into ``df`` in
    place, which changed the caller's data as a side effect and triggered
    pandas' SettingWithCopyWarning when ``df`` was a filtered slice. Callers
    must use the return value.

    Parameters
    ----------
    df : pandas.DataFrame
        Any frame that has a ``shop_id`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``df`` except for the remapped ``shop_id``.
    """
    shop_id_map = {
        # pairwise merges
        10: 11,
        40: 39,
        24: 23,
        # synthetic per-city ids (101..105)
        36: 101, 37: 101,                    # --> Novosibirsk
        6: 102, 7: 102, 8: 102,              # --> Voronezh
        0: 103, 1: 103, 57: 103, 58: 103,    # --> Yakutsk (0/1 used to pass through 57/58)
        34: 104, 35: 104,                    # --> Nizhny Novgorod
        17: 105, 18: 105,                    # --> Krasnoyarsk
    }

    fixed = df.copy()
    # No target id is also a source id, so one replace pass is unambiguous.
    fixed['shop_id'] = fixed['shop_id'].replace(shop_id_map)
    return fixed

# Apply the same shop-id remapping to the sales rows and to the shop lookup table.
df_train, df_shops = fixShopId(df_train), fixShopId(df_shops)

# Sales rows per shop after the remapping.
print(df_train['shop_id'].value_counts())

In [None]:
# try to extract city name from shop name
# Vectorised .str methods replace the former row-wise apply(): same result for
# string names, no Python-level loop, and missing names propagate as NaN instead
# of raising inside the lambda.
df_shops['shop_name'] = df_shops['shop_name'].str.strip('!')
# the city is taken to be the first whitespace-separated token of the name
df_shops['city'] = df_shops['shop_name'].str.split().str[0]

# fold the two non-city first tokens into one pseudo-city
df_shops['city'] = df_shops['city'].replace({
    'Интернет-магазин': 'Интернет',
    'Цифровой': 'Интернет',
})

cityList = sorted(set(df_shops['city']))
print(cityList)

# remove duplicates: shops that now share an id and a city collapse to one row
duplicates = df_shops[df_shops.duplicated(['shop_id', 'city'])]
print(duplicates)
df_shops = df_shops.drop_duplicates(subset=['shop_id', 'city'], keep='last')

# addLookup() joins on shop_id; a repeated id here (the same id with two
# different cities) would silently duplicate sales rows in that join.
assert df_shops['shop_id'].is_unique, 'shop_id maps to more than one city'
print(df_shops)

In [None]:
def addLookup(df):
    """Swap the ``item_id`` / ``shop_id`` keys of ``df`` for looked-up features.

    Uses the notebook-level ``df_items`` and ``df_shops`` tables:

    * ``df_items`` is joined on ``item_id``; its ``item_name`` column is dropped
      and the remaining columns (``item_category_id``) are kept.
    * ``df_shops`` is joined on ``shop_id``; its ``shop_name`` column is dropped
      and the remaining columns (``city``) are kept.

    ``city`` and ``item_category_id`` are converted to the pandas ``category``
    dtype, and the two key columns are removed from the result. The input frame
    itself is left unchanged; a new frame is returned.
    """
    items_by_id = df_items.set_index('item_id')
    shops_by_id = df_shops.set_index('shop_id')

    enriched = (
        df.join(items_by_id, on='item_id')
          .drop(columns=['item_name'])
          .join(shops_by_id, on='shop_id')
          .drop(columns=['shop_name'])
    )

    for column in ('city', 'item_category_id'):
        enriched[column] = enriched[column].astype("category")

    # the raw keys are no longer needed once the lookups are attached
    return enriched.drop(columns=['shop_id', 'item_id'])

# Attach category and city to every sale, then preview the first 20 rows.
df_train = addLookup(df_train)
preview = df_train.head(20)
print(preview)

In [None]:
# Price and date are not used from here on; drop both in one step.
df_train = df_train.drop(columns=['item_price', 'date'])

print(df_train.head(20))
print(df_train.columns)

In [None]:
# Sum the daily counts into one total per (month block, category, city).
group_keys = ['date_block_num', 'item_category_id', 'city']
monthly_totals = df_train.groupby(by=group_keys).agg({'item_cnt_day': 'sum'})
XGrouped = monthly_totals.reset_index().rename(columns={"item_cnt_day": "item_cnt_month"})
print(XGrouped.head(10))

In [None]:
# Categories with more than 6000 units in at least one aggregated row, followed
# by the monthly sales curves of three hand-picked categories.
top_rows = XGrouped[XGrouped['item_cnt_month'] > 6000]
maxSell = set(top_rows['item_category_id'])
print(maxSell)

for category_id in (40, 30, 19):
    in_category = XGrouped['item_category_id'] == category_id
    data = XGrouped.loc[in_category, ['item_cnt_month', 'date_block_num']]
    data.plot(x='date_block_num', y='item_cnt_month')

In [None]:
# date_block_num is a sequential month counter (0...33), which is not useful for ML as-is;
# it is better to convert it to the month of the year (1..12)

def convertMonth(df):
    """Replace ``date_block_num`` with a categorical calendar month ``month_num``.

    ``month_num`` is ``(date_block_num % 12) + 1``, i.e. a value in 1..12, stored
    with the pandas ``category`` dtype. The ``date_block_num`` column is removed
    from the result and ``month_num`` is the last column.

    The arithmetic is vectorised on the column. The previous row-wise
    ``apply(axis=1)`` ran a Python lambda for every row and did not cope with an
    empty frame. The input frame is not modified; callers must use the return
    value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an integer ``date_block_num`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``date_block_num`` and with ``month_num`` added.
    """
    month_num = (df['date_block_num'] % 12) + 1
    return (
        df.drop(columns=['date_block_num'])
          .assign(month_num=month_num.astype("category"))
    )

# Swap the running month counter for the calendar-month category and preview.
XGrouped = convertMonth(XGrouped)
preview = XGrouped.head(10)
print(preview)

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
XGrouped = pd.get_dummies(data=XGrouped, drop_first=True)
preview = XGrouped.head(10)
print(preview)

In [None]:
# Target: the monthly count (kept as a one-column frame).
# Features: every other column.
target_column = 'item_cnt_month'
y = XGrouped[[target_column]]
X = XGrouped.drop(columns=[target_column])

print(X.head())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, shuffled with a fixed seed so the split is repeatable.
split_options = dict(test_size=0.20, random_state=44, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print('Shape:', X_train.shape, X_test.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# model
forest_params = {
    'n_estimators': 50,
    'random_state': 0,
    'n_jobs': -1,
    'max_features': 'log2',
}
model = RandomForestRegressor(**forest_params)

# remember the training column layout on the model; the prediction cell reads it
model.feature_names = X_train.columns.tolist()

# train on the target column as a flat array
train_target = y_train['item_cnt_month'].to_numpy()
model.fit(X_train, train_target)

# score on both splits
for label, features, target in (
    ('Train score is : ', X_train, y_train),
    ('Test  score is : ', X_test, y_test),
):
    print(label, model.score(features, target))

In [None]:
# data for prediction
# NOTE(review): the model is fitted on totals aggregated per (month, category,
# city), while every row here is one (shop, item) pair of the test set - confirm
# that predicting the aggregated total for each pair is what is intended.

I_pred = df_pred['ID']
X_pred = df_pred.drop(columns=['ID'])

X_pred['date_block_num'] = 34  # November 2015
X_pred = fixShopId(X_pred)
X_pred = addLookup(X_pred)
X_pred = convertMonth(X_pred)

# Encode EVERY level that occurs in the prediction data. With drop_first=True
# pandas would drop whichever level happens to come first in this frame, which is
# not necessarily the level that was dropped (the baseline) when the training
# frame was encoded; rows of that level would then be encoded as the baseline.
X_pred = pd.get_dummies(X_pred)

# Align with the training layout in one step:
#   * columns the model knows but this frame lacks are added, filled with 0
#   * columns the model never saw (e.g. the training baseline level) are removed
#   * the column ORDER becomes exactly the order used in fit(); appending missing
#     columns at the end, as before, left the frame in a different order
X_pred = X_pred.reindex(columns=model.feature_names, fill_value=0)

# No manual month override is needed any more: every row is month 11, so
# get_dummies already produced 'month_num_11' filled with ones and the reindex
# keeps it whenever the model was trained with that column.
assert list(X_pred.columns) == list(model.feature_names)

print(len(X_train.columns))
print(len(X_pred.columns))
print(list(X_pred.columns))

In [None]:
# Predict, pair each prediction with its test-set ID and write the submission file.
y_pred = model.predict(X_pred)
output = I_pred.to_frame(name='ID').assign(item_cnt_month=y_pred)
print(output.head(5))

submission_file = os.path.join(outputPath, 'my_submission.csv')
output.to_csv(submission_file, index=False)
print(output.shape)