In [None]:
import seaborn as sns
%matplotlib inline
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
%matplotlib inline 

pd.set_option('display.max_rows', 80)
pd.set_option('display.max_columns', 50)

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from tqdm import tqdm_notebook
from catboost import CatBoostRegressor

from itertools import product
import sklearn
import scipy.sparse 
import lightgbm 

import warnings
warnings.filterwarnings("ignore")

# EDA

Import data

In [None]:
# Directory holding the competition CSV files; change it here to run outside Kaggle.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

items = pd.read_csv(DATA_DIR + '/items.csv')
shops = pd.read_csv(DATA_DIR + '/shops.csv')
item_cats = pd.read_csv(DATA_DIR + '/item_categories.csv')
sales = pd.read_csv(DATA_DIR + '/sales_train.csv')

# The test rows get date_block_num 34, the block this notebook later selects as the
# prediction month when it builds X_test.
test = pd.read_csv(DATA_DIR + '/test.csv', index_col=['ID'])
test['date_block_num'] = 34
# ignore_index=True: sales and test both carry their own row labels, so a plain concat
# would leave duplicate index labels in all_data. Nothing downstream uses this index.
all_data = pd.concat([sales, test], axis=0, ignore_index=True)

Let's check the target and the price for outliers

In [None]:
plt.figure(figsize=(10,4))
plt.xlim(-100, 3000)
sns.boxplot(x=sales.item_cnt_day)

plt.figure(figsize=(10,4))
plt.xlim(sales.item_price.min(), sales.item_price.max()*1.1)
sns.boxplot(x=sales.item_price);

We should remove the outliers from the training data

In [None]:
sales = sales[(sales.item_cnt_day < 2000) & (sales.item_price < 300000)]

# all_data was concatenated from sales before this filter, and all_data (not sales) is what
# make_lag receives, so the same outliers have to be dropped there as well.
# Rows that came from test have no item_cnt_day / item_price (NaN after the concat);
# NaN >= threshold evaluates to False, so those rows are kept.
is_outlier = (all_data.item_cnt_day >= 2000) | (all_data.item_price >= 300000)
all_data = all_data[~is_outlier]

There are many rows where the number of items sold is less than zero. These are item returns. We will clip such values to zero (this happens later, when the monthly target is clipped to [0, 20]).

In [None]:
sales[sales.item_cnt_day < 0]

A price below zero, however, is not normal. We will replace it with the median price.

In [None]:
sales[sales.item_price < 0]

In [None]:
price_correction = all_data[(all_data['shop_id'] == 32) & (all_data['item_id'] == 2973) & (all_data['date_block_num'] == 4) & (all_data['item_price'] > 0)].item_price.median()
all_data.loc[all_data['item_price'] < 0, 'item_price'] = price_correction

Check the shop names. There are several shops with different IDs but the same name.

In [None]:
shops

In [None]:
# Якутск Орджоникидзе, 56
all_data.loc[all_data.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
all_data.loc[all_data.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
all_data.loc[all_data.shop_id == 10, 'shop_id'] = 11

# Data prepare

Make time lags for shops and items

In [None]:
def make_lag(sales, items, shift_range=(1, 2, 3, 4, 5, 12)):
    """Build the monthly shop/item grid with aggregated targets and their lags.

    For every ``date_block_num`` present in ``sales`` one row is created per
    (shop, item) combination of the shops and items seen in that month. Three
    monthly sums of ``item_cnt_day`` are attached: ``target`` (per shop-item),
    ``target_shop`` (per shop) and ``target_item`` (per item). For every shift in
    ``shift_range`` the columns ``<name>_lag_<shift>`` are added, holding the value
    of ``<name>`` from ``shift`` months earlier (0 when no such row exists).

    Parameters
    ----------
    sales : pandas.DataFrame
        Must contain ``shop_id``, ``item_id``, ``date_block_num`` and
        ``item_cnt_day``.
    items : pandas.DataFrame
        Must contain ``item_id`` and ``item_category_id``.
    shift_range : iterable of int, optional
        Month offsets to create lag features for.

    Returns
    -------
    pandas.DataFrame
        Rows with ``date_block_num >= 12`` only, with the index columns, the three
        targets, their lags and ``item_category_id``; 64-bit numeric columns are
        downcast to 32 bit via ``downcast_dtypes``.
    """
    index_cols = ['shop_id', 'item_id', 'date_block_num']

    # For every month, the cartesian product of that month's shops and items.
    grid = []
    for block_num in sales['date_block_num'].unique():
        in_block = sales['date_block_num'] == block_num
        cur_shops = sales.loc[in_block, 'shop_id'].unique()
        cur_items = sales.loc[in_block, 'item_id'].unique()
        grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

    all_data = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)
    del grid

    # Monthly aggregates joined onto the grid. Selecting the column and renaming the
    # result replaces the nested-dict form agg({'col': {'name': 'sum'}}), which pandas
    # 1.0 and later reject with SpecificationError.
    aggregates = [
        (index_cols, 'target'),
        (['shop_id', 'date_block_num'], 'target_shop'),
        (['item_id', 'date_block_num'], 'target_item'),
    ]
    for keys, name in aggregates:
        monthly = (
            sales.groupby(keys, as_index=False)['item_cnt_day']
            .sum()
            .rename(columns={'item_cnt_day': name})
        )
        # Grid rows without any sales record get 0.
        all_data = pd.merge(all_data, monthly, how='left', on=keys).fillna(0)
        del monthly

    # Downcast dtypes from 64 to 32 bit to save memory
    all_data = downcast_dtypes(all_data)
    gc.collect()

    # Columns to lag: everything except the index columns (the three targets).
    cols_to_lag = list(all_data.columns.difference(index_cols))

    for month_shift in tqdm_notebook(shift_range):
        shifted = all_data[index_cols + cols_to_lag].copy()
        # Moving a row forward by month_shift months makes its values line up with the
        # later month, where they serve as "value month_shift months ago".
        shifted['date_block_num'] = shifted['date_block_num'] + month_shift
        shifted = shifted.rename(
            columns={col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_lag}
        )
        all_data = pd.merge(all_data, shifted, on=index_cols, how='left').fillna(0)
        del shifted

    # Don't use old data from year 2013
    all_data = all_data[all_data['date_block_num'] >= 12]

    # Category for each item
    item_category_mapping = items[['item_id', 'item_category_id']].drop_duplicates()

    all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
    all_data = downcast_dtypes(all_data)
    gc.collect()
    return all_data

def downcast_dtypes(df):
    '''
        Narrow 64-bit numeric columns of `df` to 32 bit, in place.

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left alone. The same dataframe
        object that was passed in is returned.
    '''
    # Decide on the conversions first, then apply them column by column.
    conversions = {}
    for name in df:
        if df[name].dtype == "float64":
            conversions[name] = np.float32
        elif df[name].dtype == "int64":
            conversions[name] = np.int32

    for name, narrow_type in conversions.items():
        df[name] = df[name].astype(narrow_type)

    return df

In [None]:
X = make_lag(all_data, items)

In [None]:
X.tail()

Add a flag indicating that this item has never been sold in this shop

In [None]:
# transform('size') gives every row the number of rows in X that share its
# (item_id, shop_id) pair; the flag is 1 when the pair occurs in at most one row.
# NOTE(review): this counts rows of X (including rows whose target is 0), not actual
# sales, so it is a proxy for "never sold" rather than an exact check - confirm intent.
X['neversold'] = X.groupby(['item_id', 'shop_id'])['target'].transform('size').map(lambda x: 1 if x<=1 else 0)

Now I want to add several features derived from the shop names: the city, a flag indicating that the city is in the Moscow region, and a flag indicating that the shop is a remote (online or travelling) shop.

In [None]:
shops['city'] = shops.shop_name.map(lambda x: x.split()[0])
shops['city'] = shops['city'].replace('!Якутск', 'Якутск')
shops['Moscow'] = shops.city.map(lambda x: 1 if x == 'Москва' else 0)
shops['MO'] = shops.city.map(lambda x: 1 if x in ['Чехов', 'Химки', 'Сергиев', 'Мытищи', 'Коломна', 'Жуковский', 'Балашиха'] else 0)
shops['distshop'] = shops.city.map(lambda x: 1 if x in ['Выездная', 'Интернет-магазин'] else 0)
shops['city'] = shops.city.map(lambda x: 'unknown' if x in ['Выездная', 'Интернет-магазин'] else x)
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops.drop(['shop_name', 'city'], axis=1)

Then we will create features from the category information: subcategory 1 and subcategory 2. Many words are shared between these category names.

In [None]:
item_cats['subcat1'] = item_cats['item_category_name'].map(lambda x: x.split(' - ')[0])
item_cats['subcat1'] = LabelEncoder().fit_transform(item_cats['subcat1'])
item_cats['subcat2'] = item_cats['item_category_name'].map(lambda x: x.split(' - ')[-1])
item_cats['subcat2'] = LabelEncoder().fit_transform(item_cats['subcat2'])
item_cats = item_cats.drop('item_category_name', axis = 1)

In [None]:
X = pd.merge(X, item_cats, on=['item_category_id'], how='left')
X = pd.merge(X, shops, on=['shop_id'], how='left')

Add mean-encoding features

In [None]:
# Expanding-mean encoding of the target per item: for each row, the mean target of the
# rows of the same item that come earlier in X (the current row itself is excluded).
target_by_item = X.groupby('item_id')['target']
cumsum1 = target_by_item.cumsum() - X['target']
cumcnt1 = target_by_item.cumcount()

# The first row of every item has no history (0 / 0 -> NaN) and gets a constant instead.
# NOTE(review): 0.3343 is presumably the global mean of the target - confirm.
X['mean_enc_item'] = (cumsum1 / cumcnt1).fillna(0.3343)

del target_by_item, cumsum1, cumcnt1
# A bare `gc;` only evaluates the module name; collect() is what actually frees memory.
gc.collect();

Add several simple features

In [None]:
# Month of the year as 0..11, derived from the running month counter.
X['month'] = X['date_block_num'] % 12
# Number of days per month, looked up by the 0..11 position; index 1 is fixed at 28,
# so leap years are not taken into account.
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
X['days'] = X['month'].map(days).astype(np.int8)


We don't know the prices of the items in the test data, so let's use the median price of each item. For items that are not represented in the training data we will use the mean price of their category.

In [None]:
# Median price per item, computed from the rows before block 34 only.
item_price = all_data[all_data.date_block_num < 34].groupby('item_id')['item_price'].median()
# Left merge: items without any price get NaN in item_price.
X = pd.merge(X, item_price, how='left', on='item_id')
# Fill the missing prices with the mean item_price over the rows of X in the same
# category; a category with no known price at all stays NaN.
X['item_price'] = X.groupby("item_category_id")['item_price'].transform(lambda x: x.fillna(x.mean()))

But there is one category about which we know nothing: category ID 0. Let's explore it.

In [None]:
X[X.item_price.isnull()].item_category_id.value_counts()

In [None]:
item_cats[item_cats.item_category_id == 0]

There is no comparable category, but we can find all the headphone items and use their price

In [None]:
# Items whose (lower-cased) name contains 'наушники'; na=False treats missing names as
# non-matches instead of failing.
nau_id = items.loc[items.item_name.str.lower().str.contains('наушники', na=False), 'item_id']

# Median of the known prices of those items.
headphone_price = X.loc[X.item_price.notnull() & X.item_id.isin(nau_id), 'item_price'].median()

# Fill item_price only: X.fillna(value) would write the price into every NaN cell of
# every column of X.
X['item_price'] = X['item_price'].fillna(headphone_price)

Binary features for prices

In [None]:
X['low_low_price'] = (X['item_price'] < 100).astype('int8')
X['low_price'] = ((X['item_price'] >= 100)& (X['item_price'] < 300)).astype('int8')
X['medium_price'] = ((X['item_price'] >= 300)& (X['item_price'] < 500)).astype('int8')
X['high_price'] = ((X['item_price'] >= 500)& (X['item_price'] < 850)).astype('int8')
X['very_high_price'] = (X['item_price'] >= 850).astype('int8')

In [None]:
X.head()

# Models

## Model 1 - lightgbm

In [None]:
def train_test_modern(all_data):
    """Split the feature table by month into train, validation and test parts.

    Months 12..32 form the training part, month 33 the validation part and
    month 34 the test part. Every returned object is indexed by
    (shop_id, item_id, date_block_num). Targets are the ``target`` column
    clipped to [0, 20]; feature frames have ``target``, ``target_shop`` and
    ``target_item`` removed.

    Returns
    -------
    X_train, X_valid, y_train, y_valid, X_test
    """
    key_cols = ['shop_id', 'item_id', 'date_block_num']

    month = all_data.date_block_num
    is_train = (month >= 12) & (month < 33)
    is_valid = month == 33
    is_test = month == 34

    def _target(mask):
        # Clipped target for the selected rows.
        return all_data[mask].set_index(key_cols)['target'].clip(0, 20)

    y_train = _target(is_train)
    y_valid = _target(is_valid)

    features = all_data.drop(['target', 'target_shop', 'target_item'], axis=1)

    def _features(mask):
        # Independent copy of the selected feature rows.
        return features[mask].set_index(key_cols).copy()

    return _features(is_train), _features(is_valid), y_train, y_valid, _features(is_test)

In [None]:
def lgb_train(X_train, X_test, y_train, y_test, categorical_features, params,
              num_boost_round=4000, early_stopping_rounds=400, log_period=200):
    """Train a LightGBM booster with early stopping and predict on the hold-out set.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : hold-out features and target; used as the only validation set
        for early stopping and as the input of the returned predictions.
    categorical_features : column names LightGBM should treat as categorical.
    params : dict of LightGBM training parameters.
    num_boost_round : maximum number of boosting iterations.
    early_stopping_rounds : stop when the validation metric has not improved for this
        many iterations.
    log_period : print the validation metric every this many iterations.

    Returns
    -------
    (model, pred_test) : the trained booster and its predictions for ``X_test`` at
        the best iteration.
    """
    lgtrain = lightgbm.Dataset(X_train, y_train, categorical_feature=categorical_features)
    lgvalid = lightgbm.Dataset(X_test, y_test, categorical_feature=categorical_features)

    # lightgbm.train() lost its early_stopping_rounds= / verbose_eval= keywords in 4.0;
    # callbacks work on old and new releases. log_evaluation was called
    # print_evaluation before LightGBM 3.3, hence the fallback.
    log_evaluation = getattr(lightgbm, 'log_evaluation', None) or lightgbm.print_evaluation
    model = lightgbm.train(
        params,
        lgtrain,
        num_boost_round,
        valid_sets=[lgvalid],
        callbacks=[lightgbm.early_stopping(early_stopping_rounds), log_evaluation(log_period)],
    )
    pred_test = model.predict(X_test, num_iteration=model.best_iteration)
    return model, pred_test

In [None]:
categorical_features = ['item_category_id', 'month', 'subcat1', 'subcat2', 'city_code']

In [None]:
X_train, X_valid, y_train, y_valid, X_test = train_test_modern(X)

In [None]:
params = {
    "objective" : "regression",
    'max_depth' : 8,
    "metric" : "rmse", 
    "num_threads" : -1,
    "learning_rate" : 0.05,
    'bagging_fraction': 0.8,
    # LightGBM applies bagging_fraction only when bagging_freq is non-zero;
    # 1 re-samples the rows at every iteration.
    'bagging_freq': 1,
    "verbosity" : -1
    }

In [None]:
model, pred_valid_lgb = lgb_train(X_train, X_valid, y_train, y_valid, categorical_features, params)

In [None]:
np.sqrt(mean_squared_error(y_valid, pred_valid_lgb))

In [None]:
feature_importance = pd.DataFrame({'feature': X_train.columns, 'importance':model.feature_importance()}).sort_values('importance', ascending=False)[:100]

plt.figure(figsize=(10,12))
sns.barplot(x=feature_importance.importance, y=feature_importance.feature);

In [None]:
pred_test_lgb = model.predict(X_test, num_iteration=model.best_iteration )

## Model 2 - Catboost

In [None]:
def cat_train(X_train, X_test, y_train, y_test, categorical_features, params):
    """Fit a CatBoostRegressor and predict on the hold-out set.

    The regressor is built from ``params`` and fitted on ``X_train`` with the
    integer-cast ``y_train``; ``(X_test, integer-cast y_test)`` is passed as the
    evaluation set and the best model on it is kept. Progress is logged every
    200 iterations.

    Returns
    -------
    (model, pred) : the fitted regressor and its predictions for ``X_test``.
    """
    regressor = CatBoostRegressor(**params)

    fit_options = dict(
        eval_set=(X_test, y_test.astype(int)),
        cat_features=categorical_features,
        use_best_model=True,
        verbose=200,
    )
    regressor.fit(X_train, y_train.astype(int), **fit_options)

    return regressor, regressor.predict(X_test)

In [None]:
params_cat =  {'iterations': 1000,
    'random_seed': 63,
    'learning_rate': 0.05,
    'eval_metric': 'RMSE',
    'bagging_temperature': 0.2,
    'early_stopping_rounds': 200,
    'leaf_estimation_method': 'Newton'}

In [None]:
model_cat, pred_valid_cat = cat_train(X_train, X_valid, y_train, y_valid, categorical_features, params_cat)

In [None]:
np.sqrt(mean_squared_error(y_valid, pred_valid_cat))

In [None]:
pred_test_cat = model_cat.predict(X_test)

## Model 3 - Linear Regression

In [None]:
columns_to_scale = ['target_lag_1', 'target_item_lag_1', 'target_shop_lag_1',
       'target_lag_2', 'target_item_lag_2', 'target_shop_lag_2',
       'target_lag_3', 'target_item_lag_3', 'target_shop_lag_3',
       'target_lag_4', 'target_item_lag_4', 'target_shop_lag_4',
       'target_lag_5', 'target_item_lag_5', 'target_shop_lag_5',
       'target_lag_12', 'target_item_lag_12', 'target_shop_lag_12', 'mean_enc_item', 'item_price']

In [None]:
scaler = StandardScaler()
X_train_lr = scaler.fit_transform(X_train[columns_to_scale])
X_valid_lr = scaler.transform(X_valid[columns_to_scale])
X_test_lr = scaler.transform(X_test[columns_to_scale])

In [None]:
lr = LinearRegression()
lr.fit(X_train_lr, y_train)
pred_valid_lr = lr.predict(X_valid_lr)

In [None]:
np.sqrt(mean_squared_error(y_valid, pred_valid_lr))

In [None]:
pred_test_lr = lr.predict(X_test_lr)

## Model 4 - KNN Regressor

In [None]:
knn_model = KNeighborsRegressor(n_neighbors=9, leaf_size=13, n_jobs=-1)
knn_model.fit(X_train_lr[-30000:], y_train[-30000:])
pred_valid_knn = knn_model.predict(X_valid_lr)

In [None]:
np.sqrt(mean_squared_error(y_valid, pred_valid_knn))

In [None]:
pred_test_knn = knn_model.predict(X_test_lr)

## Ensembling

Now we create a table from the predictions of all the models

In [None]:
train_new = pd.DataFrame(index=X_valid.index)
train_new['lgb'] = pred_valid_lgb
train_new['catboost'] = pred_valid_cat
train_new['lr'] = pred_valid_lr
train_new['knn'] = pred_valid_knn

Fit meta model

In [None]:
lr_meta = LinearRegression()
lr_meta.fit(train_new, y_valid)

In [None]:
test_new = pd.DataFrame(index=X_test.index)
test_new['lgb'] = pred_test_lgb
test_new['catboost'] = pred_test_cat
test_new['lr'] = pred_test_lr
test_new['knn'] = pred_test_knn

Make final prediction

In [None]:
# The targets used throughout this notebook are clipped to [0, 20], while the linear
# meta-model is unbounded; bring its predictions back into the same range.
result = np.clip(lr_meta.predict(test_new), 0, 20)

Create submission

In [None]:
def submis_write(res, filename='sub.csv'):
    """Write predictions to a CSV file with the columns ID and item_cnt_month.

    The IDs are taken from the index of the notebook-level ``test`` frame, so
    ``res`` has to hold one value per row of ``test``, in the same order.
    """
    columns = {'ID': test.index, 'item_cnt_month': res}
    pd.DataFrame(columns).to_csv(filename, index=False)

In [None]:
submis_write(result)