In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
import matplotlib.pyplot as plt

import lightgbm as lgbm
import gc
import pickle as pickle


from sklearn.model_selection import KFold
from itertools import product

In [3]:
items           = pd.read_csv('items.csv',usecols=["item_id", "item_category_id"])
item_categories = pd.read_csv('item_categories.csv')
shops           = pd.read_csv('shops.csv')
sales_train     = pd.read_csv('sales_train.csv.gz')
test            = pd.read_csv('test.csv.gz')

In [4]:
sales_train[['day','month', 'year']] = sales_train['date'].str.split('.', expand=True).astype(int)
sales_train = sales_train[sales_train['year'].isin([2013,2014]) == False]
sales_train = sales_train.set_index('item_id').join(items.set_index('item_id'))
sales_train.reset_index(inplace=True)

In [5]:
# Якутск Орджоникидзе, 56
sales_train.loc[sales_train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_train.loc[sales_train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_train.loc[sales_train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [6]:
max_sales=1000
sums = sales_train.groupby('item_id')['item_cnt_day'].sum().reset_index().rename(columns={"item_cnt_day":"item_total_sales"}).sort_values(by='item_total_sales')

ids_reject = sums[(sums['item_total_sales'] > 0) & (sums['item_total_sales'] < max_sales)]['item_id'].unique()

In [7]:
train_item_ids = sales_train['item_id'].unique()
train_item_ids = np.setdiff1d(train_item_ids, ids_reject)
train_shop_ids = sales_train['shop_id'].unique()
test_item_ids = test['item_id'].unique()
test_shop_ids = test['shop_id'].unique()
train_blocks = sales_train['date_block_num'].unique()

all_item_ids = np.unique(np.append(test_item_ids,train_item_ids))
all_shop_ids = np.unique(np.append(train_shop_ids,test_shop_ids))

In [8]:
combinations = []

for dbn in range(np.min(train_blocks), np.max(train_blocks)+1):
    sales = sales_train[sales_train.date_block_num==dbn]
    item_ids = np.intersect1d(sales.item_id.unique(), test_item_ids)
    dbn_combos = list(product(sales.shop_id.unique(), item_ids, [dbn]))
    for combo in dbn_combos:
        combinations.append(combo)
        
all_combos = pd.DataFrame(np.unique(np.vstack([combinations]), axis=0), columns=['shop_id','item_id','date_block_num'])

In [9]:
ys = sales_train.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']\
                .sum().rename(columns={"item_cnt_day":"item_cnt_block"})

training = all_combos.merge(ys, on=['shop_id', 'item_id', 'date_block_num'], how='left').fillna(0)


training['item_cnt_block'] = training['item_cnt_block'].clip(0,20).astype('int8')

training = training.set_index('item_id').join(items.set_index('item_id'))
training.reset_index(inplace=True)

for col in ['item_id', 'shop_id', 'item_category_id']:
    training[col] = pd.to_numeric(training[col], downcast='unsigned')

In [10]:
dates = sales_train[['date_block_num', 'month', 'year']].drop_duplicates(['date_block_num', 'month', 'year'])

dates_dict = {}

for index,row in dates.iterrows():
    dates_dict[row['date_block_num']] = {"month": row['month'], "year": row['year']}
    
training['month'] = pd.to_numeric(training['date_block_num'].apply(lambda block: dates_dict[block]['month']), downcast='unsigned')


In [11]:
training["shop_cat"] = training["shop_id"].astype(str) + "_" + training["item_category_id"].astype(str)
training["shop_item"] = training["shop_id"].astype(str) + "_" + training["item_id"].astype(str)

In [12]:
unique_shop_cats = training['shop_cat'].unique()
shop_cats = dict(list(zip(unique_shop_cats, range(1,len(unique_shop_cats)))))

def get_shop_cat_int(x):
    if x in shop_cats:
        return shop_cats[x]

training['shop_cat_int'] = training['shop_cat'].apply(lambda x: get_shop_cat_int(x))

In [13]:
unique_shop_items = training['shop_item'].unique()
shop_items = dict(list(zip(unique_shop_items, range(1,len(unique_shop_items)))))

def get_shop_item_int(x):
    if x in shop_items:
        return shop_items[x]

training['shop_item_int'] = training['shop_item'].apply(lambda x: get_shop_item_int(x))

In [14]:
#https://maxhalford.github.io/blog/target-encoding-done-the-right-way/
#https://www.kaggle.com/vprokopev/mean-likelihood-encodings-a-comprehensive-study

from sklearn.model_selection import StratifiedKFold
columns = ["item_id", "shop_id", "item_category_id", "month", "shop_cat", "shop_item", "date_block_num"]



y_train = training["item_cnt_block"].values
folds = KFold(n_splits = 5, shuffle=True).split(training)

i=1
for in_fold_index, out_of_fold_index in folds:
    print("fold", i)
    #print(np.intersect1d(training.loc[in_fold_index]["shop_id"].unique(), training.loc[out_of_fold_index]["shop_id"].unique()))
    #print(len(in_fold_index))
    for column in columns:
        means = training.iloc[in_fold_index].groupby(column)['item_cnt_block'].mean()
            #x_validation[column + "_mean_target"] = means\
        name = column + '_mean_encoding'
        training.loc[out_of_fold_index,name] = training.loc[out_of_fold_index][column].map(means)
    i+=1

fold 1
fold 2
fold 3
fold 4
fold 5


In [15]:
def add_block_units_stats(df, cols, name):
    print(name)
    name_units = name + '_units'
    name_mean = name + '_mean'
    name_median = name + '_median'
    name_max = name + '_max'
    name_min = name + '_min'
    name_std = name + '_std'
    
    
    try:
        df.drop(columns=[name_units, name_mean, name_median],inplace=True)
    except:
        pass

    
    block_units = df.groupby(cols,as_index=False)['item_cnt_block'].sum()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_units})
    df = df.merge(block_units, on=cols, how='left')
    df[name_units].fillna(0,inplace=True)
    df[name_units] = pd.to_numeric(df[name_units].astype(int),downcast='unsigned')
    del block_units
    
    block_units_med = df.groupby(cols,as_index=False)['item_cnt_block'].median()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_median})
    df = df.merge(block_units_med, on=cols, how='left')
    df[name_median].fillna(0,inplace=True)
    df[name_median] = pd.to_numeric(df[name_median].astype(int),downcast='unsigned')
    del block_units_med
    
    block_means = df.groupby(cols,as_index=False)['item_cnt_block'].mean()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_mean})
    df = df.merge(block_means, on=cols, how='left')
    df[name_mean].fillna(0,inplace=True)
    df[name_mean] = pd.to_numeric(df[name_mean],downcast='float')
    del block_means
    
    block_max = df.groupby(cols,as_index=False)['item_cnt_block'].max()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_max})
    df = df.merge(block_max, on=cols, how='left')
    df[name_max].fillna(0,inplace=True)
    df[name_max] = pd.to_numeric(df[name_max],downcast='float')
    del block_max
    
    block_min = df.groupby(cols,as_index=False)['item_cnt_block'].min()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_min})
    df = df.merge(block_min, on=cols, how='left')
    df[name_min].fillna(0,inplace=True)
    df[name_min] = pd.to_numeric(df[name_min],downcast='float')
    del block_min
    
    block_std = df.groupby(cols,as_index=False)['item_cnt_block'].std()\
                        .drop_duplicates(cols)\
                        .rename(columns={'item_cnt_block':name_std})
    df = df.merge(block_std, on=cols, how='left')
    df[name_std].fillna(0,inplace=True)
    df[name_std] = pd.to_numeric(df[name_std],downcast='float')
    del block_std
    
    gc.collect()
    return df


training = add_block_units_stats(training, ['item_id','date_block_num'], 'item_block')
training = add_block_units_stats(training, ['shop_id','date_block_num'], 'shop_block')
training = add_block_units_stats(training, ['item_category_id','date_block_num'], 'cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_category_id','date_block_num'], 'shop_cat_block')
training = add_block_units_stats(training, ['shop_id', 'item_id','date_block_num'], 'shop_item_block')

item_block




shop_block
cat_block
shop_cat_block
shop_item_block


In [16]:
def add_rolls(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name_tmp})\
            [cols+[roll_name_tmp]]
        
    
        df = df.merge(block_units_rolling_temp, on=cols, how='left')
        #print(df.columns.values)
        del block_units_rolling_temp
        gc.collect()
        

        block_units_rolling = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [roll_name_tmp].shift(1)\
            .rename(columns={roll_name_tmp:roll_name}).reset_index()

        df = df.merge(block_units_rolling, on=cols, how='left')
        df[roll_name].fillna(0,inplace=True)
        df[roll_name] = pd.to_numeric(df[roll_name], downcast='float')
        df.drop(columns=[roll_name_tmp], inplace=True)
        del block_units_rolling
        gc.collect()
    
    return df
    

#training = add_rolls(training, ['item_id','date_block_num'], 'item_block_units')
#training = add_rolls(training, ['item_id','date_block_num'], 'item_block_mean')
#training = add_rolls(training, ['item_id','date_block_num'], 'item_block_median')
#training = add_rolls(training, ['item_id','date_block_num'], 'item_block_min')
#training = add_rolls(training, ['item_id','date_block_num'], 'item_block_max')
#training = add_rolls(training, ['item_id','date_block_num'], 'item_block_std')

#training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_units')
#training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_mean')
#training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_median')
#training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_min')
#training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_max')
#training = add_rolls(training, ['shop_id','date_block_num'], 'shop_block_std')

#training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_units')
#training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_mean')
#training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_median')
#training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_min')
#training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_max')
#training = add_rolls(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_rolls(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item')

shop_cat_block_units 3
shop_cat_block_mean 3
shop_cat_block_median 3
shop_cat_block_min 3
shop_cat_block_max 3
shop_cat_block_std 3


In [17]:
#training = add_rolls(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

In [18]:
training['block_total'] = training.groupby(['date_block_num'])['item_cnt_block'].transform(np.sum)

training['item_share_block'] = training['item_block_units'] * 100 / training['block_total']
training['shop_share_block'] = training['shop_block_units'] * 100 / training['block_total']
training['comp2'] = training['item_share_block'] * training['shop_share_block']

In [19]:
def add_lags(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = df\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()

        df = df.merge(result, on=cols, how='left')
        df[lag_name].fillna(0,inplace=True)
        if "mean" in name or "std" in name:
            df[lag_name] = pd.to_numeric(df[lag_name], downcast='float')
        else:
            df[lag_name] = pd.to_numeric(df[lag_name].astype(int), downcast='unsigned')
        del result
        gc.collect()
    
    return df
                                         

                                        
#training = add_lags(training, ['item_id','date_block_num'], 'item_block_units')
#training = add_lags(training, ['item_id','date_block_num'], 'item_block_mean')
#training = add_lags(training, ['item_id','date_block_num'], 'item_block_median')                                        
#training = add_lags(training, ['item_id','date_block_num'], 'item_block_min')
#training = add_lags(training, ['item_id','date_block_num'], 'item_block_max')
#training = add_lags(training, ['item_id','date_block_num'], 'item_block_std')

training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_units')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_mean')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_median')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_min')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_max')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_block_std')

#training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_units')
#training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_mean')
#training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_median')
#training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_min')
#training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_max')
#training = add_lags(training, ['item_category_id','date_block_num'], 'cat_block_std')

training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
training = add_lags(training, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')

training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_median')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_min')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_max')
training = add_lags(training, ['shop_id','item_id','date_block_num'], 'shop_item_block_std')

shop_block_units 1
shop_block_mean 1
shop_block_median 1
shop_block_min 1
shop_block_max 1
shop_block_std 1
shop_cat_block_units 1
shop_cat_block_mean 1
shop_cat_block_median 1
shop_cat_block_min 1
shop_cat_block_max 1
shop_cat_block_std 1
shop_item_block_units 1
shop_item_block_mean 1
shop_item_block_median 1
shop_item_block_min 1
shop_item_block_max 1
shop_item_block_std 1


In [20]:
training = add_lags(training, ['item_id','date_block_num'], 'item_share_block')
training = add_lags(training, ['shop_id','date_block_num'], 'shop_share_block')
training = add_lags(training, ['shop_id', 'item_id', 'date_block_num'], 'comp2')


item_share_block 1
shop_share_block 1
comp2 1


In [21]:
total_sum_shops = training.groupby('shop_id')['item_cnt_block'].sum().sum()
training['shop_share'] = training.groupby('shop_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_shops

total_sum_items = training.groupby('item_id')['item_cnt_block'].sum().sum()
training['item_share'] = training.groupby('item_id')['item_cnt_block'].transform(np.sum) *100 / total_sum_items

In [22]:
training['comp1'] = training['shop_share'] * training['item_share']

In [23]:
a = training[training['shop_id'] == 41].groupby(['shop_id', 'date_block_num'],as_index=False)['item_cnt_block'].sum()
a

Unnamed: 0,shop_id,date_block_num,item_cnt_block
0,41,24,974.0
1,41,25,687.0
2,41,26,670.0
3,41,27,528.0
4,41,28,726.0
5,41,29,583.0
6,41,30,655.0
7,41,31,834.0
8,41,32,629.0
9,41,33,722.0


In [24]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler


shop_models = {}

for shop_id in all_shop_ids:
    
    shop_data = training[training['shop_id'] == shop_id].groupby(['date_block_num'],as_index=False)['item_cnt_block'].sum()


    regr = linear_model.Ridge()

    X = shop_data['date_block_num'].values.reshape(len(shop_data),1)
    y = shop_data['item_cnt_block'].values.reshape(len(shop_data),1)
    #y = MinMaxScaler().fit_transform(y)

    # Train the model using the training sets
    regr.fit(X, y)
    shop_models[shop_id] = regr

    # Make predictions using the testing set
    #preds = regr.predict(X)

# The coefficients
#print('Coefficients: \n', regr.coef_)
# The mean squared error
#print("Mean squared error: %.2f"
 #     % mean_squared_error(X, preds))
# Explained variance score: 1 is perfect prediction
#print('Variance score: %.2f' % r2_score(X, preds))

# Plot outputs
#plt.scatter(X, y,  color='black')
#plt.plot(X, preds, color='blue', linewidth=3)

#plt.xticks(())
#plt.yticks(())

#plt.show()

print("applying")

def predict(shop_id, dbn):
    return shop_models[shop_id].predict([[dbn]])[0][0]

training['shop_pred'] = training.apply(lambda row: predict(row['shop_id'], row['date_block_num']), axis=1)

applying


In [25]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler


shop_cat_models = {}

for shop_id in all_shop_ids:
    shop_cat_models[shop_id] = {}
    for cat_id in training['item_category_id'].unique():
    
        shop_cat_data = training[(training['shop_id'] == shop_id) & (training['item_category_id'] == cat_id)].groupby(['date_block_num'],as_index=False)['item_cnt_block'].sum()
        if len(shop_cat_data) == 0:
            continue

        regr = linear_model.Ridge()

        X = shop_cat_data['date_block_num'].values.reshape(len(shop_cat_data),1)
        y = shop_cat_data['item_cnt_block'].values.reshape(len(shop_cat_data),1)
        
        #y = MinMaxScaler().fit_transform(y)

        # Train the model using the training sets
        regr.fit(X, y)
        shop_cat_models[shop_id][cat_id] = regr
            

    # Make predictions using the testing set
    #preds = regr.predict(X)


print("applying")

def predict(shop_id, cat_id, dbn):
    return shop_cat_models[shop_id][cat_id].predict([[dbn]])[0][0]

training['shop_cat_pred'] = training.apply(lambda row: predict(row['shop_id'],row['item_category_id'], row['date_block_num']), axis=1)

applying


In [26]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler


shop_item_models = {}

for shop_id in all_shop_ids:
    print(shop_id)
    shop_item_models[shop_id] = {}
    for item_id in training['item_id'].unique():
    
        shop_item_data = training[(training['shop_id'] == shop_id) & (training['item_id'] == item_id)].groupby(['date_block_num'],as_index=False)['item_cnt_block'].sum()
        if len(shop_item_data) == 0:
            continue

        regr = linear_model.Ridge()

        X = shop_item_data['date_block_num'].values.reshape(len(shop_item_data),1)
        y = shop_item_data['item_cnt_block'].values.reshape(len(shop_item_data),1)
        
        #y = MinMaxScaler().fit_transform(y)

        # Train the model using the training sets
        regr.fit(X, y)
        shop_item_models[shop_id][item_id] = regr
            

    # Make predictions using the testing set
    #preds = regr.predict(X)


print("applying")

def predict(shop_id, item_id, dbn):
    return shop_item_models[shop_id][item_id].predict([[dbn]])[0][0]

training['shop_item_pred'] = training.apply(lambda row: predict(row['shop_id'],row['item_id'], row['date_block_num']), axis=1)

2
3
4
5
6
7
9
11
12
14
15
16
17
18
19
20
21
22
24
25
26
27
28
29
30
31
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
applying


In [28]:
import pickle
pickling_on = open("shop_item_models.pickle","wb")
pickle.dump(shop_item_models, pickling_on)
pickling_on.close()

In [30]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler


cat_models = {}

for cat_id in training['item_category_id'].unique():
    
    cat_data = training[(training['item_category_id'] == cat_id)].groupby(['date_block_num'],as_index=False)['item_cnt_block'].sum()
    if len(cat_data) == 0:
        continue

    regr = linear_model.Ridge()

    X = cat_data['date_block_num'].values.reshape(len(cat_data),1)
    y = cat_data['item_cnt_block'].values.reshape(len(cat_data),1)

    #y = MinMaxScaler().fit_transform(y)

    # Train the model using the training sets
    regr.fit(X, y)
    cat_models[cat_id] = regr


    # Make predictions using the testing set
    #preds = regr.predict(X)


print("applying")

def predict(cat_id, dbn):
    return cat_models[cat_id].predict([[dbn]])[0][0]

training['cat_pred'] = training.apply(lambda row: predict(row['item_category_id'], row['date_block_num']), axis=1)

applying


In [31]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler,StandardScaler,MinMaxScaler


item_models = {}

for item_id in training['item_id'].unique():
    
    item_data = training[(training['item_id'] == item_id)].groupby(['date_block_num'],as_index=False)['item_cnt_block'].sum()
    if len(item_data) == 0:
        continue

    regr = linear_model.Ridge()

    X = item_data['date_block_num'].values.reshape(len(item_data),1)
    y = item_data['item_cnt_block'].values.reshape(len(item_data),1)

    #y = MinMaxScaler().fit_transform(y)

    # Train the model using the training sets
    regr.fit(X, y)
    item_models[item_id] = regr


    # Make predictions using the testing set
    #preds = regr.predict(X)


print("applying")

def predict(item_id, dbn):
    if item_id in item_models:
        return item_models[item_id].predict([[dbn]])[0][0]

training['item_pred'] = training.apply(lambda row: predict(row['item_id'], row['date_block_num']), axis=1)

applying


In [32]:
training['pred_comp1'] = training['item_pred'] * training['shop_pred']
training['pred_comp2'] = training['shop_pred'] * training['cat_pred']

In [33]:
training['blocks_without_sales'] = training['item_id'].map(training[training['item_cnt_block'] == 0].groupby(['item_id'])['date_block_num'].unique().apply(lambda x: len(x)))

In [34]:
sales_train['item_days_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date'].transform("nunique"), downcast='unsigned') 
sales_train['item_blocks_of_activity'] = pd.to_numeric(sales_train.groupby(['item_id'])['date_block_num'].transform("nunique"), downcast='unsigned') 

def get_number_of_days_since_start(day,month, year):
    days = 0
    if year == 2015:
        days = 365
    def is_even(num):
        return num % 2 == 0
    half_of_month = int(month/2)
    even = (30*half_of_month) + (31*half_of_month)
    if is_even(month):
        days = days + even - 30 - day
    else:
        days = days + even + day
    return days

sales_train['item_days_since_start'] = pd.to_numeric(sales_train.apply(lambda row: get_number_of_days_since_start(row['day'],row['month'], row['year']),axis=1), downcast='unsigned') 

def get_average_days_between_sales(days):
    days = sorted(np.unique(days))
    if len(days) == 0:
        return 9999
    if len(days) == 1:
        return 999
    return np.mean(np.ediff1d(days)) / len(days)

average_days_between_sales = sales_train.groupby(['item_id'])['item_days_since_start'].apply(list).apply(lambda x: get_average_days_between_sales(x))

sales_train['item_mean_day_between_activity'] = pd.to_numeric(sales_train['item_id'].map(average_days_between_sales), downcast='unsigned') 


In [35]:
training['item_mean_day_between_activity'] = training['item_id'].map(sales_train.drop_duplicates('item_id').set_index('item_id')['item_mean_day_between_activity'])

In [36]:

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)
training.sample(10)

Unnamed: 0,item_id,shop_id,date_block_num,item_cnt_block,item_category_id,month,shop_cat,shop_item,shop_cat_int,shop_item_int,item_id_mean_encoding,shop_id_mean_encoding,item_category_id_mean_encoding,month_mean_encoding,shop_cat_mean_encoding,shop_item_mean_encoding,date_block_num_mean_encoding,item_block_units,item_block_median,item_block_mean,item_block_max,item_block_min,item_block_std,shop_block_units,shop_block_median,shop_block_mean,shop_block_max,shop_block_min,shop_block_std,cat_block_units,cat_block_median,cat_block_mean,cat_block_max,cat_block_min,cat_block_std,shop_cat_block_units,shop_cat_block_median,shop_cat_block_mean,shop_cat_block_max,shop_cat_block_min,shop_cat_block_std,shop_item_block_units,shop_item_block_median,shop_item_block_mean,shop_item_block_max,shop_item_block_min,shop_item_block_std,shop_cat_block_units_rolling_3,shop_cat_block_mean_rolling_3,shop_cat_block_median_rolling_3,shop_cat_block_min_rolling_3,shop_cat_block_max_rolling_3,shop_cat_block_std_rolling_3,block_total,item_share_block,shop_share_block,comp2,shop_block_units_lag_1,shop_block_mean_lag_1,shop_block_median_lag_1,shop_block_min_lag_1,shop_block_max_lag_1,shop_block_std_lag_1,shop_cat_block_units_lag_1,shop_cat_block_mean_lag_1,shop_cat_block_median_lag_1,shop_cat_block_min_lag_1,shop_cat_block_max_lag_1,shop_cat_block_std_lag_1,shop_item_block_units_lag_1,shop_item_block_mean_lag_1,shop_item_block_median_lag_1,shop_item_block_min_lag_1,shop_item_block_max_lag_1,shop_item_block_std_lag_1,item_share_block_lag_1,shop_share_block_lag_1,comp2_lag_1,shop_share,item_share,comp1,shop_pred,shop_cat_pred,shop_item_pred,cat_pred,item_pred,pred_comp1,pred_comp2,blocks_without_sales,item_mean_day_between_activity
809935,12563,21,25,0,55,2,21_55,21_12563,746.0,128309.0,0.161111,0.43974,0.260822,0.456293,0.320187,0.125,0.456293,5,0,0.106383,1,0,0.0,1279,0,0.464922,20,0,0.0,6820,0,0.294932,15,0,0.0,149,0,0.302846,6,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59142.0,0.008454,1.054479,0.008915,1346,0.49613,0,0,20,0.0,168,0.344262,0,0,13,0.0,0,0.0,0,0,0,0.0,0,0,0,2.522849,0.013034,0.032882,1290.185629,174.990419,-0.046707,7227.705389,8.452695,10905.545111,9325082.0,10,0.06938
1250060,18939,55,32,0,40,9,55_40,55_18939,49.0,201896.0,0.151899,0.300867,0.356499,0.34126,0.0,0.0,0.34126,4,0,0.093023,1,0,0.0,1041,0,0.28654,20,0,0.0,5930,0,0.272543,20,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53347.0,0.007498,0.72289,0.00542,955,0.276331,0,0,20,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0,0,0.0,0,0,0,1.720108,0.008927,0.015356,1022.040719,0.0,0.0,6615.100599,3.280116,3352.412226,6760902.0,9,0.139535
929082,14220,4,30,0,57,7,4_57,4_14220,159.0,150587.0,0.074074,0.227151,0.093348,0.360546,0.0,0.0,0.360546,2,0,0.046512,1,0,0.0,741,0,0.223732,20,0,0.0,428,0,0.104774,7,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51454.0,0.003887,0.16644,0.000647,666,0.212169,0,0,20,0.0,0,0.0,0,0,0,0.0,0,0.0,0,0,0,0.0,0,0,0,1.294277,0.004642,0.006008,715.621557,0.0,0.0,368.022754,2.829105,2024.568864,263365.0,9,0.445
702845,10785,34,30,0,55,7,34_55,34_10785,757.0,111515.0,0.191549,0.114956,0.260241,0.359833,0.006062,0.0,0.359833,5,0,0.116279,1,0,0.0,380,0,0.114734,20,0,0.0,6036,0,0.255222,15,0,0.0,5,0,0.009091,1,0,0.0,0,0,0,0,0,0.0,4.0,0.007479,0.0,0.0,1.0,0.0,51454.0,0.009717,0.738524,0.007177,350,0.1115,0,0,18,0.0,5,0.009259,0,0,1,0.0,0,0.0,0,0,0,0.0,0,0,0,0.647406,0.017319,0.011212,373.701796,3.08024,0.0,5839.411976,6.205988,2319.188873,2182199.0,10,0.064904
80570,1560,4,31,3,23,8,4_23,4_1560,1153.0,13069.0,1.064748,0.228222,0.602223,0.389709,0.43153,0.5,0.389709,74,2,1.761905,6,0,0.0,810,0,0.234375,20,0,0.0,3539,0,0.708083,20,0,0.0,63,0,0.529412,5,0,0.0,3,3,3,3,3,0.0,37.333332,0.345627,0.0,0.0,3.666667,0.0,56508.0,0.130955,0.27366,0.035837,741,0.223732,0,0,20,0.0,37,0.321739,0,0,4,0.0,0,0.0,0,0,0,0.0,0,0,0,1.294277,0.03196,0.041365,709.435928,44.918563,1.083333,2716.007186,42.208333,29944.108134,1926833.0,4,0.038265
201066,3274,49,28,2,55,5,49_55,49_3274,772.0,31957.0,0.487685,0.20358,0.260822,0.370292,0.132598,0.0,0.370292,32,0,0.727273,8,0,0.0,593,0,0.195002,20,0,0.0,5793,0,0.246553,19,0,0.0,57,0,0.106742,3,0,0.0,2,2,2,2,2,0.0,58.666668,0.116344,0.0,0.0,3.333333,0.0,49744.0,0.064329,1.192104,0.076687,456,0.153794,0,0,20,0.0,58,0.112621,0,0,3,0.0,0,0.0,0,0,0,0.0,0,0,0,1.177508,0.022675,0.0267,658.934132,72.186826,1.009009,6394.729341,35.693694,23519.793063,4213705.0,6,0.045675
308296,4548,49,25,0,75,2,49_75,49_4548,878.0,48665.0,0.137427,0.205644,0.542641,0.456494,0.133333,0.0,0.456494,7,0,0.148936,2,0,0.0,596,0,0.216648,20,0,0.0,949,0,0.67305,20,0,0.0,6,0,0.2,3,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59142.0,0.011836,1.007744,0.011928,867,0.319572,0,0,20,0.0,8,0.275862,0,0,4,0.0,0,0.0,0,0,0,0.0,0,0,0,1.177508,0.011605,0.013666,655.538922,5.131737,0.0,924.780838,7.401198,4851.773101,606229.8,10,0.087013
961790,14828,42,28,3,65,5,42_65,42_14828,1965.0,156071.0,0.391892,0.881569,0.627855,0.372315,2.067568,1.857143,0.372315,19,0,0.431818,5,0,0.0,2549,0,0.838211,20,0,0.0,1489,0,0.537157,20,0,0.0,146,0,2.31746,20,0,0.0,3,3,3,3,3,0.0,142.666672,2.578129,0.666667,0.0,20.0,0.0,49744.0,0.038196,1.17184,0.044759,2647,0.892749,0,0,20,0.0,104,1.733333,0,0,20,0.0,0,0.0,0,0,0,0.0,0,0,0,5.006776,0.030888,0.154651,2762.128144,133.958084,2.034731,1808.653892,19.668263,54326.264077,4995734.0,10,0.054125
1300536,19817,27,26,9,40,3,27_40,27_19817,22.0,209919.0,1.853503,0.770264,0.357591,0.435588,0.457861,3.8,0.435588,241,4,5.23913,20,0,0.0,2763,0,0.950138,20,0,0.0,6871,0,0.417233,20,0,0.0,219,0,0.611732,11,0,0.0,9,9,9,9,9,0.0,171.0,0.536657,0.0,0.0,17.5,0.0,58665.0,0.410807,0.241302,0.099129,2687,0.976736,0,0,20,0.0,153,0.463636,0,0,15,0.0,15,15.0,15,15,15,0.0,0,0,0,3.392902,0.126767,0.430109,2914.823096,195.793612,8.426295,6587.328144,149.020036,434367.043926,19200900.0,9,0.018595
76722,1539,7,29,2,30,6,7_30,7_1539,1103.0,12514.0,1.004902,0.373703,1.121034,0.376414,1.301031,1.0,0.376414,13,0,0.302326,2,0,0.0,1076,0,0.342784,20,0,0.0,5479,0,1.044415,20,0,0.0,126,0,1.032787,17,0,0.0,2,2,2,2,2,0.0,144.666672,1.200883,0.333333,0.0,16.333334,0.0,50629.0,0.025677,0.830828,0.021333,948,0.31174,0,0,20,0.0,134,1.080645,0,0,20,0.0,0,0.0,0,0,0,0.0,0,0,0,2.109338,0.046957,0.099049,1172.82515,155.548503,1.207207,5805.228743,31.063063,36431.541587,6808518.0,6,0.023302


In [37]:
training.columns.values

array(['item_id', 'shop_id', 'date_block_num', 'item_cnt_block',
       'item_category_id', 'month', 'shop_cat', 'shop_item',
       'shop_cat_int', 'shop_item_int', 'item_id_mean_encoding',
       'shop_id_mean_encoding', 'item_category_id_mean_encoding',
       'month_mean_encoding', 'shop_cat_mean_encoding',
       'shop_item_mean_encoding', 'date_block_num_mean_encoding',
       'item_block_units', 'item_block_median', 'item_block_mean',
       'item_block_max', 'item_block_min', 'item_block_std',
       'shop_block_units', 'shop_block_median', 'shop_block_mean',
       'shop_block_max', 'shop_block_min', 'shop_block_std',
       'cat_block_units', 'cat_block_median', 'cat_block_mean',
       'cat_block_max', 'cat_block_min', 'cat_block_std',
       'shop_cat_block_units', 'shop_cat_block_median',
       'shop_cat_block_mean', 'shop_cat_block_max', 'shop_cat_block_min',
       'shop_cat_block_std', 'shop_item_block_units',
       'shop_item_block_median', 'shop_item_block_mean',
  

In [38]:
training.fillna(0,inplace=True)
training = training.sample(frac=1).reset_index(drop=True)


In [39]:
gc.collect()

ZEROS_KEEP=0.25

#x_train = training[(training['date_block_num'] < 33) & (training['val_ignore'] == False)]
x_train = training[(training['date_block_num'] < 33)]
y_train = x_train['item_cnt_block']





x_val = training[training['date_block_num'] == 33]
y_val = x_val['item_cnt_block']

pos_val_len = len(y_val[y_val != 0])
print("pos_val_len", pos_val_len)

zeros_keep_indices_val = y_val[y_val == 0].sample(int(pos_val_len/ZEROS_KEEP)).index
print("zeros_keep_indices_val", len(zeros_keep_indices_val))
non_zeros_val_indices = y_val[y_val != 0].index
print("non_zeros_val_indices", len(non_zeros_val_indices))

val_indices = np.append(np.array(zeros_keep_indices_val), np.array(non_zeros_val_indices))

y_val = y_val.loc[val_indices]
x_val = x_val.loc[val_indices]



pos_val_len 29202
zeros_keep_indices_val 116808
non_zeros_val_indices 29202


In [88]:
features = [
    
'item_id_mean_encoding',
       'shop_id_mean_encoding', #'item_category_id_mean_encoding',
#       'month_mean_encoding', 'shop_cat_mean_encoding',
#       'shop_item_mean_encoding', 'date_block_num_mean_encoding',
 
#       'item_block_units_rolling_3', 'item_block_mean_rolling_3',
#       'item_block_median_rolling_3', 'item_block_min_rolling_3',
#       'item_block_max_rolling_3', 'item_block_std_rolling_3',
    
#       'shop_block_units_rolling_3', 'shop_block_mean_rolling_3',
#       'shop_block_median_rolling_3', 'shop_block_min_rolling_3',
#       'shop_block_max_rolling_3', 'shop_block_std_rolling_3',
    
#       'cat_block_units_rolling_3', 'cat_block_mean_rolling_3',
#       'cat_block_median_rolling_3', 'cat_block_min_rolling_3',
#       'cat_block_max_rolling_3', 'cat_block_std_rolling_3',
    
       'shop_cat_block_units_rolling_3', 'shop_cat_block_mean_rolling_3',
       'shop_cat_block_median_rolling_3', 'shop_cat_block_min_rolling_3',
       'shop_cat_block_max_rolling_3', 'shop_cat_block_std_rolling_3',
    
    
#       'item_block_units_lag_1', 'item_block_mean_lag_1',
#       'item_block_median_lag_1', 'item_block_min_lag_1',
#       'item_block_max_lag_1', 'item_block_std_lag_1',
    
#       'shop_block_units_lag_1', 'shop_block_mean_lag_1',
#       'shop_block_median_lag_1', 'shop_block_min_lag_1',
       'shop_block_max_lag_1', 'shop_block_std_lag_1',
    
#      'cat_block_units_lag_1', 'cat_block_mean_lag_1',
#       'cat_block_median_lag_1', 'cat_block_min_lag_1',
#       'cat_block_max_lag_1', 'cat_block_std_lag_1',
    
       'shop_cat_block_units_lag_1', 'shop_cat_block_mean_lag_1',
#       'shop_cat_block_median_lag_1', 'shop_cat_block_min_lag_1',
#       'shop_cat_block_max_lag_1', 'shop_cat_block_std_lag_1',
    
       'shop_item_block_units_lag_1', 'shop_item_block_mean_lag_1',
#       'shop_item_block_median_lag_1', 'shop_item_block_min_lag_1',
#       'shop_item_block_max_lag_1', 'shop_item_block_std_lag_1',
    
       'item_share_block_lag_1', 'shop_share_block_lag_1', 'comp2_lag_1',
    
    'shop_pred', 
    'shop_cat_pred', 
    'cat_pred',
    #'item_pred',
    #'pred_comp1',
    #'pred_comp2',
    'blocks_without_sales',
    #'item_mean_day_between_activity',
    #'shop_item_pred'

]


In [89]:
from sklearn.preprocessing import MinMaxScaler

x_train_scaled = MinMaxScaler().fit_transform(x_train[features])
x_val_scaled = MinMaxScaler().fit_transform(x_val[features])

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [90]:
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression,BayesianRidge, HuberRegressor


lr_model =  Ridge(alpha=0.1)
lr_model.fit(x_train_scaled, y_train)

from sklearn.metrics import mean_squared_error
from math import sqrt

lr_val_preds = lr_model.predict(x_val_scaled)
lr_val_preds.clip(0,20,out=lr_val_preds)
rms = sqrt(mean_squared_error(y_val, lr_val_preds))
print("rmse: ", rms)

rmse:  1.078927303113801


In [91]:
lr_model.coef_

array([12.9336724 , -0.44475265, -0.29257746,  0.42934266,  4.02552849,
       -5.20432194, -0.20669638,  0.        , -0.19004568,  0.        ,
       -1.88001326,  3.78628056,  2.59559277,  2.59559277, -0.67641905,
       -0.01924094, -1.42286462,  0.95519431,  3.41625759, -0.18035362,
       -0.12160798])

In [53]:

test = test.set_index('item_id').join(items.set_index('item_id'))
test.reset_index(inplace=True)

In [75]:
item_features = [ 
    'item_id_mean_encoding'
                ]

merge_col = ['item_id']
cols=item_features+merge_col

test = test.merge(training.drop_duplicates('item_id')[cols], on=merge_col, how='left')

In [76]:
shop_features = [
        'shop_id_mean_encoding'
]

merge_col = ['shop_id']
cols=shop_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [56]:
cat_features = [
        'item_category_id_mean_encoding'#,'cat_me_real'
]

merge_col = ['item_category_id']
cols=cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [57]:
shop_item_features = [
        'shop_item_mean_encoding'
]

merge_col = ['shop_id', 'item_id']
cols=shop_item_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [58]:
shop_cat_features = [
        'shop_cat_mean_encoding'
]

merge_col = ['shop_id', 'item_id']
cols=shop_cat_features+merge_col


test = test.merge(training.drop_duplicates(merge_col)[cols], on=merge_col, how='left')

In [59]:
def add_rolls_test(df, cols, name, rolls = [3]):
    for roll in rolls:
        print(name, roll)
        roll_name = name+"_rolling_" + str(roll)
        roll_name_tmp = roll_name + "_tmp"
        
        try:
            df.drop(columns=[roll_name],inplace=True)
        except:
            pass       

    
        block_units_rolling_temp = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].rolling(roll,min_periods=2).mean().reset_index()\
            .rename(columns={name:roll_name})\
            [cols+[roll_name]]
        
        print([cols[0:len(cols)-1]+[roll_name]])
        thirty_three = block_units_rolling_temp[block_units_rolling_temp['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1]+[roll_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')
    

        del block_units_rolling_temp
        gc.collect()
        

    
    return df
    

test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_rolls_test(test, ['item_id','date_block_num'], 'item_block_units')
test = add_rolls_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')

test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_median')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_min')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_max')
test = add_rolls_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_std')

item_block_mean 3
[['item_id', 'item_block_mean_rolling_3']]
item_block_units 3
[['item_id', 'item_block_units_rolling_3']]
shop_block_mean 3
[['shop_id', 'shop_block_mean_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]
shop_cat_block_units 3
[['shop_id', 'item_category_id', 'shop_cat_block_units_rolling_3']]
shop_cat_block_mean 3
[['shop_id', 'item_category_id', 'shop_cat_block_mean_rolling_3']]
shop_cat_block_median 3
[['shop_id', 'item_category_id', 'shop_cat_block_median_rolling_3']]
shop_cat_block_min 3
[['shop_id', 'item_category_id', 'shop_cat_block_min_rolling_3']]
shop_cat_block_max 3
[['shop_id', 'item_category_id', 'shop_cat_block_max_rolling_3']]
shop_cat_block_std 3
[['shop_id', 'item_category_id', 'shop_cat_block_std_rolling_3']]


In [60]:
test = add_rolls_test(test, ['item_category_id','date_block_num'], 'cat_block_mean')

cat_block_mean 3
[['item_category_id', 'cat_block_mean_rolling_3']]


In [61]:
test = add_rolls_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

shop_item_block_mean 3
[['shop_id', 'item_id', 'shop_item_block_mean_rolling_3']]


In [62]:
def add_lags_test(df, cols, name, lags = [1]):
    
    for lag in lags:
        print(name, lag)
        lag_name = name + "_lag_" + str(lag)
        
        try:
            df.drop(columns=[lag_name],inplace=True)
        except:
            pass       

        result = training\
            .drop_duplicates(cols)\
            .sort_values(cols)\
            .set_index(cols)\
            .groupby(cols[0:len(cols)-1],as_index=False)\
            [name].shift(lag)\
            .rename(columns={name:lag_name}).reset_index()
        
        thirty_three = result[result['date_block_num'] == 33].drop_duplicates(cols)\
                [cols[0:len(cols)-1] + [lag_name]]
        df = df.merge(thirty_three, on=cols[0:len(cols)-1], how='left')

        gc.collect()
    
    return df
                                         

                                        
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_mean')
test = add_lags_test(test, ['item_id','date_block_num'], 'item_block_units')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_mean')
test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_mean')

test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_max')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_std')

test = add_lags_test(test, ['shop_id','item_category_id','date_block_num'], 'shop_cat_block_units')

test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')

test = add_lags_test(test, ['item_id','date_block_num'], 'item_share_block')
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_share_block')
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'comp2')

item_block_mean 1
item_block_units 1
shop_block_mean 1
shop_cat_block_mean 1
shop_block_max 1
shop_block_std 1
shop_cat_block_units 1
shop_item_block_units 1
shop_item_block_mean 1
item_share_block 1
shop_share_block 1
comp2 1


In [63]:
test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_mean')
#test = add_lags_test(test, ['shop_id','item_id','date_block_num'], 'shop_item_block_units')

shop_item_block_mean 1


In [64]:
test = add_lags_test(test, ['shop_id','date_block_num'], 'shop_block_units')

shop_block_units 1


In [66]:
def predict(shop_id):
    return shop_models[shop_id].predict([[34]])[0][0]

test['shop_pred'] = test.apply(lambda row: predict(row['shop_id']), axis=1)
#training['shop_cat_pred'] = training.apply(lambda row: predict(row['shop_id'],row['item_category_id'], row['date_block_num']), axis=1)
#training['cat_pred'] = training.apply(lambda row: predict(row['item_category_id'], row['date_block_num']), axis=1)

In [67]:
def predict(shop_id, cat_id):
    if shop_id in shop_cat_models and cat_id in shop_cat_models[shop_id]:
        return shop_cat_models[shop_id][cat_id].predict([[34]])[0][0]

test['shop_cat_pred'] = test.apply(lambda row: predict(row['shop_id'],row['item_category_id']), axis=1)


In [68]:
def predict(cat_id):
    if cat_id in cat_models:
        return cat_models[cat_id].predict([[34]])[0][0]

test['cat_pred'] = test.apply(lambda row: predict(row['item_category_id']), axis=1)

In [72]:
def predict(shop_id, item_id):
    if shop_id in shop_item_models and item_id in shop_item_models[shop_id]:
        return shop_item_models[shop_id][item_id].predict([[34]])[0][0]

test['shop_item_pred'] = test.apply(lambda row: predict(row['shop_id'],row['item_id']), axis=1)

In [78]:
test['blocks_without_sales'] = test['item_id'].map(training.drop_duplicates('item_id')\
                    .set_index('item_id')['blocks_without_sales'])

In [82]:
test.fillna(0, inplace=True)

In [93]:
from sklearn.preprocessing import MinMaxScaler

test_scaled = MinMaxScaler().fit_transform(test[features])


  return self.partial_fit(X, y)


In [94]:
lr_preds = lr_model.predict(test_scaled)
lr_preds.clip(0,20,out=lr_preds)

array([0.        , 0.        , 0.03982443, ..., 0.29828232, 0.31526111,
       0.24284847])

In [95]:
print(np.mean(lr_preds))
print(np.max(lr_preds))

0.26448920651249935
20.0


In [None]:
np.mean(lg_preds.conc lr_preds, axis=1)

In [None]:
preds = np.mean(np.array([lg_preds, lr_preds]),axis=0)

In [96]:
submission = test.loc[:,['ID']]
submission['item_cnt_month'] = lr_preds

submission.to_csv('submission.csv', index=False)

In [None]:
lr_preds = pd.read_csv('lr111.csv')['item_cnt_month']
lg_preds = pd.read_csv('lg111.csv')['item_cnt_month']
cb_preds = pd.read_csv('cb102.csv')['item_cnt_month']


#preds = np.mean(np.array([lr_preds, lg_preds]),axis=0)

preds = (cb_preds * 0.7) + (lr_preds * 0.3)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier
clf = RandomForestRegressor(n_estimators=10, random_state=0, n_jobs=8)

# Train the classifier
clf.fit(x_train[features], y_train)

# Print the name and gini importance of each feature
for feature in zip(features, clf.feature_importances_):
    print(feature)