In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import lightgbm as lgb
import statsmodels.api as sm
import matplotlib.pyplot as pyplot
from sklearn import preprocessing, metrics
import pickle
import gc
import os
# Print the full path of every file available under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
        
from sklearn.model_selection import train_test_split
import datetime
import random
import warnings

## Permutation test
This test can be used to determine whether a feature is important for the model or not. 
This is achieved by randomly permuting the values of the feature. The main idea: if the model still predicts well with the permuted feature values, the feature is not important. 

But there are two problems here:

First - as the permutation is RANDOM, we could get lucky: the model could still predict well after the permutation even if the feature is important, and so we could throw away an important feature.

Second - by how much should the score decline before we can say that a feature is important (and how small a decline still means it is not important)?  

These questions can be addressed using statistics.

## Wilcoxon test
The Wilcoxon signed-rank test is a non-parametric statistical hypothesis test used to compare two related samples, matched samples, or repeated measurements on a single sample to assess whether their population mean ranks differ (i.e. it is a paired difference test). In other words, it is a nonparametric test that can be used to determine whether two dependent samples were selected from populations having the same distribution.

In other words, we can use this test to determine whether the score after permutation is statistically the same as before. If it is the same, the feature is not important; if it differs, the feature is important!

To proceed with this test we need at least 20 samples (scores) before permutation and 20 scores after permutation.
This can be achieved by random sampling from the values predicted by the model, so we will use ShuffleSplit from sklearn to achieve it.

## Overfitting features

Another problem in feature selection: there could be a feature that is important for the train part of the data but not important for the test part, because the model is overfitting on the train part based on this feature. The combination of the Permutation-Wilcoxon test can be used here to detect such features.

In [None]:
# All features passed to the model's predict(); every one of them is
# permuted and tested in the cells below.

MODEL_FEATURES = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1',
       'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX',
       'snap_WI', 'sell_price', 'year', 'quarter', 'month', 'week', 'day',
       'dayofweek', 'dayofyear', 'weekday', 'weekofyear', 'is_weekend',
       'lag_28', 'lag_29', 'lag_30', 'lag_31', 'lag_32', 'lag_33', 'lag_34',
       'lag_35', 'lag_36', 'lag_37', 'lag_38', 'lag_39', 'lag_40',
       'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14',
       'rolling_mean_30', 'rolling_std_30', 'rolling_mean_60',
       'rolling_std_60', 'rolling_mean_180', 'rolling_std_180',
       'rolling_mean_tmp_1_7', 'rolling_mean_tmp_1_14',
       'rolling_mean_tmp_1_30', 'rolling_mean_tmp_1_60',
       'rolling_mean_tmp_7_7', 'rolling_mean_tmp_7_14',
       'rolling_mean_tmp_7_30', 'rolling_mean_tmp_7_60',
       'rolling_mean_tmp_14_7', 'rolling_mean_tmp_14_14',
       'rolling_mean_tmp_14_30', 'rolling_mean_tmp_14_60', 'price_momentum',
       'price_momentum_m', 'price_momentum_y', 'lag_price_t1',
       'price_change_t1', 'rolling_price_max_t28', 'price_change_t28',
       'rolling_price_std_t7', 'diff_with_mean_price_by_cat',
       'diff_with_mean_price_by_dept', 'diff_with_mean_price_by_store',
       'diff_with_max_price_by_cat', 'diff_with_max_price_by_dept',
       'diff_with_max_price_by_store', 'diff_with_min_price_by_cat',
       'diff_with_min_price_by_dept', 'diff_with_min_price_by_store']

In [None]:
def get_base_test(pred=False, store_ids=None):
    """Load the per-store pickled frames and stack them into one DataFrame.

    Parameters
    ----------
    pred : bool, default False
        If True, read the ``test_<store>.pkl`` files; otherwise read the
        ``valid_<store>.pkl`` files.
    store_ids : iterable of str, optional
        Stores to load. Defaults to the module-level ``STORES_IDS`` so that
        existing calls ``get_base_test()`` / ``get_base_test(pred)`` keep
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        All store frames concatenated in ``store_ids`` order, with a
        ``store_id`` column set to the store each row was loaded from and a
        fresh 0..n-1 index. An empty DataFrame if ``store_ids`` is empty.
    """
    if store_ids is None:
        store_ids = STORES_IDS

    if pred:
        name = '/kaggle/input/dark-magic-baseline/test_'
    else:
        name = '/kaggle/input/dark-magic-baseline/valid_'

    # Collect the frames first and concatenate once: calling pd.concat inside
    # the loop re-copies everything loaded so far on every iteration, and
    # seeding the concat with an empty DataFrame may alter column dtypes
    # depending on the pandas version.
    frames = []
    for store_id in store_ids:
        temp_df = pd.read_pickle(name + str(store_id) + '.pkl')
        temp_df['store_id'] = store_id
        frames.append(temp_df)

    if not frames:
        # pd.concat raises on an empty list; keep the original "empty frame" result.
        return pd.DataFrame()

    return pd.concat(frames).reset_index(drop=True)

In [None]:
def rmse(preds, true):
    """Return the root of sklearn's mean squared error between ``preds`` and ``true``."""
    mean_sq_err = metrics.mean_squared_error(preds, true)
    return np.sqrt(mean_sq_err)

### Get Permutation - Wilcoxon results for the train part

In [None]:
# Stores to evaluate. The full list is kept for reference; the second
# assignment deliberately restricts the run to one store per state.
STORES_IDS = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
STORES_IDS = ['CA_1', 'TX_1', 'WI_1'] #to speed-up
TARGET = 'demand'
from sklearn.model_selection import ShuffleSplit
from scipy.stats import mannwhitneyu
from scipy.stats import wilcoxon

# random sampling: 20 random 25% subsets, reused for baseline and shuffled scores
ss = ShuffleSplit(n_splits=20, test_size=0.25, random_state=42)
# Seeded generator so that the feature permutations are reproducible on re-run.
rng = np.random.RandomState(42)
alpha = 0.05

data = get_base_test()
data['store_id'] = data['store_id'].astype('category')
data = data[data.store_id.isin(STORES_IDS)]
data = data[data.date<='2016-03-27'] #train part
# Fresh 0..n-1 index (and a fresh frame) so positional split indices line up.
data = data.reset_index(drop=True)
# float64 like 'sflpred' below: a float16 baseline column would round the
# baseline predictions but not the shuffled ones and bias the paired test.
data['preds'] = 0.0

estimators = {}
# Load one fitted model per store.
# NOTE: pickle.load executes arbitrary code; only use on trusted model files.
for store_id in STORES_IDS:
    model_path = '/kaggle/input/dark-magic-baseline/lgb_model_'+str(store_id)+'.bin'
    with open(model_path, 'rb') as model_file:
        estimators[store_id] = pickle.load(model_file)

# Row masks per store, taken from the TRUE (unpermuted) store_id so that rows
# are always scored by the model of the store they belong to.
store_masks = {store_id: (data['store_id'] == store_id).to_numpy() for store_id in STORES_IDS}

# get base predictions for the train part w/o permutation
# (.loc instead of chained data['preds'][mask] = ..., which may write to a copy)
for store_id in STORES_IDS:
    mask = store_masks[store_id]
    data.loc[mask, 'preds'] = estimators[store_id].predict(data.loc[mask, MODEL_FEATURES])

# The splits depend only on the number of rows and the fixed random_state,
# so compute them once and reuse them for every feature.
split_indices = [test_index for _, test_index in ss.split(data)]
y_true = data[TARGET].to_numpy()
base_preds = data['preds'].to_numpy()

# get 20 random samples
base_line = [rmse(base_preds[idx], y_true[idx]) for idx in split_indices]

print('Starting permutation test for the train part')

feature_results = {}

for feature in MODEL_FEATURES:

    df = data.copy()

    # Positional shuffle of the column's own values: keeps the original dtype
    # (including categoricals and their categories), so no re-cast is needed.
    df[feature] = data[feature].values[rng.permutation(len(data))]

    df['sflpred'] = 0.0

    #get predictions after permutation
    for store_id in STORES_IDS:
        mask = store_masks[store_id]
        df.loc[mask, 'sflpred'] = estimators[store_id].predict(df.loc[mask, MODEL_FEATURES])

    # get 20 random samples
    shuffled_preds = df['sflpred'].to_numpy()
    shufled_pred_for_feature = [rmse(shuffled_preds[idx], y_true[idx]) for idx in split_indices]

    if np.array_equal(shufled_pred_for_feature, base_line):
        # The model ignores this feature: every paired difference is zero.
        # wilcoxon() raises (or returns NaN, depending on the scipy version)
        # in that case, so report "no difference" explicitly.
        stat, pvalue = float('nan'), 1.0
    else:
        stat, pvalue = wilcoxon(shufled_pred_for_feature, base_line)

    if pvalue > alpha:
        res = 'Not Significant' # same distribution => no impact
    else:
        res = 'Significant' # different distribution => have impact

    print('[Score baseline=%.4f Score shfl=%.4f]' % (sum(base_line)/len(base_line),  sum(shufled_pred_for_feature)/len(shufled_pred_for_feature)),
    '[Statistics=%.4f, p=%.4f]' % (stat, pvalue), f'[{res}]', f'[{feature}]')

    feature_results[feature] = res

### Get Permutation - Wilcoxon results for the test part

In [None]:
# Stores to evaluate. The full list is kept for reference; the second
# assignment deliberately restricts the run to one store per state.
STORES_IDS = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']
STORES_IDS = ['CA_1', 'TX_1', 'WI_1'] #to speed-up
TARGET = 'demand'
from sklearn.model_selection import ShuffleSplit
from scipy.stats import mannwhitneyu
from scipy.stats import wilcoxon

# random sampling: 20 random 25% subsets, reused for baseline and shuffled scores
ss = ShuffleSplit(n_splits=20, test_size=0.25, random_state=42)
# Seeded generator so that the feature permutations are reproducible on re-run.
rng = np.random.RandomState(42)
alpha = 0.05

data = get_base_test()
data['store_id'] = data['store_id'].astype('category')
data = data[data.store_id.isin(STORES_IDS)]
data = data[data.date>'2016-03-27'] # rows after the train cut-off date
# Fresh 0..n-1 index (and a fresh frame) so positional split indices line up.
data = data.reset_index(drop=True)
# float64 like 'sflpred' below: a float16 baseline column would round the
# baseline predictions but not the shuffled ones and bias the paired test.
data['preds'] = 0.0

estimators = {}

# Load one fitted model per store.
# NOTE: pickle.load executes arbitrary code; only use on trusted model files.
for store_id in STORES_IDS:
    model_path = '/kaggle/input/dark-magic-baseline/lgb_model_'+str(store_id)+'.bin'
    with open(model_path, 'rb') as model_file:
        estimators[store_id] = pickle.load(model_file)

# Row masks per store, taken from the TRUE (unpermuted) store_id. Using the
# permuted df.store_id here would send rows to another store's model when
# 'store_id' itself is the shuffled feature, unlike the train-part cell.
store_masks = {store_id: (data['store_id'] == store_id).to_numpy() for store_id in STORES_IDS}

# Base predictions without permutation
# (.loc instead of chained data['preds'][mask] = ..., which may write to a copy)
for store_id in STORES_IDS:
    mask = store_masks[store_id]
    data.loc[mask, 'preds'] = estimators[store_id].predict(data.loc[mask, MODEL_FEATURES])

# The splits depend only on the number of rows and the fixed random_state,
# so compute them once and reuse them for every feature.
split_indices = [test_index for _, test_index in ss.split(data)]
y_true = data[TARGET].to_numpy()
base_preds = data['preds'].to_numpy()

base_line = [rmse(base_preds[idx], y_true[idx]) for idx in split_indices]

print('Starting permutation test for validation part')

feature_pred_results = {}

for feature in MODEL_FEATURES:

    df = data.copy()

    # Positional shuffle of the column's own values: keeps the original dtype
    # (including categoricals and their categories), so no re-cast is needed.
    df[feature] = data[feature].values[rng.permutation(len(data))]

    df['sflpred'] = 0.0

    # Predictions after permutation
    for store_id in STORES_IDS:
        mask = store_masks[store_id]
        df.loc[mask, 'sflpred'] = estimators[store_id].predict(df.loc[mask, MODEL_FEATURES])

    shuffled_preds = df['sflpred'].to_numpy()
    shufled_pred_for_feature = [rmse(shuffled_preds[idx], y_true[idx]) for idx in split_indices]

    if np.array_equal(shufled_pred_for_feature, base_line):
        # The model ignores this feature: every paired difference is zero.
        # wilcoxon() raises (or returns NaN, depending on the scipy version)
        # in that case, so report "no difference" explicitly.
        stat, pvalue = float('nan'), 1.0
    else:
        stat, pvalue = wilcoxon(shufled_pred_for_feature, base_line)

    if pvalue > alpha:
        res = 'Not Significant' # same distribution => no impact
    else:
        res = 'Significant' # different distribution => have impact

    print('[Score baseline=%.4f Score shfl=%.4f]' % (sum(base_line)/len(base_line),  sum(shufled_pred_for_feature)/len(shufled_pred_for_feature)),
    '[Statistics=%.4f, p=%.4f]' % (stat, pvalue), f'[{res}]', f'[{feature}]')

    feature_pred_results[feature] = res

### Determine overfitting features

In [None]:
# Print, for each tested feature, its train-part verdict next to its
# test-part verdict so the two can be compared line by line.
for feat, pr in feature_pred_results.items():
    tr = feature_results[feat]
    print(f'train [{tr}] pred [{pr}] [{feat}]')