In [None]:
# --------------------------------------- 
# PYTHON IMPORTS
# --------------------------------------- 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import matplotlib.pyplot as plt
%matplotlib inline 

from scipy import stats
import numpy as np

from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBRegressor

In [None]:
# --------------------------------------- 
# LOAD COMPETITION DATA
# --------------------------------------- 

In [None]:
DATA_FOLDER = '/kaggle/input/competitive-data-science-predict-future-sales'

train           = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv'))
test            = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv'))
submission      = pd.read_csv(os.path.join(DATA_FOLDER, 'sample_submission.csv'))
items           = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
item_cats       = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops           = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))

#convert data to dataTime format
train['dateTime'] = pd.to_datetime(train['date'], format='%d.%m.%Y')

In [None]:
# --------------------------------------- 
# DATA EXPLORATION
# --------------------------------------- 

In [None]:
# The competition is about predicting november 2015 sales, per item-shop pairs.

# The training set however describe all sales for a few years back (from jan-2013 to oct-2015).
# let's take a closer look at the range of the time serie we have as a train set.

In [None]:
train.groupby(train["dateTime"].dt.year).count().plot(kind="bar")

In [None]:
#per year chart
train15 = train[(train['dateTime'].dt.year == 2015)]
train14 = train[(train['dateTime'].dt.year == 2014)]
train13 = train[(train['dateTime'].dt.year == 2013)]

train15.groupby(train15["dateTime"].dt.month).count().plot(kind="bar")
train14.groupby(train14["dateTime"].dt.month).count().plot(kind="bar")
train13.groupby(train13["dateTime"].dt.month).count().plot(kind="bar")

In [None]:
#Let's now see what the test set is about.
test.head()

In [None]:
#As test set do not contain any date and train test stop at october 2015 we can assume that we are tasked to predict november 2015.
#and test is a mapping from ID to [shop_id,item_id] usefull to submit prediction as `submission` is indexed by ID.


In [None]:
#Let's explore a bit more, around the pair from test set:
transactionsWithCategory = pd.merge(train, items, on='item_id', how='left')

transactionsWithCategory = transactionsWithCategory.drop(columns='item_name') # dropping item_name it is redundant with item_id.
transactionsWithCategory = transactionsWithCategory.drop(columns='date')      # dropping data it is redundant dateTime.
transactionsWithCategory = transactionsWithCategory.drop(columns='date_block_num') # dropping date_block_num it is probably usefull but let's select our fight to gain some time.

transactionsWithCategory.hist(column='shop_id')
transactionsWithCategory.hist(column='item_id')

In [None]:
print('num shops', transactionsWithCategory.groupby('shop_id')['shop_id'].nunique().shape)
print('num items', transactionsWithCategory.groupby('item_id')['item_id'].nunique().shape)

In [None]:
#Both 'shop_id' and 'item_id' are categorical feature, we got 60 unique shop and 21807 unique item.

#Thus it make sense to one hot encode shop_id (we will do that latter).
#However item_id would require more care for memory and performance reasons and won't be adress in this notebook for time reasons.

In [None]:
#Let's now look at prices and volumes of sales
transactionsWithCategory.hist(column='item_price')
transactionsWithCategory.hist(column='item_cnt_day')

In [None]:
#It appear that the distribution is quite spiky in both case:
#- A lot of item are in the range of 0-50000 price while few goes up to 300000, this could be either bad data or very rare case.
#--> It might be interresting to use a log scale, run two different network or simply remove the outlier here.

#- A lot of item are sold less than 200 time a day while very few 2000 time a day, this could be either bad data or very rare case.
#--> It might be interresting to see if separating the outliers would have a positive impact on prediction here, if so 2 network is a posibility.

In [None]:
#Let's define a function to clean transaction outlier based on Z-score.
def removeOutlierUsingZScore(dataToClean, zScoreThreshold, columNameToClean):
    z = np.abs(stats.zscore(dataToClean[columNameToClean]))
    dataCleaned = dataToClean[(z < zScoreThreshold)]
    removedItems = dataToClean[(z >= zScoreThreshold)]
    return dataCleaned, removedItems

In [None]:
zScorePrice = 5
zScoreCount = 50

#Let's clean the data using some heuristic (manually chosen z-score)
#The heuristic is arbitrary and might create problem for the items we remove, however we will see latter that it is beneficial overall.
transactionCleanPrice, transactionOutlierPrice = removeOutlierUsingZScore(transactionsWithCategory, zScorePrice, 'item_price')
transactionCleanCount, transactionOutlierCount = removeOutlierUsingZScore(transactionsWithCategory, zScoreCount, 'item_cnt_day')
transactionClean, transactionOutlier = removeOutlierUsingZScore(transactionCleanPrice, zScoreCount, 'item_cnt_day')

transactionOutlier = pd.concat([transactionOutlier, transactionOutlierCount])

In [None]:
#Let's see what we did:
print('Source shape:',transactionsWithCategory.shape)

print('------------------------------')
print('Cleaned shape:',transactionClean.shape)
transactionClean.hist(column='item_price')
transactionClean.hist(column='item_cnt_day')

In [None]:
print('Outlier shape:',transactionOutlier.shape)
transactionOutlier.hist(column='item_price')
transactionOutlier.hist(column='item_cnt_day')

In [None]:
#This look like much better distributions, great!

In [None]:
# --------------------------------------- 
# FEATURES GENERATION - STEP1 - Generating 10months laggy feature for item count and price.
# --------------------------------------- 

In [None]:
#We have seen that we need to predict november 2015 sales (at least we assume so based on data exploration).

#An option would thus be to train a model to predict november sales using stats from current year from january to october.
#this mean that we will train on jan-oct for year 2013 and 2014 were we have november sales and evaluate on jan-oct year 2015 were we only have jan-oct data.

#We will see latter in the notebook that generating more laggy data (aka 4 months, 6 months etc) is also gonna be helpfull.
#PS: It would probably be interesting to try to capture the trend from years to years to extrapolate better from 2013-2014 to 215 however i did not tied.

In [None]:
transactionClean.head()

In [None]:
#Let's collect all data for each year from 1st janurary to end of october sales.

#PS: nomenclature here is train_<year>_<c for clean/o for outlier>_<numberOfMonthForLaggyData>

train_15_c_10 = transactionClean[(transactionClean['dateTime'].dt.year == 2015) & (transactionClean['dateTime'].dt.month <= 10)]
train_14_c_10 = transactionClean[(transactionClean['dateTime'].dt.year == 2014) & (transactionClean['dateTime'].dt.month <= 10)]
train_13_c_10 = transactionClean[(transactionClean['dateTime'].dt.year == 2013) & (transactionClean['dateTime'].dt.month <= 10)]
train_15_o_10 = transactionOutlier[(transactionOutlier['dateTime'].dt.year == 2015) & (transactionOutlier['dateTime'].dt.month <= 10)]
train_14_o_10 = transactionOutlier[(transactionOutlier['dateTime'].dt.year == 2014) & (transactionOutlier['dateTime'].dt.month <= 10)]
train_13_o_10 = transactionOutlier[(transactionOutlier['dateTime'].dt.year == 2013) & (transactionOutlier['dateTime'].dt.month <= 10)]

In [None]:
#Here is an helper function to generate laggy feature (aka how many item per shop were sold for a given time range, for example jan-oct)

In [None]:
def prepareLaggyData(dataName, dataOnPeriod, testData, submissionData):
    print('-------------------------')
    print(dataName)
    print('dataOnPeriod', dataOnPeriod.shape, '\n')
    
    #Dropping date we will agregate on the period and loss it anyway.
    dataOnPeriod = dataOnPeriod.drop(columns='dateTime')
    
    #Only keep ['item_id','shop_id'] pairs that exist in the testData, rest are probably items not sold anymore or
    #not existing shop, and we have no way to map them to an ID for submission.
    dataOnPeriodWithExistingID = pd.merge(dataOnPeriod, testData, on=['item_id','shop_id'])
    print('dataOnPeriodWithExistingID', dataOnPeriodWithExistingID.shape)
    
    #Now let's generate our laggy features -->
    
    #Total amount of sell (per ID)
    pairCnt = dataOnPeriodWithExistingID[['ID','item_cnt_day']]
    pairCntSummed = pairCnt.groupby('ID').sum().astype(int)
    pairCntSummed = pairCntSummed.rename(columns={"item_cnt_day": "item_cnt_laggy"})
    print('pairCntSummed', pairCntSummed.shape)
    numNan = pairCntSummed.isna().sum()["item_cnt_laggy"]
    if (numNan != 0):
        print('NAN FOUND! THIS NEED TO BE ADRESSED')
        
    #Avg price on period (per ID)
    pairPrice = dataOnPeriodWithExistingID[['ID','item_price']]
    pairPriceAvg = pairPrice.groupby('ID').mean().astype(float)
    pairPriceAvg = pairPriceAvg.rename(columns={"item_price": "item_price_laggy"})
    print('pairPriceAvg', pairPriceAvg.shape)
    numNan = pairPriceAvg.isna().sum()["item_price_laggy"]
    if (numNan != 0):
        print('NAN FOUND! THIS NEED TO BE ADRESSED')
        
    #Create laggy feature so it all ID from submission (fill NaNs from missing rows with 0s as int)
    submissionID = submissionData['ID'].to_frame(name='ID')
    laggyFeatures = submissionID.merge(pairCntSummed, on = 'ID', how='left')
    laggyFeatures = laggyFeatures.merge(pairPriceAvg, on = 'ID', how='left')
    numNan2 = laggyFeatures.isna().sum()['item_cnt_laggy']
    print('source data covered:',100-(int)(100*numNan2/submissionID.shape[0]), '% of submission IDs, rest will be filled with 0s.')
    finalLaggyFeatures = laggyFeatures.fillna(0.0).astype(int)
    print('finalLaggyFeatures', finalLaggyFeatures.shape)
    return finalLaggyFeatures
    

In [None]:
#PS: nomenclature here is trainLaggy_<year>_<c for clean/o for outlier>_<numberOfMonthForLaggyData>
trainLaggy_15_c_10 = prepareLaggyData('train_15_c_10', train_15_c_10, test, submission)
trainLaggy_14_c_10 = prepareLaggyData('train_14_c_10', train_14_c_10, test, submission)
trainLaggy_13_c_10 = prepareLaggyData('train_13_c_10', train_13_c_10, test, submission)
trainLaggy_15_o_10 = prepareLaggyData('train_15_o_10', train_15_o_10, test, submission)
trainLaggy_14_o_10 = prepareLaggyData('train_14_o_10', train_14_o_10, test, submission)
trainLaggy_13_o_10 = prepareLaggyData('train_13_o_10', train_13_o_10, test, submission)

In [None]:
#Few really interresting thing here:
#* We don't have data for all items
#--> A lot of them were probably neither sold by for the [item_id,shop_id] combination.
#PS: An interresting experiment would be to see if each year cover a different part of the submission IDs and use this 
#    info to generate better prediction using coverage based logic when stacking models, we won't do this in this notebook however.

#* We can also see that year 2015 cover more IDs than 2014, itself having a better cover than 2013.
#--> It is possible that some item/shop pair we are trying to predict in 2015 were not existing in the past.

#* We can see that removed outliers account for a very low percent of the submission ideas
#--> We can probably discard this data completely.

In [None]:
def countCategoricalFeatureUniqueItems(dataName, data):
    print('---------', dataName, '----------')
    print('num shops', data.groupby('shop_id')['shop_id'].nunique().shape)
    print('num category', data.groupby('item_category_id')['item_category_id'].nunique().shape)
    print('num item', data.groupby('item_id')['item_id'].nunique().shape)

In [None]:
def enrichLaggyData(data, test, items):
    tmpData = data.merge(test, on = 'ID', how='left')
    tmpData = tmpData.merge(items, on='item_id', how='left')
    tmpData = tmpData.drop(columns="item_name")
    return tmpData

In [None]:
#Let's add back some data to those laggy features.
trainLaggy_15_c_10_tmp = enrichLaggyData(trainLaggy_15_c_10, test, items)
trainLaggy_14_c_10_tmp = enrichLaggyData(trainLaggy_14_c_10, test, items)
trainLaggy_13_c_10_tmp = enrichLaggyData(trainLaggy_13_c_10, test, items)

#Let's see it if make sense to convert categorical features using simple one hot encoding.
countCategoricalFeatureUniqueItems('trainLaggy_15_c_10', trainLaggy_15_c_10_tmp)
countCategoricalFeatureUniqueItems('trainLaggy_14_c_10', trainLaggy_14_c_10_tmp)
countCategoricalFeatureUniqueItems('trainLaggy_13_c_10', trainLaggy_13_c_10_tmp)

In [None]:
#Ok it does for shop and category, as seen before item_id would require more care and i'm letting it as an int for time reasons.
#Let's define an helper for this:
def oneHotEncodeCategoricalFeature(data, categoryName):
    onehot = pd.get_dummies(data[categoryName], prefix=categoryName)
    dataWithOneHot = pd.concat([data, onehot], 1)
    dataWithOneHot = dataWithOneHot.drop(columns=categoryName)
    return dataWithOneHot
    


In [None]:
trainLaggy_15_c_10_tmp = oneHotEncodeCategoricalFeature(trainLaggy_15_c_10_tmp, 'shop_id')
trainLaggy_14_c_10_tmp = oneHotEncodeCategoricalFeature(trainLaggy_14_c_10_tmp, 'shop_id')
trainLaggy_13_c_10_tmp = oneHotEncodeCategoricalFeature(trainLaggy_13_c_10_tmp, 'shop_id')
trainLaggy_15_c_10_final = oneHotEncodeCategoricalFeature(trainLaggy_15_c_10_tmp, 'item_category_id')
trainLaggy_14_c_10_final = oneHotEncodeCategoricalFeature(trainLaggy_14_c_10_tmp, 'item_category_id')
trainLaggy_13_c_10_final = oneHotEncodeCategoricalFeature(trainLaggy_13_c_10_tmp, 'item_category_id')

In [None]:
#Let's see what we have now:
trainLaggy_15_c_10_final.head()

In [None]:
# --------------------------------------- 
# PREDICTION/SUBMISSION - STEP1/2 - Using 10 month laggy data.
# --------------------------------------- 

In [None]:
#Let's add november target predictions to 2013 and 2014 laggy data, so we can fit a model on them.
transactionNov14 = train[(train['dateTime'].dt.year == 2014) & (train['dateTime'].dt.month == 11)]
transactionNov13 = train[(train['dateTime'].dt.year == 2013) & (train['dateTime'].dt.month == 11)]
targetPredictionNov14 = prepareLaggyData('transactionNov14', transactionNov14, test, submission).drop(columns='item_price_laggy')
targetPredictionNov13 = prepareLaggyData('transactionNov13', transactionNov13, test, submission).drop(columns='item_price_laggy')

targetPredictionNov14 = targetPredictionNov14.rename(columns={'item_cnt_laggy':'item_cnt_month'})
targetPredictionNov13 = targetPredictionNov13.rename(columns={'item_cnt_laggy':'item_cnt_month'})

targetPredictionNov14.head()

In [None]:
#We can see that a lot of item were not sold on november, however this is fine as we are tasked into predicting exactly that (aka november sales).

In [None]:
#For now let's have two models one for year 13 and one for year 14.
model13_10months = XGBRegressor()
model14_10months = XGBRegressor()

model13_10months.fit(trainLaggy_13_c_10_final, targetPredictionNov13['item_cnt_month'])
model14_10months.fit(trainLaggy_14_c_10_final, targetPredictionNov14['item_cnt_month'])

prediction15From13_10months = model13_10months.predict(trainLaggy_15_c_10_final)
prediction15From14_10months = model14_10months.predict(trainLaggy_15_c_10_final)

print(prediction15From13_10months.shape)
print(prediction15From14_10months.shape)

#For now final prediction for 2015 will be the average from both model(one trained on 2013 and one trained on 2014 data).
prediction15_10months = (prediction15From13_10months + prediction15From14_10months) / 2

submission.head()

In [None]:
#Let's try a submission with the above.
submission['item_cnt_month'] = prediction15_10months
submission.head()
submission.to_csv("prediction15_10months.csv", index= False)
#score of 1.95

In [None]:
#Let's try a submission with clamp value from the course / the evaluation page of the competition.
submissionClipped = submission
submissionClipped['item_cnt_month'] = submissionClipped['item_cnt_month'].clip(0,20)
submissionClipped.to_csv("prediction15_10months_clipped.csv", index= False)
#score of 1.21, it is thus very important to always clip the result to match what we are tested on.

In [None]:
# --------------------------------------- 
# CROSS VALIDATION STEP 1/3 - Linear stacking of model from 2013 and 2014 datas
# --------------------------------------- 

In [None]:
#To do cross validation we will try to do exactly as above (aka 10month laggy on november), but instead evaluate on october 2015.
#the reason is that we can have exactly the same process however we actually have october data for 2015. Also this should prevent
#overfitting to the public test set from the competition submissions as we don't actually use those value but rather optimize for
#the general scheme of the competition (well on october so we can still overfit to october, but well time is limited so i will take this risk).


In [None]:
#target prediction for 2013-2015
transactionOct15 = train[(train['dateTime'].dt.year == 2015) & (train['dateTime'].dt.month == 10)]
transactionOct14 = train[(train['dateTime'].dt.year == 2014) & (train['dateTime'].dt.month == 10)]
transactionOct13 = train[(train['dateTime'].dt.year == 2013) & (train['dateTime'].dt.month == 10)]
targetPredictionOct15 = prepareLaggyData('transactionOct15', transactionOct15, test, submission).drop(columns='item_price_laggy')
targetPredictionOct14 = prepareLaggyData('transactionOct14', transactionOct14, test, submission).drop(columns='item_price_laggy')
targetPredictionOct13 = prepareLaggyData('transactionOct13', transactionOct13, test, submission).drop(columns='item_price_laggy')
targetPredictionOct15 = targetPredictionOct15.rename(columns={'item_cnt_laggy':'item_cnt_month'})
targetPredictionOct14 = targetPredictionOct14.rename(columns={'item_cnt_laggy':'item_cnt_month'})
targetPredictionOct13 = targetPredictionOct13.rename(columns={'item_cnt_laggy':'item_cnt_month'})


In [None]:
#Let's first define an helper method based on the above work:
def prepareTrainData(name, year, monthStart, monthStop):
    trainDataTmp = transactionClean[(transactionClean['dateTime'].dt.year == year) & (transactionClean['dateTime'].dt.month <= monthStop) & (transactionClean['dateTime'].dt.month >= monthStart)]
    trainDataTmp = prepareLaggyData(name, trainDataTmp, test, submission)
    trainDataTmp = enrichLaggyData(trainDataTmp, test, items)
    trainDataTmp = oneHotEncodeCategoricalFeature(trainDataTmp, 'shop_id')
    trainDataTmp = oneHotEncodeCategoricalFeature(trainDataTmp, 'item_category_id')
    return trainDataTmp
    

In [None]:
crossTrain_15_c_9 = prepareTrainData('crossTrain_15_c_9',2015,1,9)
crossTrain_14_c_9 = prepareTrainData('crossTrain_14_c_9',2014,1,9)
crossTrain_13_c_9 = prepareTrainData('crossTrain_13_c_9',2013,1,9)

In [None]:
#Let's also have a bit more helper methods:
def fitXGBR(data, target):
    modelTmp = XGBRegressor()
    modelTmp.fit(data, target['item_cnt_month'])
    return modelTmp

def testSubmission(prediction, target):
    targetClipped = target
    targetClipped['item_cnt_month'] = targetClipped['item_cnt_month'].clip(0,20)
    
    submissionClipped = submission
    submissionClipped['item_cnt_month'] = prediction
    submissionClipped['item_cnt_month'] = submissionClipped['item_cnt_month'].clip(0,20)
   
    rmse = ((submission - targetClipped) ** 2).mean() ** .5
    return rmse['item_cnt_month']

In [None]:
oct_model14_9months = fitXGBR(crossTrain_14_c_9, targetPredictionOct14)
oct_model13_9months = fitXGBR(crossTrain_13_c_9, targetPredictionOct13)


In [None]:
oct_prediction15From13_9months = oct_model14_9months.predict(crossTrain_15_c_9)
oct_prediction15From14_9months = oct_model13_9months.predict(crossTrain_15_c_9)

oct_prediction15_9months = (oct_prediction15From13_9months + oct_prediction15From14_9months) / 2

rmse = testSubmission(oct_prediction15_9months, targetPredictionOct15)
print("RMSE = ", rmse)

In [None]:
#Let's try to see if we can find better than avg of year 2013 and 2014.
rmse13 = testSubmission(oct_prediction15From13_9months, targetPredictionOct15)
rmse14 = testSubmission(oct_prediction15From14_9months, targetPredictionOct15)
print("1.0 / 2014 -> RMSE = ", rmse14)
print("0.0 / 2013 -> RMSE = ", rmse13)

print("----------")
for i in range(0,10):
    w = i/10.0
    testpred  = oct_prediction15From13_9months * w
    testpred += oct_prediction15From14_9months * (1-w)
    rmse = testSubmission(testpred, targetPredictionOct15)
    print(w,"-> RMSE = ", rmse)

In [None]:
print("---------- (range 0.7-0.9)")
for i in range(0,20):
    w = i/100.0 + 0.7
    testpred  = oct_prediction15From13_9months * w
    testpred += oct_prediction15From14_9months * (1-w)
    rmse = testSubmission(testpred, targetPredictionOct15)
    print(w,"-> RMSE = ", rmse)

In [None]:
#So the best way to linearly interpolate between those two model seems to be : 2013*w+2014*(1-w) with W = 0.85.
#Let's define a fast helper then:
def getFinalPred(prediction15From13,prediction15From14):
    w = 0.85
    return prediction15From13 * w + prediction15From14 * (1-w)

print(testSubmission(getFinalPred(oct_prediction15From13_9months,oct_prediction15From14_9months), targetPredictionOct15))

In [None]:
# --------------------------------------- 
# FEATURES GENERATION - STEP2/2 - Adding 7months, 4months and 1 month laggy price and count
# --------------------------------------- 

In [None]:
def generateExtraLaggyFeature(name, year, monthStart, monthStop):
    trainDataTmp = transactionClean[(transactionClean['dateTime'].dt.year == year) & (transactionClean['dateTime'].dt.month <= monthStop) & (transactionClean['dateTime'].dt.month >= monthStart)]
    trainDataTmp = prepareLaggyData(name, trainDataTmp, test, submission)
    trainDataTmp = trainDataTmp.rename(columns={'item_cnt_laggy':'item_cnt_'+name})
    trainDataTmp = trainDataTmp.rename(columns={'item_price_laggy':'item_price_'+name})
    return trainDataTmp


In [None]:
def addExtraLaggyFeatures(trainDataWithLaggy10month, year, monthStop):

    laggyFeature_7months = generateExtraLaggyFeature('7months',year, monthStop-7,monthStop)
    laggyFeature_4months = generateExtraLaggyFeature('4months',year, monthStop-4,monthStop)
    laggyFeature_1month  = generateExtraLaggyFeature('1month' ,year, monthStop,monthStop)

    trainAllLaggy = trainDataWithLaggy10month.merge(laggyFeature_7months, on = 'ID', how='left')
    trainAllLaggy = trainAllLaggy.merge(laggyFeature_4months, on = 'ID', how='left')
    trainAllLaggy = trainAllLaggy.merge(laggyFeature_1month, on = 'ID', how='left')
    return trainAllLaggy

In [None]:
#Let's try to have more generated feature for the models, aka various laggy length.
#before we had for 10 month: jan-oct aka 1-10
#let's add 7 months: avril-oct aka 4-10
#let's add 4 months: july-oct aka 7-10
#let's add 1 month: october aka 10

trainAllLaggy_13 = addExtraLaggyFeatures(trainLaggy_13_c_10_final, 2013, 10)
trainAllLaggy_14 = addExtraLaggyFeatures(trainLaggy_14_c_10_final, 2014, 10)
trainAllLaggy_15 = addExtraLaggyFeatures(trainLaggy_15_c_10_final, 2015, 10)

In [None]:
def clipPrediction(pred):
    return pd.DataFrame(pred, columns=["item_cnt_month"]).clip(0,20)

In [None]:
#Let's fit those new training sets, and predict on clipped target value (as anyway we want to predict in that range).
targetPredictionNov14Clipped = clipPrediction(targetPredictionNov14) 
targetPredictionNov13Clipped = clipPrediction(targetPredictionNov13) 
model14_AllLaggy = fitXGBR(trainAllLaggy_13, targetPredictionNov14Clipped)
model13_AllLaggy = fitXGBR(trainAllLaggy_14, targetPredictionNov13Clipped)

In [None]:
#And predict 2015 from the 2 new models.
prediction15From13_AllLaggy = model13_AllLaggy.predict(trainAllLaggy_15)
prediction15From14_AllLaggy = model14_AllLaggy.predict(trainAllLaggy_15)
prediction15From13_AllLaggyClipped = clipPrediction(prediction15From13_AllLaggy)
prediction15From14_AllLaggyClipped = clipPrediction(prediction15From14_AllLaggy)

In [None]:
predAllLaggy = getFinalPred(prediction15From13_AllLaggyClipped,prediction15From14_AllLaggyClipped)
submission['item_cnt_month'] = predAllLaggy
submission.to_csv("allLagSubTest.csv", index= False)
# score 1.20790

In [None]:
def dropNonItemCountFeature(data):
    dataSimple = data.drop(columns='ID')
    dataSimple = dataSimple.drop(columns=list(dataSimple.filter(regex='item_id')))
    dataSimple = dataSimple.drop(columns=list(dataSimple.filter(regex='shop_id')))
    dataSimple = dataSimple.drop(columns=list(dataSimple.filter(regex='item_category_id')))
    dataSimple = dataSimple.drop(columns=list(dataSimple.filter(regex='item_price')))
    return dataSimple

In [None]:
#Let's see if a simpler dataset could give better result.
trainAllLaggy_15_simple = dropNonItemCountFeature(trainAllLaggy_15)
trainAllLaggy_14_simple = dropNonItemCountFeature(trainAllLaggy_14)
trainAllLaggy_13_simple = dropNonItemCountFeature(trainAllLaggy_13)

In [None]:
model14_AllLaggySimple = fitXGBR(trainAllLaggy_13_simple, targetPredictionNov13Clipped)
model13_AllLaggySimple = fitXGBR(trainAllLaggy_14_simple, targetPredictionNov14Clipped)
prediction15From13_AllLaggySimple = model13_AllLaggySimple.predict(trainAllLaggy_15_simple)
prediction15From14_AllLaggySimple = model14_AllLaggySimple.predict(trainAllLaggy_15_simple)

In [None]:
prediction15From13_AllLaggySimpleClipped = clipPrediction(prediction15From13_AllLaggySimple)
prediction15From14_AllLaggySimpleClipped = clipPrediction(prediction15From14_AllLaggySimple)

In [None]:
predAllLaggySimple = getFinalPred(prediction15From13_AllLaggySimpleClipped,prediction15From14_AllLaggySimpleClipped)
submission['item_cnt_month'] = predAllLaggySimple
submission.to_csv("allLaggySimpleSubTest.csv", index= False)
#Score 1.03724  --> something in the extra data is preventing the model to perform well, interresting!

In [None]:
# --------------------------------------- 
# CROSS VALIDATION STEP 2/3 - Selecting the best features set.
# ---------------------------------------

In [None]:
trainLaggy_13_c_10_final.head()

In [None]:
#Let's use cross validation to find what is the best features to use, all of them is not great but a subset is probably good.
crossTrain_AllLaggy_15 = addExtraLaggyFeatures(crossTrain_15_c_9, 2015, 9)
crossTrain_AllLaggy_14 = addExtraLaggyFeatures(crossTrain_14_c_9, 2014, 9)
crossTrain_AllLaggy_13 = addExtraLaggyFeatures(crossTrain_13_c_9, 2013, 9)
targetPredictionOct15Clipped = clipPrediction(targetPredictionOct15)
targetPredictionOct14Clipped = clipPrediction(targetPredictionOct14)
targetPredictionOct13Clipped = clipPrediction(targetPredictionOct13)

In [None]:
crossModel14_AllLaggy = fitXGBR(crossTrain_AllLaggy_14, targetPredictionOct14Clipped)
crossMode13_AllLaggy = fitXGBR(crossTrain_AllLaggy_13, targetPredictionOct13Clipped)

In [None]:
def trainPredictAndEvalute(ID=True, item_id=True, item_category=True, shop_id=True):
    #prepare data
    testData15 = crossTrain_AllLaggy_15
    testData14 = crossTrain_AllLaggy_14
    testData13 = crossTrain_AllLaggy_13
    if ID==False:
        testData15 = testData15.drop(columns=list(testData15.filter(regex='ID')))
        testData14 = testData14.drop(columns=list(testData14.filter(regex='ID')))
        testData13 = testData13.drop(columns=list(testData13.filter(regex='ID')))    
    if item_id==False:
        testData15 = testData15.drop(columns=list(testData15.filter(regex='item_id')))
        testData14 = testData14.drop(columns=list(testData14.filter(regex='item_id')))
        testData13 = testData13.drop(columns=list(testData13.filter(regex='item_id')))    
    if item_category==False:
        testData15 = testData15.drop(columns=list(testData15.filter(regex='item_category')))
        testData14 = testData14.drop(columns=list(testData14.filter(regex='item_category')))
        testData13 = testData13.drop(columns=list(testData13.filter(regex='item_category')))    
    if shop_id==False:
        testData15 = testData15.drop(columns=list(testData15.filter(regex='shop_id')))
        testData14 = testData14.drop(columns=list(testData14.filter(regex='shop_id')))
        testData13 = testData13.drop(columns=list(testData13.filter(regex='shop_id')))    
    
    #print(testData13.columns)
    #train
    testModel14 = fitXGBR(testData14, targetPredictionOct14Clipped)
    testModel13 = fitXGBR(testData13, targetPredictionOct13Clipped)
    #predict
    pred13 = testModel14.predict(testData15)
    pred14 = testModel13.predict(testData15)
    pred = getFinalPred(pred13, pred14)
    #rmse
    rmse = testSubmission(pred, targetPredictionOct15Clipped)
    print("ID=", ID, "/ item_id=", item_id, "/ item_category=", item_category, "/ shop_id=", shop_id, " ==> RMSE: ", rmse)
    

In [None]:
#With all extra feature 
#trainPredictAndEvalute()

#^ Commented as quite slow to compute, output is:
#ID= True / item_id= True / item_category= True / shop_id= True  ==> RMSE:  0.9492638952203369

In [None]:
#No extra data
#trainPredictAndEvalute(ID=False, item_id=False, item_category=False, shop_id=False)

#Commented as quite slow to compute, output is:
#ID= False / item_id= False / item_category= False / shop_id= False  ==> RMSE:  0.9273928270147149

In [None]:
#One extra feature
#trainPredictAndEvalute(ID=True , item_id=False, item_category=False, shop_id=False)
#trainPredictAndEvalute(ID=False, item_id=True , item_category=False, shop_id=False)
#trainPredictAndEvalute(ID=False, item_id=False, item_category=True , shop_id=False)
#trainPredictAndEvalute(ID=False, item_id=False, item_category=False, shop_id=True )

#^ Commented as quite slow to compute, output is:
#ID= True / item_id= False / item_category= False / shop_id= False  ==> RMSE:  0.9168895917744182
#ID= False / item_id= True / item_category= False / shop_id= False  ==> RMSE:  0.9573129684418232
#ID= False / item_id= False / item_category= True / shop_id= False  ==> RMSE:  0.9128995565549166
#ID= False / item_id= False / item_category= False / shop_id= True  ==> RMSE:  0.9149498618412979

In [None]:
#Two extra features
#trainPredictAndEvalute(ID=False, item_id=False, item_category=True , shop_id=True )
#trainPredictAndEvalute(ID=False, item_id=True , item_category=False, shop_id=True )
#trainPredictAndEvalute(ID=True , item_id=False, item_category=False, shop_id=True )
#trainPredictAndEvalute(ID=False, item_id=True , item_category=True , shop_id=False)
#trainPredictAndEvalute(ID=True , item_id=False, item_category=True , shop_id=False)
#trainPredictAndEvalute(ID=True , item_id=True , item_category=False, shop_id=False)

#^ Commented as quite slow to compute, output is:
#ID= False / item_id= False / item_category= True / shop_id= True  ==> RMSE:  0.9037315880761034
#ID= False / item_id= True / item_category= False / shop_id= True  ==> RMSE:  0.9473394881388407
#ID= True / item_id= False / item_category= False / shop_id= True  ==> RMSE:  0.9136952566849936
#ID= False / item_id= True / item_category= True / shop_id= False  ==> RMSE:  0.957341665651518
#ID= True / item_id= False / item_category= True / shop_id= False  ==> RMSE:  0.9072651151695039
#ID= True / item_id= True / item_category= False / shop_id= False  ==> RMSE:  0.95130676158712

In [None]:
#3 extra features
#trainPredictAndEvalute(ID=True , item_id=True , item_category=True , shop_id=False)
#trainPredictAndEvalute(ID=True , item_id=True , item_category=False, shop_id=True )
#trainPredictAndEvalute(ID=True , item_id=False, item_category=True , shop_id=True )
#trainPredictAndEvalute(ID=False, item_id=True , item_category=True , shop_id=True )

#^ Commented as quite slow to compute, output is:
#ID= True / item_id= True / item_category= True / shop_id= False  ==> RMSE:  0.9567579927032015
#ID= True / item_id= True / item_category= False / shop_id= True  ==> RMSE:  0.9415430168587506
#ID= True / item_id= False / item_category= True / shop_id= True  ==> RMSE:  0.9123035362411145
#ID= False / item_id= True / item_category= True / shop_id= True  ==> RMSE:  0.9527056930511119

In [None]:
#It appears that the best configuration we can have is using `shop_id` and `item_category` features (ie dropping `ID` and `item_id`).
#That make sense as the kept features are the one that were one hot encoded properly!

#An interresting fact is that `ID` alone give quite a good result, this might be a sign that ID is not random and could potentially be investigated for data leakage (maybe category can be infer from it or something else?)

In [None]:
testData15 = crossTrain_AllLaggy_15
testData14 = crossTrain_AllLaggy_14
testData13 = crossTrain_AllLaggy_13
testData15 = testData15.drop(columns=list(testData15.filter(regex='ID')))
testData14 = testData14.drop(columns=list(testData14.filter(regex='ID')))
testData13 = testData13.drop(columns=list(testData13.filter(regex='ID')))    
testData15 = testData15.drop(columns=list(testData15.filter(regex='item_id')))
testData14 = testData14.drop(columns=list(testData14.filter(regex='item_id')))
testData13 = testData13.drop(columns=list(testData13.filter(regex='item_id')))    

crossTrain_AllLaggy_15_final = testData15
crossTrain_AllLaggy_14_final = testData14
crossTrain_AllLaggy_13_final = testData13

crossTrain_AllLaggy_13_final.head(2)

In [None]:
#Before moving further let's do the same check with our laggy data features:

In [None]:
def trainPredictAndEvaluateLaggyData(Count = True, Price=True):
    #prepare data
    testData15 = crossTrain_AllLaggy_15_final
    testData14 = crossTrain_AllLaggy_14_final
    testData13 = crossTrain_AllLaggy_13_final
    if Count==False:
        testData15 = testData15.drop(columns=list(testData15.filter(regex='cnt')))
        testData14 = testData14.drop(columns=list(testData14.filter(regex='cnt')))
        testData13 = testData13.drop(columns=list(testData13.filter(regex='cnt')))    
    if Price==False:
        testData15 = testData15.drop(columns=list(testData15.filter(regex='price')))
        testData14 = testData14.drop(columns=list(testData14.filter(regex='price')))
        testData13 = testData13.drop(columns=list(testData13.filter(regex='price')))    
    
    #train
    testModel14 = fitXGBR(testData14, targetPredictionOct14Clipped)
    testModel13 = fitXGBR(testData13, targetPredictionOct13Clipped)
    #predict
    pred13 = testModel14.predict(testData15)
    pred14 = testModel13.predict(testData15)
    pred = getFinalPred(pred13, pred14)
    #rmse
    rmse = testSubmission(pred, targetPredictionOct15Clipped)
    print("Cnt=", Count, "/ Price=", Price,"==> RMSE: ", rmse)

In [None]:
#trainPredictAndEvaluateLaggyData(Count = True , Price = True )
#trainPredictAndEvaluateLaggyData(Count = True , Price = False)
#trainPredictAndEvaluateLaggyData(Count = False, Price = True )
#trainPredictAndEvaluateLaggyData(Count = False, Price = False)

#Commented as quite slow to compute, output is:
#Cnt= True  / Price= True  ==> RMSE:  0.9037315880761034
#Cnt= True  / Price= False ==> RMSE:  0.911286786899765
#Cnt= False / Price= True  ==> RMSE:  1.0285608166824878
#Cnt= False / Price= False ==> RMSE:  1.0031688013402924

#Witch prove that both `count` and `price` laggy feature are usefull to the model. 

In [None]:
# --------------------------------------- 
# CROSS VALIDATION STEP 3/3 - Hyper parameters tunning.
# ---------------------------------------

In [None]:
#let's now try to optimize model hyper parameters on this data. For simplicity and speed we will only use testData13 as anyway final result is 85% of it.

In [None]:
#code snippet adapted from https://www.kaggle.com/jayatou/xgbregressor-with-gridsearchcv
#one could/should definitively push grid search for greater effect, time is the limiting factor here.
#regressorModel = XGBRegressor()
#parameters = {'learning_rate': [0.1, 0.2, 0.3],#aka `eta`, default is 0.3
#              'max_depth': [6, 7],# default is 6
#              'min_child_weight': [1,15],# default is 1
#              'subsample': [0.8, 1.0], # default is 1.0
#              'colsample_bytree': [0.8,1.0],# default is 1
#              'colsample_bylevel': [0.8,1.0]}# default is 1
#xgb_grid = GridSearchCV(regressorModel,
#                        parameters,
#                        n_jobs = 5,
#                        verbose=True)
#xgb_grid.fit(crossTrain_AllLaggy_13_final, targetPredictionOct13Clipped)
#
#print(xgb_grid.best_score_)
#print(xgb_grid.best_params_)

#Commented as quite slow to compute, output is:
#Fitting 5 folds for each of 96 candidates, totalling 480 fits
#[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
#[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed: 17.7min
#[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed: 89.3min
#[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed: 228.3min
#[Parallel(n_jobs=5)]: Done 480 out of 480 | elapsed: 253.4min finished
#0.6109913473710371
#{'colsample_bylevel': 0.8, 'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 6, 'min_child_weight': 15, 'subsample': 1.0}

In [None]:
# --------------------------------------- 
# PREDICTION/SUBMISSION STEP 2/2 - Final prediction and submission
# ---------------------------------------

In [None]:
def fitXGBROptimizedParameters(data, target):
    modelTmp = XGBRegressor(learning_rate=0.2, max_depth=6, min_child_weight=15, subsample=1.0, colsample_bytree=1.0, colsample_bylevel=0.8)
    modelTmp.fit(data, target['item_cnt_month'])
    return modelTmp

In [None]:
def onlyKeepBestFeatures(data):
    bestData = data.drop(columns=list(data.filter(regex='item_category_id')))
    bestData = bestData.drop(columns=list(bestData.filter(regex='item_price')))
    bestData = bestData.drop(columns=list(bestData.filter(regex='ID')))
    bestData = bestData.drop(columns=list(bestData.filter(regex='item_id')))
    return bestData

In [None]:
trainBestFeatures_13 = onlyKeepBestFeatures(trainAllLaggy_13)
trainBestFeatures_14 = onlyKeepBestFeatures(trainAllLaggy_14)
trainBestFeatures_15 = onlyKeepBestFeatures(trainAllLaggy_15)

In [None]:
model13_bestFeatures = fitXGBROptimizedParameters(trainBestFeatures_13, targetPredictionNov13Clipped)
model14_bestFeatures = fitXGBROptimizedParameters(trainBestFeatures_14, targetPredictionNov14Clipped)

In [None]:
prediction15From13_bestFeatures = model13_bestFeatures.predict(trainBestFeatures_15)
prediction15From14_bestFeatures = model14_bestFeatures.predict(trainBestFeatures_15)
prediction15From13_bestFeatures_clipped = clipPrediction(prediction15From13_bestFeatures)
prediction15From14_bestFeatures_clipped = clipPrediction(prediction15From14_bestFeatures)

In [None]:
predBestFeatures = getFinalPred(prediction15From13_bestFeatures_clipped,prediction15From14_bestFeatures_clipped)
submission['item_cnt_month'] = predBestFeatures
submission.to_csv("finalPredictionNoID.csv", index= False)
#Score 1.03345.

#I hope you enjoyed the read :)


In [None]:
#---------------------------------------------------------
# ANNEX/BONUS - Let's add more features and try other libs
#---------------------------------------------------------

In [None]:
from xgboost import plot_importance
plt.rcParams["figure.figsize"] = (15, 6)
plot_importance(model13_bestFeatures)
plt.show()

In [None]:
def addMoreExtraLaggyFeatures(trainDataToEnrich, year, monthStop):

    laggyFeature_1month1  = generateExtraLaggyFeature('01' ,year, monthStop-9,monthStop-9)
    laggyFeature_1month2  = generateExtraLaggyFeature('02' ,year, monthStop-8,monthStop-8)
    laggyFeature_1month3  = generateExtraLaggyFeature('03' ,year, monthStop-7,monthStop-7)
    laggyFeature_1month4  = generateExtraLaggyFeature('04' ,year, monthStop-6,monthStop-6)
    laggyFeature_1month5  = generateExtraLaggyFeature('05' ,year, monthStop-5,monthStop-5)
    laggyFeature_1month6  = generateExtraLaggyFeature('06' ,year, monthStop-4,monthStop-4)
    laggyFeature_1month7  = generateExtraLaggyFeature('07' ,year, monthStop-3,monthStop-3)
    laggyFeature_1month8  = generateExtraLaggyFeature('08' ,year, monthStop-2,monthStop-2)
    laggyFeature_1month9  = generateExtraLaggyFeature('09' ,year, monthStop-1,monthStop-1)
    
    trainDataToEnrich = trainDataToEnrich.merge(laggyFeature_1month1, on = 'ID', how='left')
    trainDataToEnrich = trainDataToEnrich.merge(laggyFeature_1month2, on = 'ID', how='left')
    trainDataToEnrich = trainDataToEnrich.merge(laggyFeature_1month3, on = 'ID', how='left')
    trainDataToEnrich = trainDataToEnrich.merge(laggyFeature_1month4, on = 'ID', how='left')
    trainDataToEnrich = trainDataToEnrich.merge(laggyFeature_1month5, on = 'ID', how='left')
    trainDataToEnrich = trainDataToEnrich.merge(laggyFeature_1month6, on = 'ID', how='left')
    trainDataToEnrich = trainDataToEnrich.merge(laggyFeature_1month7, on = 'ID', how='left')
    trainDataToEnrich = trainDataToEnrich.merge(laggyFeature_1month8, on = 'ID', how='left')
    trainDataToEnrich = trainDataToEnrich.merge(laggyFeature_1month9, on = 'ID', how='left')
    
    return trainDataToEnrich

In [None]:
trainAllLaggy_13 = addMoreExtraLaggyFeatures(trainAllLaggy_13,2013,11)
trainAllLaggy_14 = addMoreExtraLaggyFeatures(trainAllLaggy_14,2014,11)
trainAllLaggy_15 = addMoreExtraLaggyFeatures(trainAllLaggy_15,2015,11)

trainBestFeatures_13 = onlyKeepBestFeatures(trainAllLaggy_13)
trainBestFeatures_14 = onlyKeepBestFeatures(trainAllLaggy_14)
trainBestFeatures_15 = onlyKeepBestFeatures(trainAllLaggy_15)


In [None]:
model13_bestFeatures = fitXGBROptimizedParameters(trainBestFeatures_13, targetPredictionNov13Clipped)
model14_bestFeatures = fitXGBROptimizedParameters(trainBestFeatures_14, targetPredictionNov14Clipped)

In [None]:
trainBestFeatures_13.columns

In [None]:
from xgboost import plot_importance
plt.rcParams["figure.figsize"] = (15, 6)
plot_importance(model13_bestFeatures)
plt.show()

In [None]:
prediction15From13_bestFeatures = model13_bestFeatures.predict(trainBestFeatures_15)
prediction15From14_bestFeatures = model14_bestFeatures.predict(trainBestFeatures_15)
prediction15From13_bestFeatures_clipped = clipPrediction(prediction15From13_bestFeatures)
prediction15From14_bestFeatures_clipped = clipPrediction(prediction15From14_bestFeatures)

predBestFeatures = getFinalPred(prediction15From13_bestFeatures_clipped,prediction15From14_bestFeatures_clipped)
submission['item_cnt_month'] = predBestFeatures
submission.to_csv("finalPredictionMoreLaggy.csv", index= False)

In [None]:
#Let's try catboost

In [None]:
from catboost import Pool
from catboost import CatBoostRegressor

def fitCatBoost(data, target):
    modelTmp = CatBoostRegressor(iterations=500, max_ctr_complexity=4, random_seed=0, od_type='Iter', od_wait=25, verbose=50, depth=4)
    modelTmp.fit(data, target['item_cnt_month'])
    return modelTmp

In [None]:
model13_CatBoost = fitCatBoost(trainBestFeatures_13, targetPredictionNov13Clipped)
model14_CatBoost = fitCatBoost(trainBestFeatures_14, targetPredictionNov14Clipped)

In [None]:
prediction15From13_catboost = model13_CatBoost.predict(trainBestFeatures_15)
prediction15From14_catboost = model14_CatBoost.predict(trainBestFeatures_15)
prediction15From13_catboost_clipped = clipPrediction(prediction15From13_catboost)
prediction15From14_catboost_clipped = clipPrediction(prediction15From14_catboost)

predBestFeatures = getFinalPred(prediction15From13_catboost_clipped,prediction15From14_catboost_clipped)
submission['item_cnt_month'] = predBestFeatures
submission.to_csv("finalPredictionCatBoost.csv", index= False)
# Score 1.03543

In [None]:
#Let's try randomforest

In [None]:
from sklearn.ensemble import RandomForestRegressor

def fitRF(data, target):
    modelTmp = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1)
    modelTmp.fit(data, target['item_cnt_month'])
    return modelTmp

In [None]:
model13_RF = fitRF(trainBestFeatures_13, targetPredictionNov13Clipped)
model14_RF = fitRF(trainBestFeatures_14, targetPredictionNov14Clipped)

prediction15From13_rf = model13_RF.predict(trainBestFeatures_15)
prediction15From14_rf = model14_RF.predict(trainBestFeatures_15)
prediction15From13_rf_clipped = clipPrediction(prediction15From13_rf)
prediction15From14_rf_clipped = clipPrediction(prediction15From14_rf)

predBestFeatures = getFinalPred(prediction15From13_rf_clipped,prediction15From14_rf_clipped)
submission['item_cnt_month'] = predBestFeatures
submission.to_csv("finalPredictionRF.csv", index= False)