In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler,StandardScaler,OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error,auc

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the three hackathon CSV files from the Kaggle input directory:
#   train      - read from Train.csv (the target `UnitPrice` is taken from it later)
#   test       - read from Test.csv (rows to predict)
#   submission - read from "Sample Submission.csv"; its `UnitPrice` column is overwritten with predictions later
train = pd.read_csv('/kaggle/input/the-great-indian-hiring-hackathon/Participants_Data_TGIH/Train.csv')
test = pd.read_csv('/kaggle/input/the-great-indian-hiring-hackathon/Participants_Data_TGIH/Test.csv')
submission = pd.read_csv('/kaggle/input/the-great-indian-hiring-hackathon/Participants_Data_TGIH/Sample Submission.csv')

# Removing the outliers

During data exploration, some outliers were found in the `Quantity` column, so the rows containing them are removed here.

In [None]:
# Inter-quartile range of Quantity (computed for reference; the filter below
# uses the fixed bounds, not the IQR)
quantity = train['Quantity']
IQR = quantity.quantile(0.75) - quantity.quantile(0.25)

# Fixed cut-offs: rows with Quantity outside [lower, upper] are treated as outliers
lower = -10000
upper = 6000

# Boolean mask, True where Quantity is strictly above `upper` or strictly below `lower`
outliers = ((quantity > upper) | (quantity < lower)).to_numpy()

# Keep the non-outlier rows and discard exact duplicate rows (index is not reset)
train = train.loc[~outliers].drop_duplicates()

In [None]:
# Separate the regression target from the predictor columns
y = train['UnitPrice']
train_df = train.drop(columns='UnitPrice')

In [None]:
# Number of training rows; used later to split the combined matrix back into train / test
train_idx = train.shape[0]

# Combining train and test dataset

In [None]:
# Flag the origin of every row before the two frames are stacked
train_df['is_train'] = 1
test['is_train'] = 0

# Training rows first, then test rows, under a fresh 0..n-1 index
frames = [train_df, test]
df_combine = pd.concat(frames, ignore_index=True)
print(df_combine.head())

# Store customer ids as 64-bit integers
df_combine = df_combine.astype({'CustomerID': 'int64'})

# Feature Engineering

In [None]:
# customer_invoice_id: composite key "<CustomerID>_<InvoiceNo>"
df_combine['customer_invoice_id'] = (
    df_combine['CustomerID'].astype(str) + "_" + df_combine['InvoiceNo'].astype(str)
)

# visit_count: number of rows (non-null InvoiceNo entries) recorded for each customer
# in the combined train+test frame.
# The per-customer counts are mapped back onto the rows in one vectorised step; the
# previous chained assignment (`df[col].loc[idx] = value`) is not guaranteed to write
# through to the frame and re-scanned the whole frame once per customer.
total_visited = df_combine.groupby(by=['CustomerID'], as_index=False)['InvoiceNo'].count()
df_combine['visit_count'] = (
    df_combine['CustomerID']
    .map(total_visited.set_index('CustomerID')['InvoiceNo'])
    .fillna(0)          # keys absent from the grouping keep the original default of 0
    .astype('int64')
)

# is_frequent: 1 when visit_count is at least 52 (i.e. on average once a week over a
# full year), otherwise 0
df_combine['is_frequent'] = np.where(df_combine['visit_count'] >= 52, 1, 0)

# stock_order_count: number of rows (non-null InvoiceNo entries) recorded for each
# StockCode in the combined frame, i.e. how many times that stock item was ordered
order_count = df_combine.groupby(by=['StockCode'], as_index=False)['InvoiceNo'].count()
df_combine['stock_order_count'] = (
    df_combine['StockCode']
    .map(order_count.set_index('StockCode')['InvoiceNo'])
    .fillna(0)          # keys absent from the grouping keep the original default of 0
    .astype('int64')
)

df_combine.head()

Adding new features derived from the `InvoiceDate` column.

In [None]:
# Derive calendar and time-of-day features from the InvoiceDate column

# Parse the timestamp strings (expected layout: YYYY-MM-DD HH:MM:SS)
df_combine['InvoiceDate'] = pd.to_datetime(df_combine.InvoiceDate, format='%Y-%m-%d %H:%M:%S')

invoice_dt = df_combine['InvoiceDate'].dt

df_combine['day'] = invoice_dt.day
df_combine['month'] = invoice_dt.month_name()
df_combine['year'] = invoice_dt.year
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `isocalendar().week` is the supported way to get the ISO week number.
# Cast to int64 so the column has the same dtype as the other integer date parts.
df_combine['week'] = invoice_dt.isocalendar().week.astype('int64')
df_combine['day_of_week'] = invoice_dt.dayofweek
# NOTE(review): dayofweek is Monday=0 ... Sunday=6, so this list marks Sunday (6) as a
# working day and Saturday (5) as the only non-working day - confirm this is intended.
df_combine['is_working_day'] = np.where(df_combine['day_of_week'].isin([0,1,2,3,4,6]),1,0)
df_combine['year_month'] = df_combine['year'].astype(str)+"_"+df_combine['month']
df_combine['is_month_start'] = invoice_dt.is_month_start
df_combine['is_month_end'] = invoice_dt.is_month_end
df_combine['quarter'] = invoice_dt.quarter
df_combine['is_year_start'] = invoice_dt.is_year_start
df_combine['is_year_end'] = invoice_dt.is_year_end

# Time-of-day feature
df_combine['hour'] = invoice_dt.hour

# The raw timestamp is no longer needed once the parts have been extracted
df_combine = df_combine.drop('InvoiceDate',axis=1)

In [None]:
# The origin flag is not used as a model feature, so remove it before encoding
df_combine = df_combine.drop(columns=['is_train'])

## OneHotEncoding

In [None]:
# One-hot encode every column of the combined frame (numeric columns included),
# fitting the encoder on train and test rows together
cols = df_combine.columns.values
features = df_combine[cols]

ohe = OneHotEncoder()
ohe.fit(features)
df_combine_ohe = ohe.transform(features)

In [None]:
# Undo the concatenation: the first `train_idx` encoded rows are training rows,
# everything after them belongs to the test set
train_df, test_df = df_combine_ohe[:train_idx], df_combine_ohe[train_idx:]

In [None]:
# Training design matrix: the one-hot encoded rows that came from the training set
X = train_df

In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Computed as the square root of sklearn's `mean_squared_error`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Model building

Below are the different base models, each of which will be trained separately on the whole training dataset.

In [None]:
# Base learners. Every estimator gets an explicit random_state so that a
# "Restart & Run All" reproduces the same fitted models and scores.

# Sparse linear models (capped at 100 solver iterations)
lasso = Lasso(alpha=0.0005, random_state = 1,max_iter=100)

ENet = ElasticNet(alpha = 0.0005, l1_ratio=0.9, random_state = 3,max_iter=100)

# Tree ensembles
GBoost = GradientBoostingRegressor(n_estimators = 100,learning_rate=0.05,
                                   max_depth = 10, random_state=5)

# Previously unseeded, which made the forest (and everything stacked on it)
# differ from run to run
model_rf = RandomForestRegressor(max_depth=17,n_estimators=100, random_state=7)

In [None]:
# Fit each base model on the full training matrix and record its in-sample
# (training-set) RMSE, keyed by the estimator object
scores = {}
models = [lasso, ENet, GBoost, model_rf]
for estimator in models:
    print(estimator)
    estimator.fit(X, y)
    train_rmse = rmse(y, estimator.predict(X))
    scores[estimator] = train_rmse
    print(train_rmse)

# Stacked Regressor

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor built from out-of-fold predictions.

    Each base model is cloned and fitted once per K-fold split. The predictions
    made on the held-out folds become the features on which a clone of
    `meta_model` is trained. At prediction time the fold clones of each base
    model are averaged and the averages are fed to the fitted meta-model.

    Parameters
    ----------
    base_models : sequence of estimators
        Unfitted regressors; they are cloned, never fitted in place.
    meta_model : estimator
        Regressor trained on the out-of-fold predictions; also cloned.
    n_folds : int, default=3
        Number of K-fold splits used to build the out-of-fold predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=3):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit the fold clones of every base model, then the meta-model.

        Parameters
        ----------
        X : matrix supporting positional row selection with an integer array
            (`X[index_array]`), e.g. a NumPy array or a SciPy CSR matrix.
        y : array-like of shape (n_samples,)
            Target values. A pandas Series with any index is accepted.

        Returns
        -------
        self
        """
        # KFold yields positional indices. Indexing a pandas Series with them is
        # label-based and breaks when the index has gaps (e.g. after rows were
        # filtered out), so work on a plain array instead.
        y = np.asarray(y)

        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        # Now train the cloned meta-model using the out-of-fold predictions as new features
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the stacked model.

        For every base model, the predictions of its fold clones are averaged;
        the resulting columns (one per base model) are passed to the fitted
        meta-model, whose output is returned.
        """
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [None]:
# Give the target a fresh 0..n-1 index so that positional fold indices line up with
# its labels (rows were dropped from `train` earlier, which leaves gaps in the index)
y_train = y.reset_index(drop=True)

In [None]:

# Stack ElasticNet, gradient boosting and random forest under a Lasso meta-model
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, model_rf),
                                                 meta_model = lasso)

stacked_averaged_models.fit(X, y_train)

stacked_train_pred = stacked_averaged_models.predict(X)
# The model is fitted on the raw `UnitPrice` target (no log1p transform is applied
# anywhere), so predictions are already on the original scale; applying np.expm1
# here would distort them.
stacked_pred = stacked_averaged_models.predict(test_df)
# In-sample RMSE of the stacked model
print(rmse(y_train, stacked_train_pred))

# Predicting for test data

In [None]:
# Predict the target for the encoded test rows with the fitted stacked model
stacked_pred_ = stacked_averaged_models.predict(test_df)
# Display the prediction array as the cell output
stacked_pred_

In [None]:
# Fill the sample-submission template with predictions rounded to two decimals
rounded_pred = np.round(stacked_pred_, 2)
submission['UnitPrice'] = rounded_pred
# scored 23.09
submission.to_csv('avg_model_sub.csv', index=False)

> The above submission scored 23.09 on the public leaderboard.

Stacked regression algorithm reference - https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

### **If you find this notebook helpful please upvote**