In [1]:
import pandas as pd
import numpy as np


### Reading the dataset

In [2]:
# Download the training and test tables straight from the project's GitHub repository.
train = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/shrikantnarayankar15/Deamand-Forecasting/master/train.csv'
)
test = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/shrikantnarayankar15/Deamand-Forecasting/master/test.csv'
)

In [3]:
# Convert the 'week' column of both frames to pandas datetimes so the .dt
# accessor can be used on it later.
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the
# layout -- confirm the inferred parsing matches the dates in the CSV.
train['week'] = train['week'].pipe(pd.to_datetime)
test['week'] = test['week'].pipe(pd.to_datetime)

### Dropping rows with null values (there was only one)

In [4]:
# Remove, in place, every training row that has a missing value in any column.
train.dropna(axis=0, how='any', inplace=True)

### Extracting features from week

In [7]:
def add_calendar_features(frame):
    """Add calendar columns derived from ``frame['week']`` to ``frame`` in place.

    The columns written are 'day', 'hour', 'month', 'Year', 'DayofWeek' and
    'Week'. ``frame['week']`` must already be a datetime column.
    """
    week = frame['week']
    frame['day'] = week.dt.weekday
    frame['hour'] = week.dt.hour
    frame['month'] = week.dt.month
    # Only the last digit of the calendar year is kept.
    frame['Year'] = week.dt.year % 10
    frame['DayofWeek'] = week.dt.day_name()
    # Series.dt.week was deprecated and then removed in pandas 2.0;
    # dt.isocalendar().week is its replacement (the result has dtype UInt32).
    frame['Week'] = week.dt.isocalendar().week


# Train and test receive exactly the same derived columns.
add_calendar_features(train)
add_calendar_features(test)

In [8]:
# DayCount: the proleptic Gregorian ordinal of each timestamp divided by
# 730000, giving one continuous numeric value per date.
train["DayCount"] = train["week"].map(lambda stamp: stamp.toordinal() / 730000)
test["DayCount"] = test["week"].map(lambda stamp: stamp.toordinal() / 730000)

### Outlier removal

In [9]:
# Discard training rows whose units_sold exceeds 2500 (treated as outliers).
train = train.drop(index=train.index[(train['units_sold'] > 2500).to_numpy()])

### Feature Engineering

In [10]:
# Relative discount: the fraction of base_price by which total_price is lower.
# It is computed on BOTH frames because 'diff' is part of the feature list used
# for training and for predicting on `test`; creating it only on `train` makes
# `test[features]` raise a KeyError.
train['diff'] = (train['base_price'] - train['total_price']) / train['base_price']
test['diff'] = (test['base_price'] - test['total_price']) / test['base_price']

In [11]:
# DayCount is the only time-derived feature used; it differentiates the weeks.
features = [
    'store_id',
    'sku_id',
    'total_price',
    'base_price',
    'is_featured_sku',
    'is_display_sku',
    'diff',
    'DayCount',
]

### Log conversion of total_price due to skewness (note: base_price is not transformed in the cell below)

In [12]:
# Replace total_price with its natural logarithm in both frames.
train['total_price'] = train['total_price'].pipe(np.log)
test['total_price'] = test['total_price'].pipe(np.log)

### preparing dataset to train

In [13]:
from sklearn.model_selection import train_test_split,KFold

# Renumber the rows after the earlier row removals. drop=True discards the old
# index instead of inserting it as a new 'index' column; without it, re-running
# this cell keeps adding columns ('index', then 'level_0') and finally raises.
train.reset_index(drop=True, inplace=True)

# Feature matrix, and the target on a natural-log scale (predictions are
# mapped back with np.exp when the submission is built).
# NOTE(review): np.log gives -inf for a zero target -- confirm units_sold has
# no zeros.
X = train[features]
y = np.log(train['units_sold'])

### Importing and configuring the models (XGBoost and LightGBM)

In [14]:
import lightgbm as lgb

# LightGBM regressor configuration.
# - 'auc' was dropped from `metric`: it is a classification metric and is not
#   meaningful for the 'regression' objective.
# - The duplicate "eta" key was dropped: it is an alias of `learning_rate`,
#   which was already set to the same value (0.14).
# NOTE(review): in LightGBM `alpha` is the Huber/quantile-loss parameter, not
# L1 regularisation (`reg_alpha`) -- confirm which one was intended.
# NOTE(review): `subsample` only takes effect when `subsample_freq` > 0 --
# confirm whether row subsampling was meant to be active.
hyper_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l2'],
    'learning_rate': 0.14,
    "max_depth": 10,
    "n_estimators": 1260,
    "subsample": 0.8,
    "min_child_weight": 5,
    "colsample_bytree": 0.8,
    "alpha": 0.05,
}

gbm = lgb.LGBMRegressor(**hyper_params)

In [15]:
from xgboost import XGBRegressor

# XGBoost regressor configuration.
# Changes relative to the previous version of this cell:
# - `early_stopping_rounds` removed: no validation set is ever passed to
#   fit() (the model is fitted through BaggingRegressor), and recent XGBoost
#   releases raise an error when early stopping is requested without one.
# - `metric` and `lambda_l2` removed: these are LightGBM parameter names that
#   XGBoost does not recognise, so they never influenced training.
#   NOTE(review): if L2 regularisation of 0.01 was intended, set
#   `reg_lambda=0.01`; be aware that this changes the fitted model.
# - `eta` is now passed as a float instead of the string '0.3'.
# NOTE(review): tree_method='gpu_hist' / gpu_id need a CUDA-capable GPU and are
# deprecated in XGBoost 2.x in favour of device='cuda' with tree_method='hist'.
xgb = XGBRegressor(
    max_depth=8,
    booster="gbtree",
    n_estimators=1300,
    alpha=0.1,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    min_child_weight=5,
    seed=42,
    tree_method='gpu_hist',
    gpu_id=0,
)

#### I used the BaggingRegressor to boost the score

In [16]:
from sklearn.ensemble import BaggingRegressor

# Wrap each gradient-boosting model in a bagging ensemble with a fixed seed.
# The XGBoost ensemble is limited to 4 members; the LightGBM ensemble does not
# set n_estimators and therefore uses BaggingRegressor's default size.
bag_xgb = BaggingRegressor(xgb, n_estimators=4, random_state=0)
bag_gbm = BaggingRegressor(gbm, random_state=0)

### Training the models

In [None]:
# Fit both bagged ensembles on the full training features X and the
# log-transformed target y. No validation split is used here.
bag_gbm.fit(X,y)
bag_xgb.fit(X,y)

### Writing the submission file (each model's prediction is rounded up with ceil, for both XGBoost and LightGBM, before averaging)

In [None]:
# Load the submission template.
# NOTE(review): assumes its rows are in the same order as `test` -- confirm.
sample = pd.read_csv('https://raw.githubusercontent.com/shrikantnarayankar15/Deamand-Forecasting/master/sample.csv')

# The models were trained on log(units_sold), so np.exp maps predictions back
# to units; each model's output is rounded up before the two are averaged.
xgb_units = np.ceil(np.exp(bag_xgb.predict(test[features])))
gbm_units = np.ceil(np.exp(bag_gbm.predict(test[features])))

# Average of the two models, rounded up again. The earlier abs() fix-up for
# negative values was removed: ceil(exp(x)) is never negative, so that branch
# could not be reached.
sample['units_sold'] = np.ceil((xgb_units + gbm_units) / 2)
sample.to_csv('submit.csv', index=False)