In [26]:
### Import required libraries

import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from bayes_opt import BayesianOptimization
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read train and test files
train_df = pd.read_csv('new_data/2016_to_2018_train_data.csv',encoding = "ISO-8859-1")
test_df = pd.read_csv('new_data/2019_test_data.csv',encoding = "ISO-8859-1")
train_df=train_df.rename(columns = {'Offer(Net Weight)':'Offer_Net_Weight'})
test_df=test_df.rename(columns = {'Offer(Net Weight)':'Offer_Net_Weight'})
train_df = train_df[train_df['Tealingo Item'].notna()]
test_df = test_df[test_df['Tealingo Item'].notna()]
test_df_ori = test_df

In [3]:
train_df['Offer_Net_Weight'] = train_df['Offer_Net_Weight'].str.replace(',', '')
test_df['Offer_Net_Weight'] = test_df['Offer_Net_Weight'].str.replace(',', '')
train_df['Manufacturing Date'] = train_df['Manufacturing Date'].str.replace('-', '.')
test_df['Manufacturing Date'] = test_df['Manufacturing Date'].str.replace('-', '.')

In [4]:
train_df = train_df[['Garden Mark','Garden Geo Orgin','Manufacturing Date','Tealingo Item','Offer_Net_Weight']]
train_df.head()

Unnamed: 0,Garden Mark,Garden Geo Orgin,Manufacturing Date,Tealingo Item,Offer_Net_Weight
0,ASHOK KUMAR ESTATE,SI,25.12.2015,EG36SI,231
1,ASHOK KUMAR ESTATE,SI,25.12.2015,DG36SI,231
2,ASHOK KUMAR ESTATE,SI,25.12.2015,CG36SI,231
3,HIGHFIELD ESTATE,SI,16.12.2015,EG33SI,450
4,ASHOK KUMAR SUPREME,SI,25.12.2015,EG35SI,204


In [5]:
train_df['Manufacturing Date'] = pd.to_datetime(train_df['Manufacturing Date'], errors='coerce')
#train_df['Manufacturing Date'] = train_df['Manufacturing Date'].dt.year
#train_df['Manufacturing Date'] = train_df['Manufacturing Date'].dt.month

train_df['year'] = pd.DatetimeIndex(train_df['Manufacturing Date']).year
train_df['month'] = pd.DatetimeIndex(train_df['Manufacturing Date']).month

In [6]:
train_df['Manufacturing Date'] = train_df['month'].astype(str) + '.' + train_df['year'].astype(str) 

train_df.drop(['year', 'month'], axis=1)
train_df.head()

Unnamed: 0,Garden Mark,Garden Geo Orgin,Manufacturing Date,Tealingo Item,Offer_Net_Weight,year,month
0,ASHOK KUMAR ESTATE,SI,12.2015,EG36SI,231,2015,12
1,ASHOK KUMAR ESTATE,SI,12.2015,DG36SI,231,2015,12
2,ASHOK KUMAR ESTATE,SI,12.2015,CG36SI,231,2015,12
3,HIGHFIELD ESTATE,SI,12.2015,EG33SI,450,2015,12
4,ASHOK KUMAR SUPREME,SI,12.2015,EG35SI,204,2015,12


In [7]:
test_df = test_df[['Garden Mark','Garden Geo Orgin','Manufacturing Date','Tealingo Item','Offer_Net_Weight']]
test_df['Manufacturing Date'] = pd.to_datetime(test_df['Manufacturing Date'], errors='coerce')

test_df['year'] = pd.DatetimeIndex(test_df['Manufacturing Date']).year
test_df['month'] = pd.DatetimeIndex(test_df['Manufacturing Date']).month

test_df['Manufacturing Date'] = test_df['month'].astype(str) + '.' + test_df['year'].astype(str) 

test_df = test_df[['Garden Mark','Garden Geo Orgin','Manufacturing Date','Tealingo Item','Offer_Net_Weight']] 
test_df.head()

Unnamed: 0,Garden Mark,Garden Geo Orgin,Manufacturing Date,Tealingo Item,Offer_Net_Weight
0,ASHOK KUMAR ESTATE,SI,12.2018,EG35SI,238
1,ASHOK KUMAR ESTATE,SI,12.2018,EG35SI,238
2,ASHOK KUMAR ESTATE,SI,12.2018,DG35SI,238
3,ASHOK KUMAR ESTATE,SI,12.2018,CG35SI,238
4,ASHOK KUMAR ESTATE,SI,12.2018,CG35SI,238


In [8]:
#### Check if there are any NULL values in Test Data
print("Total Train Features with NaN Values = " + str(train_df.columns[train_df.isnull().sum() != 0].size))
if (train_df.columns[train_df.isnull().sum() != 0].size):
    print("Features with NaN => {}".format(list(train_df.columns[train_df.isnull().sum() != 0])))
    train_df[train_df.columns[train_df.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)

Total Train Features with NaN Values = 0


In [9]:
#### Check if there are any NULL values in Test Data
print("Total Test Features with NaN Values = " + str(test_df.columns[test_df.isnull().sum() != 0].size))
if (test_df.columns[test_df.isnull().sum() != 0].size):
    print("Features with NaN => {}".format(list(test_df.columns[test_df.isnull().sum() != 0])))
    test_df[test_df.columns[test_df.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)

Total Test Features with NaN Values = 0


In [11]:
train_df = train_df[['Garden Mark','Garden Geo Orgin','Manufacturing Date','Tealingo Item','Offer_Net_Weight']]
X = train_df.drop('Offer_Net_Weight', axis=1)
y = train_df.Offer_Net_Weight

In [12]:
y = y.astype(float)

In [13]:
y.dtype

dtype('float64')

In [19]:
for c in X.columns:
    col_type = X[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        X[c] = X[c].astype('category')

In [20]:
#from sklearn.preprocessing import LabelEncoder
#for c in X.columns:
    #X[c] = LabelEncoder().fit_transform(X[c].values)

In [21]:
dtrain = lgb.Dataset(data=X, label=y)

In [22]:
def lgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds), True

In [23]:
# Objective Function
def hyp_lgbm(num_leaves, feature_fraction, bagging_fraction, max_depth, min_split_gain, min_child_weight):
      
        params = {'application':'regression','num_iterations': 200,
                  'learning_rate':0.05, 'early_stopping_round':50,
                  'metric':'lgb_r2_score'} # Default parameters
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_results = lgb.cv(params, dtrain, nfold=5, seed=101,categorical_feature=[], stratified=False,
                            verbose_eval =None, feval=lgb_r2_score)
        # print(cv_results)
        return np.max(cv_results['r2-mean'])

In [24]:
# Domain space-- Range of hyperparameters 
pds = {'num_leaves': (80, 100),
          'feature_fraction': (0.1, 0.9),
          'bagging_fraction': (0.8, 1),
          'max_depth': (17, 25),
          'min_split_gain': (0.001, 0.1),
          'min_child_weight': (10, 25)
          }

In [27]:
# Surrogate model
optimizer = BayesianOptimization(hyp_lgbm, pds, random_state=77)
                                  
# Optimize
optimizer.maximize(init_points=5, n_iter=15)

|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.4992  [0m | [0m 0.9838  [0m | [0m 0.6138  [0m | [0m 23.03   [0m | [0m 12.09   [0m | [0m 0.009645[0m | [0m 95.76   [0m |
| [0m 2       [0m | [0m 0.4986  [0m | [0m 0.8652  [0m | [0m 0.5329  [0m | [0m 18.92   [0m | [0m 18.18   [0m | [0m 0.04065 [0m | [0m 94.3    [0m |
| [0m 3       [0m | [0m 0.4976  [0m | [0m 0.9673  [0m | [0m 0.5708  [0m | [0m 19.37   [0m | [0m 14.22   [0m | [0m 0.07085 [0m | [0m 88.45   [0m |
| [95m 4       [0m | [95m 0.5211  [0m | [95m 0.8115  [0m | [95m 0.6976  [0m | [95m 20.62   [0m | [95m 12.64   [0m | [95m 0.005888[0m | [95m 85.85   [0m |
| [0m 5       [0m | [0m 0.5197  [0m | [0m 0.8134  [0m | [0m 0.7009  [0m | [0m 17.51   [0m | [0m 16.48   [0m | [0m 0.03705 [0m | [0m 8

In [28]:
optimizer.max

{'target': 0.5286821631103102,
 'params': {'bagging_fraction': 0.8,
  'feature_fraction': 0.9,
  'max_depth': 25.0,
  'min_child_weight': 25.0,
  'min_split_gain': 0.001,
  'num_leaves': 100.0}}

In [78]:
for c in X.columns:
    col_type = X[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        X[c] = X[c].astype('category')

In [79]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3683134 entries, 0 to 3971158
Data columns (total 4 columns):
 #   Column              Dtype   
---  ------              -----   
 0   Garden Mark         category
 1   Garden Geo Orgin    category
 2   Manufacturing Date  category
 3   Tealingo Item       category
dtypes: category(4)
memory usage: 49.5 MB


In [80]:
X.columns

Index(['Garden Mark', 'Garden Geo Orgin', 'Manufacturing Date',
       'Tealingo Item'],
      dtype='object')

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=42)

In [82]:
tr_data = lgb.Dataset(X_train, label=y_train, categorical_feature=X.columns.tolist())
va_data = lgb.Dataset(X_test, label=y_test, categorical_feature=X.columns.tolist(), reference=tr_data)

In [83]:
X.columns.tolist()

['Garden Mark', 'Garden Geo Orgin', 'Manufacturing Date', 'Tealingo Item']

In [84]:
y.dtype

dtype('float64')

In [85]:
va_data.data.head()

Unnamed: 0,Garden Mark,Garden Geo Orgin,Manufacturing Date,Tealingo Item
2839720,ROCKVALLEY,SI,3.2018,GG45SI
1033742,DOORIA,AS,5.2016,FE15AS
1598888,GELAKEYBARI,AS,5.2017,IB26AS
353511,GARSWOOD GOLD,SI,6.2016,DG45SI
1454251,BHANDIGURI ESTEEM,CD,12.2016,JG35CD-S


In [86]:
tr_data.data.head()

Unnamed: 0,Garden Mark,Garden Geo Orgin,Manufacturing Date,Tealingo Item
1115848,TEOK,AS,10.2016,DG24AS-S
2200376,UMATARA,AS,7.2017,DB15AS-S
85546,VENKATESWARA SUPREME,SI,2.2016,IB56SI
2660705,MALPANI,AS,5.2017,HE34AS
3539192,SALKATHONI(CL),AS,8.2018,HE24AS


In [87]:
# Train the model
parameters = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 50
}




In [88]:
model = lgb.train(parameters,tr_data,valid_sets=va_data,num_boost_round=100000,early_stopping_rounds=500,verbose_eval=50)

Training until validation scores don't improve for 500 rounds
[50]	valid_0's rmse: 246.253
[100]	valid_0's rmse: 231.889
[150]	valid_0's rmse: 226.943
[200]	valid_0's rmse: 224.207
[250]	valid_0's rmse: 222.268
[300]	valid_0's rmse: 220.596
[350]	valid_0's rmse: 219.421
[400]	valid_0's rmse: 218.265
[450]	valid_0's rmse: 217.242
[500]	valid_0's rmse: 216.286
[550]	valid_0's rmse: 215.566
[600]	valid_0's rmse: 214.979
[650]	valid_0's rmse: 214.395
[700]	valid_0's rmse: 213.88
[750]	valid_0's rmse: 213.427
[800]	valid_0's rmse: 213.083
[850]	valid_0's rmse: 212.634
[900]	valid_0's rmse: 212.279
[950]	valid_0's rmse: 211.923
[1000]	valid_0's rmse: 211.587
[1050]	valid_0's rmse: 211.317
[1100]	valid_0's rmse: 211.065
[1150]	valid_0's rmse: 210.76
[1200]	valid_0's rmse: 210.502
[1250]	valid_0's rmse: 210.25
[1300]	valid_0's rmse: 209.998
[1350]	valid_0's rmse: 209.755
[1400]	valid_0's rmse: 209.535
[1450]	valid_0's rmse: 209.32
[1500]	valid_0's rmse: 209.088
[1550]	valid_0's rmse: 208.894
[

[13150]	valid_0's rmse: 194.895
[13200]	valid_0's rmse: 194.871
[13250]	valid_0's rmse: 194.849
[13300]	valid_0's rmse: 194.829
[13350]	valid_0's rmse: 194.815
[13400]	valid_0's rmse: 194.796
[13450]	valid_0's rmse: 194.776
[13500]	valid_0's rmse: 194.755
[13550]	valid_0's rmse: 194.732
[13600]	valid_0's rmse: 194.707
[13650]	valid_0's rmse: 194.695
[13700]	valid_0's rmse: 194.67
[13750]	valid_0's rmse: 194.657
[13800]	valid_0's rmse: 194.637
[13850]	valid_0's rmse: 194.622
[13900]	valid_0's rmse: 194.601
[13950]	valid_0's rmse: 194.583
[14000]	valid_0's rmse: 194.562
[14050]	valid_0's rmse: 194.551
[14100]	valid_0's rmse: 194.522
[14150]	valid_0's rmse: 194.496
[14200]	valid_0's rmse: 194.482
[14250]	valid_0's rmse: 194.464
[14300]	valid_0's rmse: 194.445
[14350]	valid_0's rmse: 194.424
[14400]	valid_0's rmse: 194.41
[14450]	valid_0's rmse: 194.386
[14500]	valid_0's rmse: 194.375
[14550]	valid_0's rmse: 194.353
[14600]	valid_0's rmse: 194.335
[14650]	valid_0's rmse: 194.313
[14700]	va

[26000]	valid_0's rmse: 192.125
[26050]	valid_0's rmse: 192.122
[26100]	valid_0's rmse: 192.11
[26150]	valid_0's rmse: 192.107
[26200]	valid_0's rmse: 192.107
[26250]	valid_0's rmse: 192.108
[26300]	valid_0's rmse: 192.099
[26350]	valid_0's rmse: 192.088
[26400]	valid_0's rmse: 192.081
[26450]	valid_0's rmse: 192.077
[26500]	valid_0's rmse: 192.073
[26550]	valid_0's rmse: 192.079
[26600]	valid_0's rmse: 192.071
[26650]	valid_0's rmse: 192.067
[26700]	valid_0's rmse: 192.065
[26750]	valid_0's rmse: 192.063
[26800]	valid_0's rmse: 192.052
[26850]	valid_0's rmse: 192.046
[26900]	valid_0's rmse: 192.038
[26950]	valid_0's rmse: 192.033
[27000]	valid_0's rmse: 192.03
[27050]	valid_0's rmse: 192.03
[27100]	valid_0's rmse: 192.023
[27150]	valid_0's rmse: 192.018
[27200]	valid_0's rmse: 192.015
[27250]	valid_0's rmse: 192.004
[27300]	valid_0's rmse: 192.004
[27350]	valid_0's rmse: 192.006
[27400]	valid_0's rmse: 192
[27450]	valid_0's rmse: 191.995
[27500]	valid_0's rmse: 191.99
[27550]	valid_0'

[38900]	valid_0's rmse: 191.444
[38950]	valid_0's rmse: 191.446
[39000]	valid_0's rmse: 191.454
[39050]	valid_0's rmse: 191.453
Early stopping, best iteration is:
[38556]	valid_0's rmse: 191.442


In [89]:
test_data = test_df.drop('Offer_Net_Weight', axis=1)
test_data.head()

Unnamed: 0,Garden Mark,Garden Geo Orgin,Manufacturing Date,Tealingo Item
0,ASHOK KUMAR ESTATE,SI,12.2018,EG35SI
1,ASHOK KUMAR ESTATE,SI,12.2018,EG35SI
2,ASHOK KUMAR ESTATE,SI,12.2018,DG35SI
3,ASHOK KUMAR ESTATE,SI,12.2018,CG35SI
4,ASHOK KUMAR ESTATE,SI,12.2018,CG35SI


In [90]:
for c in test_data.columns:
    col_type = test_data[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        test_data[c] = test_data[c].astype('category')
        

test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1357305 entries, 0 to 1528590
Data columns (total 4 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   Garden Mark         1357305 non-null  category
 1   Garden Geo Orgin    1357305 non-null  category
 2   Manufacturing Date  1357305 non-null  category
 3   Tealingo Item       1357305 non-null  category
dtypes: category(4)
memory usage: 18.4 MB


In [None]:
predictions = model.predict(test_data)

In [None]:
test_df_ori["Predicted_Offer_Net_Weight"] = predictions

In [None]:
test_df_ori.to_csv('output/lightGBM_2019_prediction.csv')

In [None]:
train_df = pd.read_csv('output/lightGBM_2019_prediction.csv',encoding = "ISO-8859-1")

In [None]:
train_df.tail()