## Base Regression

In [14]:
#loading_required_packages

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso,LassoLarsCV
import pickle
from tpot.builtins import StackingEstimator
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

In [15]:
#importing data
raw_df = pd.read_csv('mean_zero_imputed.csv')

#Label encoding categorical features
cat_cols = raw_df.select_dtypes(include='object').columns
cat_indices = []
for col in cat_cols:
    cat_indices.append(raw_df.columns.get_loc(col))
    

    
#Label encoding cat columns
raw_labeled = raw_df
raw_labeled[cat_cols] = raw_df[cat_cols].apply(LabelEncoder().fit_transform)

In [16]:
#helper method
def reg_metrics(y_test,preds):
    rmse = np.sqrt(mean_squared_error(y_test,preds))
    print('RMSE:',rmse)
    return rmse

### Test and train data

In [17]:
with open('test_index_list.pkl', 'rb') as f:
    test_indices = pickle.load(f)

test_df = raw_df.iloc[test_indices]
X_test_df = test_df.drop(['cuid','convert_30','revenue_30'],axis=1)
y_test_df_con = test_df.convert_30
y_test_df_rev = test_df.revenue_30

In [18]:
raw_index = list(raw_df.index)
for x in test_indices:
    raw_index.remove(x)
    
train_df = raw_df.iloc[raw_index]
X_train_df = train_df.drop(['cuid','convert_30','revenue_30'],axis=1)
y_train_df_con = train_df.convert_30
y_train_df_rev = train_df.revenue_30

### Filtering data where revenue is not zero

In [19]:
train_rev = train_df[train_df.revenue_30!=0.0]
X_train_rev = train_rev.drop(['cuid','convert_30','revenue_30'],axis=1)
y_train_rev = train_rev.revenue_30

test_rev = test_df[test_df.revenue_30!=0.0]
X_test_rev = test_rev.drop(['cuid','convert_30','revenue_30'],axis=1)
y_test_rev = test_rev.revenue_30

In [20]:
#log1p of target
y_train_rev_log = np.log1p(y_train_rev)

**Stacking Estimator with LassoLarsCv and LGBMRegressor**

In [21]:
reg_pipe = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=False)),
    LGBMRegressor())

reg_pipe.fit(X_train_rev,y_train_rev_log)
preds_e = reg_pipe.predict(X_test_rev)
preds_e = np.expm1(preds_e)
reg_metrics(y_test_rev,preds_e)



RMSE: 1062.460007194682


1062.460007194682

In [22]:
filename = 'reg_stacked.pkl'
pickle.dump(reg_pipe, open(filename, 'wb'))

loaded_reg = pickle.load(open(filename, 'rb'))

In [23]:
preds_e = loaded_reg.predict(X_test_rev)
preds_e = np.expm1(preds_e)
reg_metrics(y_test_rev,preds_e)

RMSE: 1062.460007194682


1062.460007194682

**Model**

In [24]:
(reg_pipe.get_params)

<bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('stackingestimator',
                 StackingEstimator(estimator=LassoLarsCV(copy_X=True, cv='warn',
                                                         eps=2.220446049250313e-16,
                                                         fit_intercept=True,
                                                         max_iter=500,
                                                         max_n_alphas=1000,
                                                         n_jobs=None,
                                                         normalize=False,
                                                         positive=False,
                                                         precompute='auto',
                                                         verbose=False))),
                ('lgbmregressor',
                 LGBMRegressor(boosting_type='gbdt', class_weight=None,
                               colsample

In [15]:
pred_on_whole = reg_pipe.predict(X_test_df)
pred_on_whole = np.expm1(pred_on_whole)


reg_metrics(y_test_df_rev,pred_on_whole)

RMSE: 523.2583969151068


523.2583969151068

* RMSE of 523.2583 was achieved by stacking LassoLarsCv ans LGBMRegressor
* RMSE was further reduced to 417.94 by replacing revenue with zero where predicted convert_30 is zero