In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='dark')
import plotly.offline as py
from plotly import tools
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
le = LabelEncoder()

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score
import os

In [None]:
# Load the competition tables (train, test and the sample submission) from the Kaggle input folder.
DATA_DIR = '../input/machinehack-buyers-time-prediction-challenge/ParticipantData_BTPC'
Train = pd.read_csv(f'{DATA_DIR}/Train.csv')
Test = pd.read_csv(f'{DATA_DIR}/Test.csv')
Sample = pd.read_csv(f'{DATA_DIR}/Sample Submission.csv')

In [None]:
#pd.set_option('display.max_colwidth', None) #to see full column values #

In [None]:
# Distinct-value count per column, side by side (column 0 = Train, column 1 = Test).
unique_counts = [Train.nunique(), Test.nunique()]
pd.concat(unique_counts, axis=1)

In [None]:
# First five training rows, transposed so each column reads as a row.
Train.head().T

In [None]:
# First five test rows, transposed so each column reads as a row.
Test.head().T

In [None]:
# First five rows of the sample submission, transposed.
Sample.head().T

In [None]:
# Column dtypes and non-null counts of the training frame.
Train.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
Test.info()

In [None]:
# Per-column flag: does the training frame contain any missing value?
print(Train.isnull().any())

In [None]:
# Number of missing client_agent entries in the training frame.
print(Train.client_agent.isnull().sum())

Only `client_agent` has null values.

In [None]:
# Per-column flag: does the test frame contain any missing value?
print(Test.isnull().any())

In [None]:
# Number of missing client_agent entries in the test frame.
print(Test.client_agent.isnull().sum())

A total of 160 values are missing in the train dataset for the `client_agent` feature; similarly, the test dataset has 59 missing values.

This feature holds client-side software details, so we will fill the missing entries as unknown in both the train and test datasets later. For now I will do exploratory data analysis on it.

**Data description of our data set**

session_id - Unique identifier for every row

session_number - Session type identifier

client_agent - Client-side software details

device_details -  Client-side device details

date - Datestamp of the session

purchased - Binary value for any purchase done

added_in_cart - Binary value for cart activity

checked_out -  Binary value for checking out successfully

time_spent - Total time spent in seconds (Target Column)
Regression Modeling
Advance Feature engineering, with Datestamp and Text datatypes
Optimizing RMSLE score as a metric to generalize well on unseen data

# Preprocessing
Combine train and test for model creation; create a dummy column for time spent in test.

In [None]:
# Model the target on the log1p scale, so RMSE on this column equals RMSLE on the raw seconds.
# NOTE: not idempotent - re-running this cell applies log1p a second time.
Train = Train.assign(time_spent=np.log1p(Train['time_spent']))


In [None]:
# First few target values (log1p scale after the transform above).
Train['time_spent'].head()

In [None]:
# Line plot of the target against the row index.
Train['time_spent'].plot()

In [None]:
# Stack train on top of test so every feature is engineered identically on both;
# ignore_index renumbers the rows 0..n-1.
df = pd.concat([Train, Test], axis=0, ignore_index=True)
df.shape

In [None]:
# Cast the three binary activity flags to bool and parse the session date.
for flag_col in ('purchased', 'added_in_cart', 'checked_out'):
    df[flag_col] = df[flag_col].astype('bool')
df['date'] = pd.to_datetime(df['date'])
df.head().T

# Feature Engineering

**1. Extracting Features**

In [None]:
# Normalise the raw client_agent text so that its first and last comma-separated
# tokens can be extracted in the next cell.
# - regex=True is passed explicitly to str.replace: the pattern is a regular
#   expression, and pandas >= 2.0 treats the pattern as a literal by default.
# - fillna is applied to the result instead of in place on a column selection,
#   which avoids chained-assignment warnings.
df['client_agent'] = (
    df['client_agent']
    .replace(' ', ',', regex=True)                    # spaces become token separators
    .str.replace(r"\s*\([^()]*\)", "", regex=True)    # drop parenthesised groups
    .str.strip()
    .str.strip('[]')
    .replace('/', '', regex=True)                     # "name/version" -> "nameversion"
    .replace(':', '', regex=True)
    .replace(';', ',', regex=True)
    .replace(',,', ',', regex=True)                   # collapse doubled separators
    .str.lower()
    .fillna('0')                                      # missing agents get the sentinel '0'
)
df['client_agent'].isnull().any()

In [None]:
# First token of the cleaned agent string -> SW, last token -> product.
agent_tokens = df['client_agent'].str.split(',')
df['SW'] = agent_tokens.str[0]
df['product'] = agent_tokens.str[-1]

In [None]:
# device_details is '-'-separated: the first piece is kept as device, the last as Browser.
device_parts = df['device_details'].str.split('-')
df['device'] = device_parts.str[0]
df['Browser'] = device_parts.str[-1]

In [None]:
# Calendar features derived from the session date.
session_date = df['date']
df['year'] = session_date.dt.year
df['month'] = session_date.dt.month
df['day'] = session_date.dt.day
df['weekday'] = session_date.dt.dayofweek
# '1' for Saturday/Sunday (dayofweek 5 or 6), otherwise '0' - stored as strings.
df['Weekend_FLG'] = df['weekday'].isin([5, 6]).map({True: '1', False: '0'})
df['Quater'] = session_date.dt.quarter

In [None]:
# Per-column flag: any missing value left in the combined frame?
df.isnull().any()

In [None]:
# First five rows of the combined frame, transposed.
df.head().T

In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

**Unique values in the data frame**

In [None]:
# Number of distinct values per column in the combined frame.
df.nunique()

# Grouping - features - target encoding 
Group by monthly time spent and quarterly time spent.

In [None]:
# Summary statistics of the target column in the combined frame.
df['time_spent'].describe()

In [None]:
# Broadcast per-month target statistics back onto every row.
# NOTE(review): computed on the combined frame before the train/validation split,
# so each labelled row's own target contributes to its feature value.
for stat in ('min', 'max', 'mean', 'median'):
    df[f'{stat}_timespent_per_month'] = df.groupby('month')['time_spent'].transform(stat)

In [None]:
# Per-quarter target statistics; the grouping column is spelled 'Quater' in this frame.
for stat in ('min', 'max', 'mean', 'median'):
    df[f'{stat}_timespent_per_Quarter'] = df.groupby('Quater')['time_spent'].transform(stat)


In [None]:
# Per-weekday target statistics, broadcast back onto every row.
for stat in ('min', 'max', 'mean', 'median'):
    df[f'{stat}_timespent_per_weekday'] = df.groupby('weekday')['time_spent'].transform(stat)

**Now group time spent by device**
1. First check whether any device in the test set is new, i.e. not present in the train set.
2. Then decide what value to substitute for any such new value.

In [None]:
# Derive device / Browser on the separate Train and Test frames as well,
# so their category sets can be compared.
for frame in (Train, Test):
    detail_parts = frame['device_details'].str.split('-')
    frame['device'] = detail_parts.str[0]
    frame['Browser'] = detail_parts.str[-1]

In [None]:
# Frequency of each device value in Train.
Train['device'].value_counts()

In [None]:
# Frequency of each device value in Test.
Test['device'].value_counts()

In [None]:
# Frequency of each Browser value in Train.
Train['Browser'].value_counts()

In [None]:
# Frequency of each Browser value in Test.
Test['Browser'].value_counts()

**All devices and browsers in test are present in train**
Grouping time spent per device and browser.

In [None]:
# Per-device target statistics, broadcast back onto every row.
for stat in ('min', 'max', 'mean', 'median'):
    df[f'{stat}_timespent_per_device'] = df.groupby('device')['time_spent'].transform(stat)

In [None]:
# Per-Browser target statistics, broadcast back onto every row.
for stat in ('min', 'max', 'mean', 'median'):
    df[f'{stat}_timespent_per_Browser'] = df.groupby('Browser')['time_spent'].transform(stat)

**Now group time spent by client software (SW)**

First check whether any SW value in the test set is new, i.e. not present in the train set.
Then decide what value to substitute for any such new value.

In [None]:
# Apply the client_agent normalisation to the Train frame so its SW / product
# tokens can be compared with Test.
# regex=True is explicit on str.replace (pandas >= 2.0 defaults to a literal match),
# and fillna is applied to the result rather than in place on a column selection.
Train['client_agent'] = (
    Train['client_agent']
    .replace(' ', ',', regex=True)                    # spaces become token separators
    .str.replace(r"\s*\([^()]*\)", "", regex=True)    # drop parenthesised groups
    .str.strip()
    .str.strip('[]')
    .replace('/', '', regex=True)
    .replace(':', '', regex=True)
    .replace(';', ',', regex=True)
    .replace(',,', ',', regex=True)                   # collapse doubled separators
    .str.lower()
    .fillna('0')                                      # missing agents get the sentinel '0'
)
Train['client_agent'].isnull().any()

In [None]:
# First token of the cleaned agent string -> SW, last token -> product (Train).
train_agent_tokens = Train['client_agent'].str.split(',')
Train['SW'] = train_agent_tokens.str[0]
Train['product'] = train_agent_tokens.str[-1]

In [None]:
# Apply the client_agent normalisation to the Test frame so its SW / product
# tokens can be compared with Train.
# regex=True is explicit on str.replace (pandas >= 2.0 defaults to a literal match),
# and fillna is applied to the result rather than in place on a column selection.
Test['client_agent'] = (
    Test['client_agent']
    .replace(' ', ',', regex=True)                    # spaces become token separators
    .str.replace(r"\s*\([^()]*\)", "", regex=True)    # drop parenthesised groups
    .str.strip()
    .str.strip('[]')
    .replace('/', '', regex=True)
    .replace(':', '', regex=True)
    .replace(';', ',', regex=True)
    .replace(',,', ',', regex=True)                   # collapse doubled separators
    .str.lower()
    .fillna('0')                                      # missing agents get the sentinel '0'
)
Test['client_agent'].isnull().any()

In [None]:
# First token of the cleaned agent string -> SW, last token -> product (Test).
test_agent_tokens = Test['client_agent'].str.split(',')
Test['SW'] = test_agent_tokens.str[0]
Test['product'] = test_agent_tokens.str[-1]

**Which values in test are not in train?**

In [None]:
# SW values that occur in Test but never in Train (sorted); these have no training statistics.
train_sw_values = set(Train['SW'])
test_sw_values = set(Test['SW'])
SW_replace_list = sorted(test_sw_values - train_sw_values)
print(*SW_replace_list)

In [None]:
# product values that occur in Test but never in Train (sorted).
train_product_values = set(Train['product'])
test_product_values = set(Test['product'])
product_replace_list = sorted(test_product_values - train_product_values)
print(*product_replace_list)

In [None]:
#Train[Train['SW'].str.contains('product2', regex=False)]
#Train[Train['SW'].str.contains('product3.', regex=False)]
#Train[Train['SW'].str.contains('product3.', regex=False)]

In [None]:
#Train[Train['product'].str.contains('ipad7', regex=False)]
#Train[Train['product'].str.contains('iphone7', regex=False)]
#Train[Train['product'].str.contains('safari534', regex=False)]


In [None]:
# Hand-picked substitutes for product values that appear only in Test:
# each unseen value is mapped to a related value that does occur in Train.
replace_product = {
    **dict.fromkeys(['ipad7.1', 'ipad7.1.1'], 'ipad7.1.2'),
    'iphone7.0': 'iphone7.0.4',
    **dict.fromkeys(['safari534.51.22', 'safari534.52.7'], 'safari534.57.2'),
}

# Same idea for SW values that appear only in Test: all map to one Train value.
SW_replace = dict.fromkeys(['product2.5.1', 'product3.2.1', 'product3.4.0'], 'product3.3.1')
    

In [None]:
# Per-SW target statistics from the Train frame, as plain dicts for lookup.
sw_time_spent = Train.groupby('SW')['time_spent']
median_timespent_per_SW_dict = sw_time_spent.median().to_dict()
min_timespent_per_SW_dict = sw_time_spent.min().to_dict()
max_timespent_per_SW_dict = sw_time_spent.max().to_dict()

In [None]:

# Look up each row's SW statistic; SW values unseen in Train are first redirected
# to their substitute from SW_replace. A value missing from the dicts raises KeyError.
for _sw_col, _sw_stats in (
    ('med_time_per_SW', median_timespent_per_SW_dict),
    ('min_time_per_SW', min_timespent_per_SW_dict),
    ('max_time_per_SW', max_timespent_per_SW_dict),
):
    df[_sw_col] = df['SW'].apply(
        lambda sw, stats=_sw_stats: stats[SW_replace[sw] if sw in SW_replace_list else sw]
    )

In [None]:
# Per-product target statistics from the Train frame, as plain dicts for lookup.
product_time_spent = Train.groupby('product')['time_spent']
median_timespent_per_product_dict = product_time_spent.median().to_dict()
min_timespent_per_product_dict = product_time_spent.min().to_dict()
max_timespent_per_product_dict = product_time_spent.max().to_dict()

In [None]:

# Look up each row's product statistic; product values unseen in Train are first
# redirected to their substitute from replace_product. Missing keys raise KeyError.
for _product_col, _product_stats in (
    ('med_time_per_product', median_timespent_per_product_dict),
    ('min_time_per_product', min_timespent_per_product_dict),
    ('max_time_per_product', max_timespent_per_product_dict),
):
    df[_product_col] = df['product'].apply(
        lambda prod, stats=_product_stats: stats[
            replace_product[prod] if prod in product_replace_list else prod
        ]
    )

**Dropping features that are no longer required**

In [None]:
# Column dtypes and non-null counts after feature engineering.
df.info()

In [None]:
# Raw identifier / text / date columns are no longer needed once features are derived.
# NOTE: re-running this cell raises KeyError because the columns are already gone.
drop_cols = ['session_id', 'client_agent', 'device_details', 'date']
df = df.drop(columns=drop_cols)

In [None]:
# Remaining column names after the drop.
print(df.columns)

In [None]:
# Names of all target-statistic features: four stats for each date/device grouping,
# followed by the median/min/max lookups for SW and product.
cols_flt = [
    f'{stat}_timespent_per_{group}'
    for group in ('month', 'Quarter', 'weekday', 'device', 'Browser')
    for stat in ('min', 'max', 'mean', 'median')
]
cols_flt += [
    'med_time_per_SW', 'min_time_per_SW', 'max_time_per_SW',
    'med_time_per_product', 'min_time_per_product', 'max_time_per_product',
]
# Store them as float32.
df = df.astype({name: 'float32' for name in cols_flt})

In [None]:
# Weekend_FLG holds the strings '1' / '0'. Casting strings straight to bool marks
# every non-empty string (including '0') as True, which makes the flag constant.
# Going through int first maps '0' -> False and '1' -> True; this also works if
# the column is already int or bool, so the cell can be re-run safely.
df['Weekend_FLG'] = df['Weekend_FLG'].astype(int).astype(bool)

In [None]:
# Integer-encode the four categorical text columns with the shared LabelEncoder.
# The encoder is refitted per column, so afterwards it only remembers 'Browser'.
for cat_col in ('SW', 'product', 'device', 'Browser'):
    df[cat_col] = le.fit_transform(df[cat_col])

In [None]:
# Undo the concatenation: the first len(Train) rows are train, the remainder is test.
n_train_rows = Train.shape[0]
train_proc = df.iloc[:n_train_rows]
test_proc = df.iloc[n_train_rows:].reset_index(drop=True)

In [None]:
# Every column except the target is used as a model input.
target = 'time_spent'
features = [name for name in df.columns if name != target]

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed).
trn, val = train_test_split(train_proc, test_size=0.2, random_state=1999)
# Model inputs and targets for the training and validation parts
X_trn, y_trn = trn[features], trn[target]
X_val, y_val = val[features], val[target]
# Features of the test rows that will be predicted
X_test = test_proc[features]

# Model building

In [None]:
%%time
# Baseline LightGBM with default hyper-parameters, scored on the hold-out split.
lgb = LGBMRegressor(random_state=1999)

lgb.fit(X_trn, y_trn)

# abs() keeps the log-scale predictions non-negative.
preds = np.abs(lgb.predict(X_val))

# The target is log1p(time_spent), so this RMSE is the RMSLE on the raw scale;
# the printed label now says "root" to match the value.
error = np.sqrt(mean_squared_error(y_val, preds))
print(f'root_mean_squared_log_error is : {error}')

In [None]:
%%time
# Baseline XGBoost with default hyper-parameters, scored on the hold-out split.
xgb = XGBRegressor()

xgb.fit(X_trn, y_trn)

# abs() keeps the log-scale predictions non-negative.
preds = np.abs(xgb.predict(X_val))

# The target is log1p(time_spent), so this RMSE is the RMSLE on the raw scale;
# the printed label now says "root" to match the value.
error = np.sqrt(mean_squared_error(y_val, preds))

print(f'root_mean_squared_log_error is : {error}')

In [None]:
%%time

# Baseline random forest with default hyper-parameters, scored on the hold-out split.
rf = RandomForestRegressor(random_state = 1999, n_jobs = -1)

rf.fit(X_trn, y_trn)

# abs() keeps the log-scale predictions non-negative.
preds = np.abs(rf.predict(X_val))

# The target is log1p(time_spent), so this RMSE is the RMSLE on the raw scale;
# the printed label now says "root" to match the value.
error = np.sqrt(mean_squared_error(y_val, preds))

print(f'root_mean_squared_log_error is : {error}')

In [None]:
def cross_val(regressor, train, test, features, name):
    """Stratified 5-fold cross-validation for estimators that support early stopping.

    The target column is taken from the module-level name ``target``. Folds are
    stratified on the target cut into (up to) 10 quantile bins. In every fold the
    regressor is refitted with the validation fold as ``eval_set`` and
    ``early_stopping_rounds=50``; features are standard-scaled per fold unless
    ``name == 'cat'``.

    Parameters
    ----------
    regressor : estimator whose ``fit`` accepts ``eval_set``,
        ``early_stopping_rounds`` and ``verbose``.
    train : DataFrame containing ``features`` and the target column.
    test : DataFrame containing ``features``.
    features : list of column names used as model inputs.
    name : short model tag; only the value ``'cat'`` changes behaviour (no scaling).

    Returns
    -------
    (oofs, preds) : out-of-fold predictions for ``train`` and the fold-averaged
        predictions for ``test``, both as 1-D numpy arrays of absolute values.
    """
    N_splits = 5

    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))

    target_col = train[target]
    # Loop-invariant slices, taken once instead of once per fold.
    X_all = train[features]
    X_test_raw = test[features]

    folds = StratifiedKFold(n_splits = N_splits, shuffle = True,random_state = 1999)
    # Bin the continuous target so that StratifiedKFold can balance it across folds.
    stratified_target = pd.qcut(target_col, 10, labels=False, duplicates='drop')
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n==================Fold{index + 1}=============================')

        #### Train Set
        X_trn, y_trn = X_all.iloc[trn_idx], target_col.iloc[trn_idx]

        #### Validation Set
        X_val, y_val = X_all.iloc[val_idx], target_col.iloc[val_idx]

        #### Test Set (re-bound every fold because scaling below replaces it)
        X_test = X_test_raw

        if name != 'cat':
            #### Scaling Data: fitted on the training fold only ####
            scaler = StandardScaler()
            _ = scaler.fit(X_trn)
            X_trn = scaler.transform(X_trn)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)

        ############ Fitting #############
        _ = regressor.fit(X_trn, y_trn, eval_set = [(X_val, y_val)],
                          early_stopping_rounds = 50, verbose = False)

        ############ Predicting #############
        val_preds = np.abs(regressor.predict(X_val))
        test_preds = np.abs(regressor.predict(X_test))
        error = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n Root Log Mean Squared Error for Validation set is : {error}')

        oofs[val_idx] = val_preds
        preds += test_preds / N_splits

    total_error = np.sqrt(mean_squared_error(target_col, oofs))
    # The original literal contained the invalid escape sequence '\R'.
    print(f'\n Root Log Mean Squared Error for oofs is {total_error}')

    return oofs, preds
        

In [None]:
def normal_cross_val(regressor, train, test, features):
    """Stratified 5-fold cross-validation for estimators with a plain ``fit(X, y)``.

    The target column is taken from the module-level name ``target``. Folds are
    stratified on the target cut into (up to) 10 quantile bins. Features are
    standard-scaled in every fold with a scaler fitted on the training fold.

    Parameters
    ----------
    regressor : estimator with ``fit(X, y)`` and ``predict(X)``.
    train : DataFrame containing ``features`` and the target column.
    test : DataFrame containing ``features``.
    features : list of column names used as model inputs.

    Returns
    -------
    (oofs, preds) : out-of-fold predictions for ``train`` and the fold-averaged
        predictions for ``test``, both as 1-D numpy arrays of absolute values.
    """
    N_splits = 5
    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))
    target_col = train[target]
    # Loop-invariant slices, taken once instead of once per fold.
    X_all = train[features]
    X_test_raw = test[features]
    folds = StratifiedKFold(n_splits = N_splits, shuffle = True,random_state = 1999)
    # Bin the continuous target so that StratifiedKFold can balance it across folds.
    stratified_target = pd.qcut(target_col, 10, labels=False, duplicates='drop')
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n===================Fold{index + 1}=======================')
        #### Train Set
        X_trn, y_trn = X_all.iloc[trn_idx], target_col.iloc[trn_idx]
        #### Validation Set
        X_val, y_val = X_all.iloc[val_idx], target_col.iloc[val_idx]
        #### Scaling Data: fitted on the training fold only ####
        scaler = StandardScaler()
        _ = scaler.fit(X_trn)

        X_trn = scaler.transform(X_trn)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test_raw)
        ############ Fitting #############
        _ = regressor.fit(X_trn, y_trn)

        ############ Predicting #############
        val_preds = np.abs(regressor.predict(X_val))
        test_preds = np.abs(regressor.predict(X_test))

        error = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n Root Log Mean Squared Error for Validation set is : {error}')
        oofs[val_idx] = val_preds
        preds += test_preds / N_splits

    total_error = np.sqrt(mean_squared_error(target_col, oofs))
    # The original literal contained the invalid escape sequence '\R'.
    print(f'\n Root Log Mean Squared Error for oofs is {total_error}')

    return oofs, preds

# Predicting With Tuned Models

In [None]:
%%time
# 5-fold CV of the random forest currently bound to `rf`.
rf_oofs, rf_preds = normal_cross_val(rf, train_proc, test_proc, features)

In [None]:
%%time
# 5-fold CV (with early stopping) of the LightGBM model currently bound to `lgb`.
lgb_oofs, lgb_preds = cross_val(lgb, train_proc, test_proc, features, 'lgb')

In [None]:
%%time
# 5-fold CV (with early stopping) of the XGBoost model currently bound to `xgb`.
xgb_oofs, xgb_preds = cross_val(xgb, train_proc, test_proc, features, 'xgb')

In [None]:
import optuna
from optuna.samplers import TPESampler

In [None]:
%%time

def create_model(trial):
    """Build a RandomForestRegressor from hyper-parameters sampled by ``trial``."""
    max_depth = trial.suggest_int("max_depth", 2, 35)
    n_estimators = trial.suggest_int("n_estimators", 700, 1500)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 5)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 100,10000)
    # suggest_float replaces the deprecated suggest_uniform (same uniform range).
    max_features = trial.suggest_float('max_features', 0.1, 0.9)
    model = RandomForestRegressor(
        max_depth=max_depth,
        n_estimators = n_estimators,
        min_samples_split = min_samples_split,
        max_leaf_nodes = max_leaf_nodes,
        max_features = max_features,
        random_state=1999,
        bootstrap = True,
        n_jobs = -1
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    """Fit the sampled model on the training split and return its hold-out RMSE."""
    model = create_model(trial)
    model.fit(X_trn, y_trn)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=40)

# Refit the best configuration. best_params only holds the sampled values, so the
# fixed settings used during the trials are added back explicitly.
rf_params = study.best_params
rf_params['random_state'] = 1999
rf_params['n_jobs'] = -1  # same parallelism as the trial models
rf = RandomForestRegressor(**rf_params)
rf.fit(X_trn, y_trn)
preds = rf.predict(X_val)
print('Optimized RF RMSLE', np.sqrt(mean_squared_error(y_val, preds)))

In [None]:
%%time

def create_model(trial):
    """Build an LGBMRegressor from hyper-parameters sampled by ``trial``."""
    max_depth = trial.suggest_int("max_depth", 2, 40)
    n_estimators = trial.suggest_int("n_estimators", 700, 2000)
    # suggest_float replaces the deprecated suggest_uniform (same uniform ranges).
    learning_rate = trial.suggest_float('learning_rate', 0.1, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 0.9)
    num_leaves = trial.suggest_int("num_leaves", 2, 500)
    #min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    reg_alpha = trial.suggest_float("reg_alpha", 0.1, 0.9)
    reg_lambda = trial.suggest_float("reg_lambda", 0.1, 0.9)
    model = LGBMRegressor(
        max_depth=max_depth,
        n_estimators = n_estimators,
        learning_rate=learning_rate,
        colsample_bytree = colsample_bytree,
        num_leaves=num_leaves,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        #min_child_samples=min_child_samples,
        random_state=1999,
        n_jobs = -3
    )
    return model
sampler = TPESampler(seed=0)
def objective(trial):
    """Fit the sampled model with early stopping and return its hold-out RMSE.

    The hold-out split is used both for early stopping and for scoring.
    """
    model = create_model(trial)
    model.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=60)

# Refit the best configuration with the same seed as the trial models.
lgb_params = study.best_params
lgb_params['random_state'] = 1999
lgb = LGBMRegressor(**lgb_params)
lgb.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
preds = lgb.predict(X_val)
print('Optimized LGBM RMSLE', np.sqrt(mean_squared_error(y_val, preds)))

In [None]:
%%time

def create_model(trial):
    """Build an XGBRegressor from hyper-parameters sampled by ``trial``."""
    max_depth = trial.suggest_int("max_depth", 7, 15)
    n_estimators = trial.suggest_int("n_estimators", 500, 1500)
    # suggest_float replaces the deprecated suggest_uniform (same uniform ranges).
    learning_rate = trial.suggest_float('learning_rate', 0.1, 1)
    subsample = trial.suggest_float('subsample', 0.1, 0.99)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 0.9)
    colsample_bylevel = trial.suggest_float('colsample_bylevel', 0.1, 0.9)
    #num_leaves = trial.suggest_int("num_leaves", 2, 5000)
    #min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    reg_alpha = trial.suggest_int("reg_alpha", 1, 10)
    reg_lambda = trial.suggest_int("reg_lambda", 1, 10)
    model = XGBRegressor(
        max_depth = max_depth,
        n_estimators = n_estimators,
        learning_rate=learning_rate,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        colsample_bylevel = colsample_bylevel,
        # The regularisation terms were sampled but never handed to the model, so the
        # study scored models without them while best_params later applied them to
        # the final model. Passing them here makes trials and final model agree.
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        #num_leaves=num_leaves,
        #min_child_samples=min_child_samples,
        random_state=0,
        n_jobs = -3
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    """Fit the sampled model with early stopping and return its hold-out RMSE.

    The hold-out split is used both for early stopping and for scoring.
    """
    model = create_model(trial)
    model.fit(X_trn, y_trn, eval_set = [ (X_val, y_val)], early_stopping_rounds = 50, verbose = False)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)

# Refit the best configuration with the same seed as the trial models.
xgb_params = study.best_params
xgb_params['random_state'] = 0
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
preds = xgb.predict(X_val)
print('Optimized XGB RMSLE', np.sqrt(mean_squared_error(y_val, preds)))


In [None]:
#%%time  I will check later 

#models = {'lgb' : lgb, 'rf' : rf, 'xgb' : xgb}

#for name,model in models.items():
    #error = predict(model, name)
    #print(f'Error for {name} is {error}')

# All Tuned Models
LGB

In [None]:
# 1. Manually tuned LightGBM: 1000 trees, learning rate 0.13, 70 leaves, depth cap 31,
#    L2 (reg_lambda) 0.3 and L1 (reg_alpha) 0.7.
lgb_1 = LGBMRegressor(random_state=1999,n_estimators=1000, learning_rate=0.13,num_leaves=70,max_depth=31,
               reg_lambda=0.3, reg_alpha = 0.7)

In [None]:
# 2. LightGBM with hyper-parameters copied from an Optuna run (no random_state set)
params = dict(
    max_depth=29,
    n_estimators=868,
    learning_rate=0.10130592168165514,
    colsample_bytree=0.29840872430993026,
    num_leaves=338,
    reg_alpha=0.7919788672424208,
    reg_lambda=0.5736739628502263,
)

lgb_2 = LGBMRegressor(**params)

In [None]:
# LightGBM with a second hard-coded hyper-parameter set (no random_state set)
params = dict(
    max_depth=31,
    n_estimators=768,
    learning_rate=0.10395358602462655,
    colsample_bytree=0.3004582369227073,
    num_leaves=332,
    reg_alpha=0.7969567856974819,
    reg_lambda=0.6432689443285323,
)

lgb_3 = LGBMRegressor(**params)

In [None]:
# XGBoost, hand-set configuration with 500 trees
params = dict(
    max_depth=9,
    n_estimators=500,
    learning_rate=0.1,
    booster='gbtree',
    n_jobs=-1,
    subsample=0.9,
    colsample_bytree=0.8,
    colsample_bylevel=0.6,
    random_state=0,
)
xgb_1 = XGBRegressor(**params)

In [None]:
# XGBoost with a hard-coded hyper-parameter set (depth 8, 1156 trees)
params = dict(
    max_depth=8,
    n_estimators=1156,
    learning_rate=0.1282423144462752,
    subsample=0.8583044649709827,
    colsample_bytree=0.39430648031413884,
    colsample_bylevel=0.439284444843544,
    random_state=0,
)
xgb_2 = XGBRegressor(**params)

In [None]:
# XGBoost with a hard-coded hyper-parameter set (depth 9, 1329 trees)
params = dict(
    max_depth=9,
    n_estimators=1329,
    learning_rate=0.10067225176673156,
    subsample=0.9010792397620144,
    colsample_bytree=0.4501213056757911,
    colsample_bylevel=0.75993128190449555,
    random_state=0,
)
xgb_3 = XGBRegressor(**params)

In [None]:
# XGBoost, hand-set configuration with 2000 trees
params = dict(
    max_depth=9,
    n_estimators=2000,
    learning_rate=0.1,
    booster='gbtree',
    n_jobs=-1,
    subsample=0.9,
    colsample_bytree=0.8,
    colsample_bylevel=0.6,
    random_state=0,
)
xgb_4 = XGBRegressor(**params)

In [None]:
# Random forest, hand-set configuration: 2000 trees, sqrt feature subsampling
params = dict(
    max_depth=30,
    n_estimators=2000,
    min_samples_split=2,
    max_features='sqrt',
    max_leaf_nodes=8000,
    bootstrap=True,
    random_state=1999,
    n_jobs=-1,
)

rf_1 = RandomForestRegressor(**params)

In [None]:
# Random forest with a hard-coded hyper-parameter set (depth 31, 1317 trees)
params = dict(
    max_depth=31,
    n_estimators=1317,
    min_samples_split=2,
    max_leaf_nodes=6653,
    max_features=0.6297197869507615,
    bootstrap=True,
    random_state=1999,
    n_jobs=-1,
)

rf_2 = RandomForestRegressor(**params)

In [None]:
# Bagging ensemble of 20 copies of lgb_1, each drawn (with replacement) on 99% of rows and features.
# NOTE(review): newer scikit-learn releases name this argument `estimator` - confirm the installed version.
bag_1 = BaggingRegressor(base_estimator=lgb_1,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)


In [None]:
# Bagging ensemble of 20 copies of lgb_2, each drawn (with replacement) on 99% of rows and features.
bag_2 = BaggingRegressor(base_estimator=lgb_2,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)

In [None]:
# Bagging ensemble of 20 copies of xgb_1, each drawn (with replacement) on 99% of rows and features.
bag_3 = BaggingRegressor(base_estimator=xgb_1,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)

In [None]:
# Bagging ensemble of 20 copies of xgb_3, each drawn (with replacement) on 99% of rows and features.
bag_4 = BaggingRegressor(base_estimator=xgb_3,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)

In [None]:
# Bagging ensemble of 20 copies of xgb_4, each drawn (with replacement) on 99% of rows and features.
bag_5 = BaggingRegressor(base_estimator=xgb_4,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)

# Predicting With All Models

In [None]:
# Level-1 frames start with only the target column; base-model predictions are appended later.
train_new, test_new = (frame[[target]].copy() for frame in (train_proc, test_proc))

In [None]:
%%time
# Out-of-fold and test predictions for lgb_1.
lgb_1_oofs, lgb_1_preds = cross_val(lgb_1, train_proc, test_proc, features, 'lgb')

In [None]:
%%time
# Out-of-fold and test predictions for lgb_2.
lgb_2_oofs, lgb_2_preds = cross_val(lgb_2, train_proc, test_proc, features, 'lgb')

In [None]:
%%time
# Out-of-fold and test predictions for lgb_3.
lgb_3_oofs, lgb_3_preds = cross_val(lgb_3, train_proc, test_proc, features, 'lgb')


In [None]:
# Store the LightGBM out-of-fold / test predictions as level-1 features.
for _tag, _oof_values, _test_values in (
    ('lgb_1', lgb_1_oofs, lgb_1_preds),
    ('lgb_2', lgb_2_oofs, lgb_2_preds),
    ('lgb_3', lgb_3_oofs, lgb_3_preds),
):
    train_new[_tag] = _oof_values
    test_new[_tag] = _test_values

In [None]:
%%time
# Out-of-fold and test predictions for xgb_1.
xgb_1_oofs, xgb_1_preds = cross_val(xgb_1, train_proc, test_proc, features, 'xgb')


In [None]:
%%time
# Out-of-fold and test predictions for xgb_2.
xgb_2_oofs, xgb_2_preds = cross_val(xgb_2, train_proc, test_proc, features, 'xgb')


In [None]:
%%time
# Out-of-fold and test predictions for xgb_3.
xgb_3_oofs, xgb_3_preds = cross_val(xgb_3, train_proc, test_proc, features, 'xgb')

In [None]:
%%time
# Out-of-fold and test predictions for xgb_4.
xgb_4_oofs, xgb_4_preds = cross_val(xgb_4, train_proc, test_proc, features, 'xgb')


In [None]:
# Store the XGBoost out-of-fold / test predictions as level-1 features.
for _tag, _oof_values, _test_values in (
    ('xgb_1', xgb_1_oofs, xgb_1_preds),
    ('xgb_2', xgb_2_oofs, xgb_2_preds),
    ('xgb_3', xgb_3_oofs, xgb_3_preds),
    ('xgb_4', xgb_4_oofs, xgb_4_preds),
):
    train_new[_tag] = _oof_values
    test_new[_tag] = _test_values

In [None]:
%%time
# Out-of-fold and test predictions for rf_1.
rf_1_oofs, rf_1_preds = normal_cross_val(rf_1, train_proc, test_proc, features)

In [None]:
%%time
# Out-of-fold and test predictions for rf_2.
rf_2_oofs, rf_2_preds = normal_cross_val(rf_2, train_proc, test_proc, features)


In [None]:
# Store the random-forest out-of-fold / test predictions as level-1 features.
for _tag, _oof_values, _test_values in (
    ('rf_1', rf_1_oofs, rf_1_preds),
    ('rf_2', rf_2_oofs, rf_2_preds),
):
    train_new[_tag] = _oof_values
    test_new[_tag] = _test_values

In [None]:
%%time
# Out-of-fold and test predictions for bag_1.
bag_1_oofs, bag_1_preds = normal_cross_val(bag_1, train_proc, test_proc, features)


In [None]:
%%time
# Out-of-fold and test predictions for bag_2.
bag_2_oofs, bag_2_preds = normal_cross_val(bag_2, train_proc, test_proc, features)


In [None]:
%%time
# Out-of-fold and test predictions for bag_3.
bag_3_oofs, bag_3_preds = normal_cross_val(bag_3, train_proc, test_proc, features)

In [None]:
%%time
# Out-of-fold and test predictions for bag_4.
bag_4_oofs, bag_4_preds = normal_cross_val(bag_4, train_proc, test_proc, features)


In [None]:
%%time
# Out-of-fold and test predictions for bag_5.
bag_5_oofs, bag_5_preds = normal_cross_val(bag_5, train_proc, test_proc, features)


In [None]:
# Store the bagging out-of-fold / test predictions as level-1 features.
for _tag, _oof_values, _test_values in (
    ('bag_1', bag_1_oofs, bag_1_preds),
    ('bag_2', bag_2_oofs, bag_2_preds),
    ('bag_3', bag_3_oofs, bag_3_preds),
    ('bag_4', bag_4_oofs, bag_4_preds),
    ('bag_5', bag_5_oofs, bag_5_preds),
):
    train_new[_tag] = _oof_values
    test_new[_tag] = _test_values

In [None]:
# Level-1 inputs: every base-model prediction column (all columns except the target).
ens_features = [name for name in train_new.columns if name != target]

In [None]:
%%time
# Level-1 learner: default LightGBM trained on the base-model predictions.
level_1_lgb_oofs, level_1_lgb_preds = cross_val(LGBMRegressor(),
                                                train_new, test_new, 
                                                ens_features, 'lgb')


In [None]:
%%time
# Level-1 learner: the xgb_1 configuration, refitted on the base-model predictions.
level_1_xgb_oofs, level_1_xgb_preds = cross_val(xgb_1, train_new, test_new, ens_features, 'xgb')

In [None]:
%%time
# Level-1 learner: the rf_1 configuration, refitted on the base-model predictions.
level_1_rf_1_oofs, level_1_rf_1_preds = normal_cross_val(rf_1, train_new, test_new, ens_features)


In [None]:
%%time
# Level-1 learner: bagging (default settings) over a default LightGBM.
level_1_lgb_bag_oofs, level_1_lgb_bag_preds =normal_cross_val(BaggingRegressor
                                                              (base_estimator = LGBMRegressor()),
                                                              train_new, test_new, ens_features)

In [None]:
%%time
# Level-1 learner: bagging (default settings) over the xgb_1 configuration.
level_1_xgb_bag_oofs, level_1_xgb_bag_preds = normal_cross_val(BaggingRegressor
                                                               (base_estimator = xgb_1),
                                                               train_new, test_new,
                                                               ens_features)

In [None]:
%%time
# Level-1 learner: bagging (default settings) over a 1000-tree random forest.
# Note that this rebinds the name `rf`.
params = dict(
    max_depth=30,
    n_estimators=1000,
    min_samples_split=2,
    max_features='sqrt',
    max_leaf_nodes=8000,
    bootstrap=True,
    random_state=1999,
    n_jobs=-1,
)

rf = RandomForestRegressor(**params)

rf_bagging = BaggingRegressor(base_estimator = rf)
level_1_rf_bag_oofs, level_1_rf_bag_preds = normal_cross_val(
    rf_bagging, train_new, test_new, ens_features
)


In [None]:
# Level-2 frames: the target plus one column per level-1 learner.
ens_train_new = train_proc[[target]].copy()
ens_test_new = test_proc[[target]].copy()

for _tag, _oof_values, _test_values in (
    ('lgb', level_1_lgb_oofs, level_1_lgb_preds),
    ('xgb', level_1_xgb_oofs, level_1_xgb_preds),
    ('rf', level_1_rf_1_oofs, level_1_rf_1_preds),
    ('lgb_bag', level_1_lgb_bag_oofs, level_1_lgb_bag_preds),
    ('rf1_bag', level_1_rf_bag_oofs, level_1_rf_bag_preds),
    ('xgb_bag', level_1_xgb_bag_oofs, level_1_xgb_bag_preds),
):
    ens_train_new[_tag] = _oof_values
    ens_test_new[_tag] = _test_values


In [None]:
# Level-2 inputs: every level-1 prediction column (all columns except the target).
ens_lvl_2_features = [name for name in ens_train_new.columns if name != target]


Level 2 stacking

In [None]:
# Level-2 meta-learner 1: ordinary least squares on the level-1 predictions.
# (`clf` holds a regressor here, despite the classifier-style name.)
from sklearn.linear_model import LinearRegression,Ridge,ARDRegression,SGDRegressor
clf = LinearRegression()

ens_linear_oofs, ens_linear_preds = normal_cross_val(clf,
                                                     ens_train_new, ens_test_new, ens_lvl_2_features)


In [None]:
# Level-2 meta-learner 2: ridge regression (default settings) on the level-1 predictions.
clf = Ridge()

ens_ridge_oofs, ens_ridge_preds = normal_cross_val(clf,
                                                   ens_train_new, ens_test_new, ens_lvl_2_features)


In [None]:
# Level-2 meta-learner 3: ARD regression on the level-1 predictions.
# NOTE(review): `normalize` is not accepted by recent scikit-learn releases - confirm the
# installed version; normal_cross_val already standard-scales the inputs per fold.
clf = ARDRegression(normalize = True)

ens_ARD_oofs, ens_ARD_preds = normal_cross_val(clf,
                                               ens_train_new, ens_test_new, ens_lvl_2_features)


In [None]:
# Final blend of the three level-2 predictions with fixed weights 0.60 / 0.20 / 0.20
# (still on the log1p scale).
preds = ens_linear_preds*0.60 + ens_ridge_preds*0.20 + ens_ARD_preds*0.20

#sample_sub['time_spent']=np.abs((np.exp(preds)-1))

#sample_sub.to_csv('/kaggle/working/Submission.csv', index=False)
#sample_sub.to_csv(path + '\\Stacking.csv', index = False)

In [None]:
# Empty submission frame with the single expected column.
sample_sub=pd.DataFrame(columns=['time_spent'])

In [None]:
# Map the blended predictions back from the log1p scale to seconds.
# np.expm1 is the exact inverse of np.log1p and is more accurate than exp(x) - 1
# for values close to zero; abs() guards against a negative result.
sample_sub['time_spent'] = np.abs(np.expm1(preds))

sample_sub.to_csv('/kaggle/working/Submission.csv', index=False)


product2.5.1 product3.2.1 product3.4.0
train these are iphone 

product3.3.1- 3.4 last one 


ipad7.1 ipad7.1.1 iphone7.0 safari534.51.22 safari534.52.7----  product 
The device is an iPhone for the first 3, so replace them with some iPhone 7 model that is present in train.

safari which device model need to see 
safari537.36 
safari537.73.11 - two are in train data 

534- is desktop mozilaa5.0 safari browser same desk top safari 

train data for ipad7 
product4.0.0	ipad7.1.2- train data 
product4.2.0	iphone7.1.1
product4.2.0	iphone7.0.4

safari534.57.2 train 4



info on time spent median to mean vale 
4.656414	product150264 Android Phone - Android  649	

7.64116	4.06065	6.66716	- these are time spent values for product150264
its product is typemobile
Android Phone	Android 4th one 	


In [None]:
#sns.heatmap(df.corr(), annot=True, cmap='RdYlGn')

1. convert to boolean
2. convert text to number - device details 
3. like adress and city - find train and test differences 
4. client agent group by time spent for train test 
6. group by device details 
5. gruop by quarter / month / day of week 

need to check if they need to be done on train and test separate ly 
what is my train set 
what is my test set 
how it has been done in address tab 
how test time spent is filled  
 



2. train test - need to extract devide details - check any values of test absent from train 
3. then group by men median min max wrt time spent 
4. map those who are new in test from train values 
5. convert these values to lebel transform. 

1. train test - need to extract client details - check any values of test absent from train
3. then group by men median min max wrt time spent 
4. map those who are new in test from train values 
5. convert these values to lebel transform. 

10. group by month and quarter 

9. change the data types accordingly - groups to float 
6. delete the columns not required 

In [None]:
#Train=Train.sort_values(by="date")
#Train.date.head().T

In [None]:
#time= go.Scatter(x=Train.date,
                 #y=Train.time_spent.values)
                
#layout = go.Layout(title='Time spent on buying', xaxis=dict(title='Date'),
                   #yaxis=dict(title='(time seconds)'))

#fig = go.Figure(data=[time], layout=layout)
#py.iplot(fig, filename='h2o-plots')