In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Raw (un-flattened) copies of the competition files.
# fullVisitorId is read as text, matching load_df: parsed as a number the id
# would lose leading zeros and could lose precision.
# NOTE(review): no later visible cell reads df_train / df_test -- confirm
# whether this second read of both files is still needed.
df_train = pd.read_csv("/kaggle/input/ga-customer-revenue-prediction/train.csv",
                       dtype={"fullVisitorId": "str"})

df_test = pd.read_csv("/kaggle/input/ga-customer-revenue-prediction/test.csv",
                      dtype={"fullVisitorId": "str"})




In [None]:
#convert json columns

import os
import json

# pandas.io.json.json_normalize was deprecated in pandas 1.0 in favour of the
# top-level pandas.json_normalize; fall back only on installs that lack it.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize


def load_df(csv_path='/kaggle/input/ga-customer-revenue-prediction/train.csv', nrows=None):
    """Read a sessions CSV and flatten its JSON columns into plain columns.

    Each of the columns 'device', 'geoNetwork', 'totals' and 'trafficSource'
    is parsed with json.loads, expanded with json_normalize, and replaced by
    columns named "<column>.<key>" (nested keys are joined with dots).

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Number of rows to read; None reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. 'fullVisitorId' is kept as text.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},  # keep ids as text, not numbers
                     nrows=nrows)

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        # Row order is preserved by json_normalize; share the index explicitly
        # so the join below is a pure row-by-row alignment.
        column_as_df.index = df.index
        df = df.drop(columns=column).join(column_as_df)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [None]:
#load the train and test dataset

# Flatten the train file first, then the test file.
train_df, test_df = (
    load_df(csv_file)
    for csv_file in (
        '/kaggle/input/ga-customer-revenue-prediction/train.csv',
        '/kaggle/input/ga-customer-revenue-prediction/test.csv',
    )
)

In [None]:
#better description for dataset

from scipy import stats


def DataDesc(df):
    """Print the frame's shape and return a one-row-per-column summary.

    The summary has the columns Name, dtypes, Missing (null count), Uniques
    (distinct non-null values), First/Second/Third Value (the first three
    rows by position) and Entropy (base-2 entropy of the column's value
    distribution, rounded to two decimals).

    Rows are taken by position, so the frame does not need a 0..n-1 index;
    when the frame has fewer than three rows the missing example values are
    NaN.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        summary[label] = df.iloc[position].values if position < len(df) else np.nan

    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]

    return summary

In [None]:
#describe test data
DataDesc(test_df)


In [None]:
#describe train data
DataDesc(train_df)


In [None]:
#imputation of null values and converting the columns values to int in train dataset

def fill_na(df):
    """Impute missing values and set dtypes on a flattened sessions frame.

    The frame is modified in place and also returned.

    * totals.pageviews -> missing becomes 1, stored as int
    * totals.newVisits, totals.bounces -> missing becomes 0, stored as int
    * totals.transactionRevenue -> missing becomes 0.0, stored as float
      (skipped when the column is absent)
    * trafficSource.isTrueDirect -> missing becomes False
    * trafficSource.adwordsClickInfo.isVideoAd -> missing becomes True
    * geoNetwork.city -> missing and "(not set)" both become the text "NaN"

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that pattern is deprecated chained assignment and
    # does nothing under pandas Copy-on-Write.
    for col, default in (('totals.pageviews', 1),
                         ('totals.newVisits', 0),
                         ('totals.bounces', 0)):
        df[col] = pd.to_numeric(df[col]).fillna(default).astype(int)

    if 'totals.transactionRevenue' in df.columns:
        df['totals.transactionRevenue'] = (
            pd.to_numeric(df['totals.transactionRevenue']).fillna(0.0).astype(float)
        )

    df['trafficSource.isTrueDirect'] = df['trafficSource.isTrueDirect'].fillna(False)
    # Missing video-ad flag is filled with True.
    df['trafficSource.adwordsClickInfo.isVideoAd'] = (
        df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)
    )

    # The mask is computed on the frame being cleaned (not a global frame) and
    # applied in one step, so "(not set)" really is treated as missing.
    city = df['geoNetwork.city']
    df['geoNetwork.city'] = city.where(city.notna() & (city != "(not set)"), "NaN")

    return df

# fill_na modifies the frame it is given and returns that same frame, so `df`
# is another name for `train_df`, not a copy.
df = fill_na(train_df)

In [None]:
#imputation of null values and converting the columns values to int in test data

def fill_na(df1):
    """Impute missing values and set dtypes on the flattened test frame.

    The frame is modified in place and also returned. No revenue column is
    touched.

    * totals.pageviews -> missing becomes 1, stored as int
    * totals.newVisits, totals.bounces -> missing becomes 0, stored as int
    * trafficSource.isTrueDirect -> missing becomes False
    * trafficSource.adwordsClickInfo.isVideoAd -> missing becomes True
    * geoNetwork.city -> missing and "(not set)" both become the text "NaN"

    Parameters
    ----------
    df1 : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that pattern is deprecated chained assignment and
    # does nothing under pandas Copy-on-Write.
    for col, default in (('totals.pageviews', 1),
                         ('totals.newVisits', 0),
                         ('totals.bounces', 0)):
        df1[col] = pd.to_numeric(df1[col]).fillna(default).astype(int)

    df1['trafficSource.isTrueDirect'] = df1['trafficSource.isTrueDirect'].fillna(False)
    # Missing video-ad flag is filled with True.
    df1['trafficSource.adwordsClickInfo.isVideoAd'] = (
        df1['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)
    )

    # The mask comes from df1 itself; it was previously built from the train
    # frame and assigned through chained indexing, which changed nothing.
    city = df1['geoNetwork.city']
    df1['geoNetwork.city'] = city.where(city.notna() & (city != "(not set)"), "NaN")

    return df1

# fill_na modifies the frame it is given and returns that same frame, so `df1`
# is another name for `test_df`, not a copy.
df1 = fill_na(test_df)

In [None]:
df1.info()

In [None]:
# Cast the two id columns to float in the train frame, then in the test frame.
# NOTE(review): float64 cannot represent every integer above 2**53, so long
# ids may collapse onto the same value -- confirm ids are not used as join or
# group keys after this point.
for frame in (df, df1):
    for id_column in ('fullVisitorId', 'sessionId'):
        frame[id_column] = frame[id_column].astype(float)

In [None]:
df1.info()

In [None]:
# Drop train columns that hold a single distinct value (NaN counts as a value).
constant_train_cols = [name for name in df.columns if len(df[name].unique()) == 1]
df.drop(columns=constant_train_cols, inplace=True)

In [None]:
# Drop test columns that hold a single distinct value (NaN counts as a value).
constant_test_cols = [name for name in df1.columns if len(df1[name].unique()) == 1]
df1.drop(columns=constant_test_cols, inplace=True)

In [None]:
df1.info()

In [None]:
# This function is to extract date features in train

from datetime import datetime


def date_process(df):
    """Add calendar features derived from 'date' and 'visitStartTime'.

    The frame is modified in place and also returned.

    * 'date' is parsed from YYYYMMDD into a pandas datetime column
    * 'weekday' (Monday = 0), 'day', 'month', 'year' are taken from 'date'
    * 'visitHour' is the hour of 'visitStartTime', read as seconds since the
      Unix epoch in UTC

    The hour is computed with pandas rather than datetime.fromtimestamp, so
    the result does not depend on the machine's local timezone or on what the
    name `datetime` is bound to when the function runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date' and 'visitStartTime'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
    df["weekday"] = df['date'].dt.weekday
    df["day"] = df['date'].dt.day
    df["month"] = df['date'].dt.month
    df["year"] = df['date'].dt.year
    df['visitHour'] = pd.to_datetime(df['visitStartTime'], unit='s').dt.hour.astype(int)

    return df
df = date_process(df)


In [None]:
# This function is to extract date features in test

from datetime import datetime


def df1_date(df1):
    """Add calendar features derived from 'date' and 'visitStartTime'.

    The frame is modified in place and also returned.

    * 'date' is parsed from YYYYMMDD into a pandas datetime column
    * 'weekday' (Monday = 0), 'day', 'month', 'year' are taken from 'date'
    * 'visitHour' is the hour of 'visitStartTime', read as seconds since the
      Unix epoch in UTC

    The hour is computed with pandas rather than datetime.fromtimestamp, so
    the result does not depend on the machine's local timezone or on what the
    name `datetime` is bound to when the function runs.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Must contain 'date' and 'visitStartTime'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df1["date"] = pd.to_datetime(df1["date"], format="%Y%m%d")
    df1["weekday"] = df1['date'].dt.weekday
    df1["day"] = df1['date'].dt.day
    df1["month"] = df1['date'].dt.month
    df1["year"] = df1['date'].dt.year
    df1['visitHour'] = pd.to_datetime(df1['visitStartTime'], unit='s').dt.hour.astype(int)

    return df1
df1 = df1_date(df1)


In [None]:
df.info()

In [None]:
df1.info()

In [None]:
# check Variables not in test but in train 

# Columns present in the train frame but missing from the test frame.
train_only_columns = set(df.columns) - set(df1.columns)
print("Variables not in test but in train : ", train_only_columns)


In [None]:
# Remove sessionId from both frames, and trafficSource.campaignCode from the
# train frame.
df = df.drop(columns=['sessionId', 'trafficSource.campaignCode'])
df1 = df1.drop(columns=['sessionId'])


In [None]:
# Fill missing revenue with 0, then pull out the target and the visitor ids
# as NumPy arrays.
# The filled column is assigned back: fillna(inplace=True) on a column
# selection is deprecated chained assignment and does nothing under pandas
# Copy-on-Write.
df["totals.transactionRevenue"] = df["totals.transactionRevenue"].fillna(0)
train_y = df["totals.transactionRevenue"].values
train_id = df["fullVisitorId"].values
test_id = df1["fullVisitorId"].values


In [None]:
df1.info()

In [None]:
from sklearn import model_selection, preprocessing, metrics

# Names of the categorical columns that get label-encoded.
df_cat = [
    "channelGrouping",
    "device.browser",
    'device.isMobile',
    "device.deviceCategory",
    "device.operatingSystem",
    "geoNetwork.city",
    "geoNetwork.continent",
    "geoNetwork.country",
    "geoNetwork.metro",
    "geoNetwork.networkDomain",
    "geoNetwork.region",
    "geoNetwork.subContinent",
    "trafficSource.adContent",
    "trafficSource.adwordsClickInfo.adNetworkType",
    "trafficSource.adwordsClickInfo.gclId",
    "trafficSource.adwordsClickInfo.page",
    "trafficSource.adwordsClickInfo.slot",
    "trafficSource.campaign",
    "trafficSource.keyword",
    "trafficSource.medium",
    "trafficSource.referralPath",
    "trafficSource.source",
    'trafficSource.adwordsClickInfo.isVideoAd',
    'trafficSource.isTrueDirect',
]






In [None]:
#label encoding the categorical variable in train and test

# Fit one encoder per column on the train and test values together, so both
# frames share the same integer codes.
for col in df_cat:
    print(col)
    train_labels = list(df[col].values.astype('str'))
    test_labels = list(df1[col].values.astype('str'))
    lbl = preprocessing.LabelEncoder().fit(train_labels + test_labels)
    df[col] = lbl.transform(train_labels)
    df1[col] = lbl.transform(test_labels)


In [None]:
# Cast the numeric feature columns to float in both frames.
num_cols = [
    "totals.hits", "totals.pageviews", "visitNumber",
    "visitStartTime", 'totals.bounces', 'totals.newVisits',
]
for frame in (df, df1):
    for col in num_cols:
        frame[col] = frame[col].astype(float)


In [None]:
#check variable not in test but in train

# Columns present in the train frame but missing from the test frame.
train_only_columns = set(df.columns) - set(df1.columns)
print("Variables not in test but in train : ", train_only_columns)


In [None]:
df.info()

In [None]:
import datetime

# Convert the train 'date' column to plain datetime.date objects, so it can be
# compared with datetime.date(...) values.
# NOTE(review): the `import datetime` in this cell rebinds the name `datetime`
# from the class imported earlier (`from datetime import datetime`) to the
# module; code that calls datetime.fromtimestamp(...) fails if run after this.

df["date"] = pd.to_datetime(df["date"]).dt.date




In [None]:
# Time-based split of the labelled data: rows dated up to 2017-05-31 are used
# for training, later rows for validation. 'date' is dropped from each part
# after the split.
split_date = datetime.date(2017, 5, 31)
X = df[df['date'] <= split_date].drop(['date'], axis=1)
# The validation part is dropped from the validation rows themselves; it was
# previously rebuilt from the whole of `df`, which put the training rows back
# into the validation set.
val_X = df[df['date'] > split_date].drop(['date'], axis=1)
# Test features come from the test frame `df1`, not from the labelled frame.
test_x = df1.drop(['date'], axis=1)

In [None]:
# Feature matrices: the train and validation parts without the target column.
x, val_x = (
    part.drop(columns=['totals.transactionRevenue'])
    for part in (X, val_X)
)


In [None]:
# Targets: log(1 + revenue) for the train and validation parts.
y, val_y = (
    np.log1p(part["totals.transactionRevenue"].values)
    for part in (X, val_X)
)

In [None]:
# Wrap both target arrays in single-column DataFrames.
y, val_y = pd.DataFrame(y), pd.DataFrame(val_y)

In [None]:
x.info()

In [None]:
#Feature selection for sessions and devices
# affects revenue -  Channelgrouping,  visitnumber, device browser, deviceOS, deviceis mobile, device category
# doesnt affects revenue - full visitid, visit id,visitstarttime

import matplotlib.pyplot as plt

# Scatter each of the first nine feature columns against the log-revenue
# target `y`, one panel per column.
# NOTE(review): columns are picked by position, so the selection changes if
# the column order of `x` changes.
device_session = x.iloc[:, 0:9]

plt.figure(figsize=(10, 30), facecolor='white')
# At most 12 panels are drawn on the 10x3 grid. A stray bare name (`location`)
# that raised NameError before the loop has been removed.
for plotnumber, column in enumerate(device_session.columns[:12], start=1):
    ax = plt.subplot(10, 3, plotnumber)
    ax.scatter(device_session[column], y)
    ax.set_xlabel(column, fontsize=10)
    ax.set_ylabel('revenue', fontsize=10)
plt.tight_layout()

In [None]:
#Feature selection for totals columns
# affects revenue -  total page views, total hits, totals new visits
# doesnt affects revenue - bounces
import matplotlib.pyplot as plt

# Scatter feature columns 16-19 of `x` against the log-revenue target `y`,
# one panel per column.
# NOTE(review): columns are picked by position, so the selection changes if
# the column order of `x` changes.
totals = x.iloc[:, 16:20]

plt.figure(figsize=(10, 10), facecolor='white')
# At most 5 panels are drawn on the 3x3 grid. A stray bare name (`location`)
# that raised NameError before the loop has been removed.
for plotnumber, column in enumerate(totals.columns[:5], start=1):
    ax = plt.subplot(3, 3, plotnumber)
    ax.scatter(totals[column], y)
    ax.set_xlabel(column, fontsize=10)
    ax.set_ylabel('revenue', fontsize=10)
plt.tight_layout()

In [None]:
#Feature selection for traffic source columns
# affects revenue -  all these columns somewhat affects the revenue 


import matplotlib.pyplot as plt

# Scatter feature columns 20-31 of `x` against the log-revenue target `y`,
# one panel per column.
# NOTE(review): columns are picked by position, so the selection changes if
# the column order of `x` changes.
traffic_source = x.iloc[:, 20:32]

plt.figure(figsize=(10, 30), facecolor='white')
# At most 12 panels are drawn on the 10x3 grid. A stray bare name (`location`)
# that raised NameError before the loop has been removed.
for plotnumber, column in enumerate(traffic_source.columns[:12], start=1):
    ax = plt.subplot(10, 3, plotnumber)
    ax.scatter(traffic_source[column], y)
    ax.set_xlabel(column, fontsize=10)
    ax.set_ylabel('revenue', fontsize=10)
plt.tight_layout()

In [None]:
#Feature selection for geo location columns
# affects revenue -  all these columns some what affects revenue

import matplotlib.pyplot as plt

# Scatter feature columns 9-15 of `x` against the log-revenue target `y`,
# one panel per column.
# NOTE(review): columns are picked by position, so the selection changes if
# the column order of `x` changes.
geo_location = x.iloc[:, 9:16]

plt.figure(figsize=(10, 30), facecolor='white')
# At most 12 panels are drawn on the 10x3 grid. A stray bare name (`location`)
# that raised NameError before the loop has been removed.
for plotnumber, column in enumerate(geo_location.columns[:12], start=1):
    ax = plt.subplot(10, 3, plotnumber)
    ax.scatter(geo_location[column], y)
    ax.set_xlabel(column, fontsize=10)
    ax.set_ylabel('revenue', fontsize=10)
plt.tight_layout()

In [None]:
#Feature selection for date/time columns
# affects revenue -  all columns except day
# doesnt affects revenue - day

import matplotlib.pyplot as plt

# Scatter feature columns 32-36 of `x` against the log-revenue target `y`,
# one panel per column.
# NOTE(review): columns are picked by position, so the selection changes if
# the column order of `x` changes. The variable name `time` would shadow the
# standard-library module of the same name if that were imported.
time = x.iloc[:, 32:37]

plt.figure(figsize=(10, 30), facecolor='white')
# At most 12 panels are drawn on the 10x3 grid. A stray bare name (`location`)
# that raised NameError before the loop has been removed.
for plotnumber, column in enumerate(time.columns[:12], start=1):
    ax = plt.subplot(10, 3, plotnumber)
    ax.scatter(time[column], y)
    ax.set_xlabel(column, fontsize=10)
    ax.set_ylabel('revenue', fontsize=10)
plt.tight_layout()

In [None]:
# Drop the columns judged not to affect revenue from all three feature sets.
uninformative_cols = ['day', 'totals.bounces', 'fullVisitorId', 'visitId', 'visitStartTime']
x, val_x, test_x = (
    features.drop(columns=uninformative_cols)
    for features in (x, val_x, test_x)
)


In [None]:
# NOTE(review): `train_x` is not assigned in any visible cell, so this line
# raises NameError on a fresh kernel -- confirm whether this cell is a
# leftover that should be removed.
X= train_x
# NOTE(review): `Y` is not read by any later visible cell.
Y= train_y
# NOTE(review): this rebinds `val_x` from the validation features to the whole
# test frame `df1`, while `val_y` still holds the validation labels.
val_x = df1

In [None]:
X['date'].head()

In [None]:
# Same conversion of the train 'date' column to datetime.date objects that an
# earlier cell already applied; the second line displays the column.
df["date"] = pd.to_datetime(df["date"]).dt.date
df['date']

In [None]:
df1.info()

In [None]:
import lightgbm as lgb


def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict.

    Trains for up to 1000 boosting rounds on (train_X, train_y), evaluates
    RMSE on (val_X, val_y) and stops once it has not improved for 100 rounds.
    Predictions use the best iteration.

    Parameters
    ----------
    train_X, train_y : training features and target
    val_X, val_y : validation features and target, used for early stopping
    test_X : features to predict on

    Returns
    -------
    tuple
        (pred_test_y, model_lgbm, pred_val_y): predictions for test_X, the
        trained booster, and predictions for val_X.
    """
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 30,
        "min_child_samples" : 100,
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        # The LightGBM parameter is spelled "bagging_freq"; under the previous
        # key "bagging_frequency" bagging stayed switched off.
        "bagging_freq" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)

    # Early stopping and periodic logging go through callbacks: the
    # early_stopping_rounds / verbose_eval keyword arguments of lgb.train were
    # removed in LightGBM 4. Older releases name the logging callback
    # print_evaluation instead of log_evaluation.
    log_callback = getattr(lgb, "log_evaluation", None) or getattr(lgb, "print_evaluation")
    model_lgbm = lgb.train(
        params,
        lgtrain,
        num_boost_round=1000,
        valid_sets=[lgval],
        callbacks=[lgb.early_stopping(stopping_rounds=100), log_callback(period=100)],
    )

    pred_test_y = model_lgbm.predict(test_X, num_iteration=model_lgbm.best_iteration)
    pred_val_y = model_lgbm.predict(val_X, num_iteration=model_lgbm.best_iteration)
    return pred_test_y, model_lgbm, pred_val_y

# Training the model #
pred_test, model_lgbm, pred_val = run_lgb(x, y, val_x, val_y, test_x)

In [None]:
# Horizontal bar chart of up to the 50 most important LightGBM features.
fig, ax = plt.subplots(figsize=(12, 18))
lgb.plot_importance(model_lgbm, ax=ax, height=0.8, max_num_features=50)
ax.grid(False)
ax.set_title("LightGBM - Feature Importance", fontsize=15)
plt.show()

In [None]:
# Remove a further set of columns from the train and validation features.
# 'visitId' and 'visitStartTime' were already dropped from both frames in an
# earlier cell, so a strict drop raises KeyError; errors='ignore' skips
# columns that are no longer present and makes this cell safe to re-run.
low_value_cols = [
    'visitId', 'visitStartTime',
    'trafficSource.adwordsClickInfo.page', 'trafficSource.adwordsClickInfo.slot',
    'trafficSource.campaign', 'trafficSource.adContent',
    'device.deviceCategory', 'trafficSource.medium',
]

x = x.drop(columns=low_value_cols, errors='ignore')

val_x = val_x.drop(columns=low_value_cols, errors='ignore')

In [None]:

import xgboost as xg 
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV


# Candidate values for each XGBoost hyper-parameter in the randomized search.
n_estimators = [500,1000,2000]
max_depth= [7, 8, 9, 10]
learning_rate= [0.2, 0.3, 0.4]
colsample_bytree= [0.5, 0.6,0.8]
subsample= [0.5,0.7, 0.8]
# NOTE(review): scale_pos_weight is documented as a class-balance setting --
# confirm it has any effect with the squared-error regression objective.
scale_pos_weight=[1,1.5,2]
random_search = {'n_estimators': n_estimators, 'max_depth': max_depth, 'learning_rate': learning_rate,
                 'subsample': subsample, 'colsample_bytree':colsample_bytree, 'scale_pos_weight': scale_pos_weight}

# random_state fixes which parameter combinations are sampled, so a re-run
# searches the same candidates.
Random = RandomizedSearchCV(estimator = xg.XGBRegressor(objective='reg:squarederror'),param_distributions = random_search,
                            verbose=1, random_state=42)

In [None]:
Random.fit(x,y)


In [None]:
Random.best_params_

In [None]:
import xgboost as xg 
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score


# Final XGBoost regressor with hand-set hyper-parameters, fitted on the
# training features `x` and target `y`.
model = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=900,
    learning_rate=0.25,
    max_depth=8,
    subsample=0.5,
    colsample_bytree=0.5,
    scale_pos_weight=3,
    verbosity=1,
)
model.fit(x, y)


In [None]:
model.score(x,y)

In [None]:
def _print_scores(header, y_true, y_pred):
    """Print RMSE and R2 under `header` and return them as (rmse, r2)."""
    rmse_value = (np.sqrt(MSE(y_true, y_pred)))
    r2_value = r2_score(y_true, y_pred)
    print(header)
    print("--------------------------------------")
    print('RMSE is {}'.format(rmse_value))
    print('R2 score is {}'.format(r2_value))
    return rmse_value, r2_value


# Scores on the data the model was fitted on.
y_train_predict = model.predict(x)
rmse, r2 = _print_scores("The model performance for training set", y, y_train_predict)
print("\n")

# Scores on the held-out validation data.
y_test_predict = model.predict(val_x)
rmse, r2 = _print_scores("The model performance for testing set", val_y, y_test_predict)

In [None]:
# `model` is a plain XGBRegressor, which has no best_params_ attribute (that
# belongs to the search object); show the hyper-parameters it was built with.
model.get_params()

In [None]:
from sklearn.metrics import mean_squared_error 
from math import sqrt

def adj_r2(x,y,r2):
    """
                Method Name: adj_r2
                Description: This method calculates the adjusted r2 value,
                             1 - (1 - r2) * (n - 1) / (n - p - 1), where n and p
                             are the number of rows and columns of x.
                Parameters:  x  - 2-D feature data exposing .shape
                             y  - not used; kept so existing calls keep working
                             r2 - plain r2 score
                Output: adjusted r2 score value
                On Failure: Raise ValueError when n - p - 1 <= 0 (the value is
                            undefined); any other error propagates unchanged

                Written By: Chethan
                Version: 1.0

    """
    n, p = x.shape[0], x.shape[1]
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom <= 0:
        raise ValueError(
            "adjusted r2 is undefined: need more rows than columns + 1 "
            "(rows={}, columns={})".format(n, p)
        )
    return 1 - (1 - r2) * (n - 1) / degrees_of_freedom

# Training-set R2 from the fitted model and its adjusted value. The plain
# score gets its own name: storing it in `r2_score` replaced the sklearn
# function of that name, so the evaluation cell failed when re-run.
train_r2 = model.score(x, y)
adj_r2_score = adj_r2 (x, y, train_r2)


In [None]:
adj_r2_score

In [None]:
# Train frame without the date and target columns, then a dtype/null summary.
feature_selec = df.drop(columns=['date', 'totals.transactionRevenue'])
feature_selec.info()

In [None]:
model.get_booster().get_score(importance_type="gain")


In [None]:
# Bar chart of the booster's 'weight' importance score per feature.
feature_important = model.get_booster().get_score(importance_type='weight')
keys, values = list(feature_important.keys()), list(feature_important.values())

data = pd.DataFrame({"score": values}, index=keys)
data = data.sort_values(by="score", ascending=False)
data.plot(kind='barh', figsize=(12, 18))

In [None]:
# Horizontal bar chart of up to the 50 most important XGBoost features.
# xgboost is imported in this notebook as `xg`; the previous `xgb` alias was
# never defined and raised NameError.
fig, ax = plt.subplots(figsize=(12,18))
xg.plot_importance(model, max_num_features=50, height=0.8, ax=ax)
ax.grid(False)
plt.title("Feature Importance", fontsize=15)
plt.show()

In [None]:
from matplotlib import pyplot as plt

# Feature importances of the fitted model, sorted ascending.
# Labels come from the columns of `x`, the frame the model was fitted on; a
# DataFrame has no `feature_names` attribute, so `df.feature_names` raised
# AttributeError.
sorted_idx = model.feature_importances_.argsort()
feature_labels = np.asarray(x.columns)
plt.barh(feature_labels[sorted_idx], model.feature_importances_[sorted_idx])
plt.xlabel("Xgboost Feature Importance")
    

In [None]:
f