In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw training CSV for a first look at the unflattened data.
# fullVisitorId is an identifier, so it is read as text (the same choice
# load_df makes) instead of letting pandas infer a numeric dtype for it.
df_train = pd.read_csv(
    "/kaggle/input/ga-customer-revenue-prediction/train.csv",
    dtype={"fullVisitorId": "str"},
)
df_train.head()


In [None]:
#convert json columns

import os
import json
from pandas.io.json import json_normalize

def load_df(csv_path='/kaggle/input/ga-customer-revenue-prediction/train.csv', nrows=None):
    """Load a sessions CSV and flatten its JSON columns into regular columns.

    The columns 'device', 'geoNetwork', 'totals' and 'trafficSource' are parsed
    with ``json.loads`` while reading, then each one is expanded into one column
    per JSON key, named ``"<column>.<key>"`` (nested keys are joined with '.').
    The original JSON column is dropped.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. Its shape is also printed.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},  # Important: keep the ID as text
                     nrows=nrows)

    for column in JSON_COLUMNS:
        # pd.json_normalize is the public entry point; the older
        # pandas.io.json.json_normalize import path is deprecated.
        # A plain list of dicts yields a fresh 0..n-1 index that lines up
        # with the index produced by read_csv for the merge below.
        column_as_df = pd.json_normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [None]:


# Load the training CSV with its JSON columns flattened.
train_df = load_df(csv_path='/kaggle/input/ga-customer-revenue-prediction/train.csv')

In [None]:
# Summary of the flattened frame's columns and dtypes.
train_df.info()

In [None]:
# Preview the first five rows of the flattened frame.
train_df.head(5)

In [None]:
#better description for dataset

from scipy import stats


def DataDesc(df):
    """Build a one-row-per-column summary of ``df``.

    Prints the frame's shape and returns a DataFrame with the columns:
    'Name', 'dtypes', 'Missing' (null count), 'Uniques' (distinct non-null
    values), 'First Value' / 'Second Value' / 'Third Value' (values of the
    first three rows) and 'Entropy' (base-2 entropy of the column's value
    frequencies, rounded to 2 decimals).

    The sample rows are taken by position, so the frame does not need a
    0..n-1 index; when it has fewer than three rows the missing sample
    columns are filled with NaN.
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Positional lookup (iloc) instead of label lookup (loc): labels 0, 1, 2
    # are not guaranteed to exist after filtering or re-indexing.
    for position, label in enumerate(['First Value', 'Second Value', 'Third Value']):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan

    # value_counts(normalize=True) gives the relative frequency of each
    # non-null value, which stats.entropy turns into an entropy in bits.
    summary['Entropy'] = [
        round(stats.entropy(column.value_counts(normalize=True), base=2), 2)
        for _, column in df.items()
    ]

    return summary

In [None]:
# Per-column summary of the freshly loaded frame.
DataDesc(df=train_df)


In [None]:
# Number of missing values in each column.
train_df.isna().sum()

In [None]:
#imputation of null values and converting the columns values to int

def fill_na(df):
    """Impute missing values and fix dtypes of selected columns, in place.

    * 'totals.pageviews' -> missing becomes 1, cast to int
    * 'totals.newVisits', 'totals.bounces' -> missing becomes 0, cast to int
    * 'totals.transactionRevenue' -> missing becomes 0.0, cast to float
    * 'trafficSource.isTrueDirect' -> missing becomes False
    * 'trafficSource.adwordsClickInfo.isVideoAd' -> missing becomes True
    * 'geoNetwork.city' -> "(not set)" and missing both become the string "NaN"

    The passed frame is modified and the same object is returned.
    """
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on df[...]: the in-place form works on an intermediate object and is not
    # guaranteed to update the frame. pd.to_numeric parses the numeric strings
    # before the missing entries are filled.
    df['totals.pageviews'] = pd.to_numeric(df['totals.pageviews']).fillna(1).astype(int)
    df['totals.newVisits'] = pd.to_numeric(df['totals.newVisits']).fillna(0).astype(int)
    df['totals.bounces'] = pd.to_numeric(df['totals.bounces']).fillna(0).astype(int)
    df['totals.transactionRevenue'] = (
        pd.to_numeric(df['totals.transactionRevenue']).fillna(0.0).astype(float)
    )

    df['trafficSource.isTrueDirect'] = df['trafficSource.isTrueDirect'].fillna(False)
    # filling boolean with True
    df['trafficSource.adwordsClickInfo.isVideoAd'] = (
        df['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)
    )

    # Treat the "(not set)" placeholder like a missing city. The mask is built
    # from the frame passed in (not a global), and the result is assigned back
    # so the replacement actually lands in df.
    city = df['geoNetwork.city']
    df['geoNetwork.city'] = city.mask(city == "(not set)").fillna("NaN")

    return df

# fill_na works on the frame it receives and returns it, so `df` and
# `train_df` refer to the same object from here on.
df = fill_na(df=train_df)

In [None]:
# Per-column summary after imputation.
DataDesc(df=df)

In [None]:
# These columns have a very large amount of null values, so it is better to drop them.
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError for columns that are already gone.
sparse_cols = [
    'trafficSource.adContent',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.slot',
    'trafficSource.adwordsClickInfo.page',
    'trafficSource.referralPath',
    'trafficSource.keyword',
]
df.drop(columns=sparse_cols, inplace=True, errors='ignore')


In [None]:
# Remove every column that holds a single distinct value
# (a missing value counts as a value in this check).
constant_cols = [name for name in df.columns if len(df[name].unique()) == 1]
df.drop(columns=constant_cols, inplace=True)

In [None]:
# Per-column summary after dropping sparse and constant columns.
DataDesc(df=df)

In [None]:
# Keep the identifier columns as text instead of converting them to float.
# float64 represents integers exactly only up to 2**53, so long numeric IDs
# get rounded and different visitors can end up with the same value.
# NOTE(review): if a numeric key is needed for modelling, derive one with
# pd.factorize rather than casting the ID itself.
df['fullVisitorId'] = df['fullVisitorId'].astype(str)

df['sessionId'] = df['sessionId'].astype(str)

In [None]:
# This function is to extract date features

from datetime import datetime


def date_process(df):
    """Add calendar features derived from 'date' and 'visitStartTime', in place.

    'date' (values in YYYYMMDD form) is converted to pandas datetime, and the
    columns 'weekday', 'day', 'month' and 'year' are derived from it.
    'visitHour' is the hour of 'visitStartTime', which is interpreted as a
    POSIX timestamp in seconds and evaluated in UTC so the result does not
    depend on the timezone of the machine running the notebook.

    The passed frame is modified and the same object is returned.
    """
    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")  # set the column as pandas datetime
    df["weekday"] = df["date"].dt.weekday  # day of week
    df["day"] = df["date"].dt.day  # day of month
    df["month"] = df["date"].dt.month  # month number
    df["year"] = df["date"].dt.year  # calendar year
    # Vectorised replacement for a per-row datetime.fromtimestamp(...) call,
    # which used the local timezone and a str -> int round trip.
    df["visitHour"] = pd.to_datetime(df["visitStartTime"], unit="s").dt.hour.astype(int)

    return df
# Add the date-derived feature columns to the working frame.
df = date_process(df=df)


In [None]:
# Preview the first five rows with the new date features.
df.head(5)

In [None]:
#visualising the browser column vs the revenue column

import matplotlib.pyplot as plt
import seaborn as sns

# Sum the revenue per browser; .count() would only give the number of sessions in each group.
ax = df.groupby('device.browser')['totals.transactionRevenue'].sum().plot(kind='barh', figsize=(9,9))
ax.set(title='Total transaction revenue by browser', xlabel='Total transaction revenue', ylabel='Browser');


In [None]:
#visualising Operating system vs Revenue
# Sum the revenue per operating system; .count() would only give the number of sessions in each group.
ax = df.groupby('device.operatingSystem')['totals.transactionRevenue'].sum().plot(kind='barh', figsize=(5,5))
ax.set(title='Total transaction revenue by operating system', xlabel='Total transaction revenue', ylabel='Operating system');


In [None]:
#visualising isMobile vs Revenue
# Sum the revenue per group; .count() would only give the number of sessions in each group.
ax = df.groupby('device.isMobile')['totals.transactionRevenue'].sum().plot(kind='barh', figsize=(3,3))
ax.set(title='Total transaction revenue by isMobile', xlabel='Total transaction revenue', ylabel='isMobile');


In [None]:
#visualising continent vs Revenue
# Sum the revenue per continent; .count() would only give the number of sessions in each group.
ax = df.groupby('geoNetwork.continent')['totals.transactionRevenue'].sum().plot(kind='barh', figsize=(3,3))
ax.set(title='Total transaction revenue by continent', xlabel='Total transaction revenue', ylabel='Continent');


In [None]:

#visualising subcontinent vs Revenue
# Sum the revenue per subcontinent; .count() would only give the number of sessions in each group.
ax = df.groupby('geoNetwork.subContinent')['totals.transactionRevenue'].sum().plot(kind='barh', figsize=(5,5))
ax.set(title='Total transaction revenue by subcontinent', xlabel='Total transaction revenue', ylabel='Subcontinent');


In [None]:
#visualising country vs Revenue (the grouping key is geoNetwork.country, not the city)
# Sum the revenue per country; .count() would only give the number of sessions in each group.
ax = df.groupby(['geoNetwork.country'])['totals.transactionRevenue'].sum().plot()
ax.set(title='Total transaction revenue by country', xlabel='Country', ylabel='Total transaction revenue');


In [None]:
#visualising pageviews vs Revenue
# Sum the revenue per pageview count; .count() would only give the number of sessions in each group.
ax = df.groupby(['totals.pageviews'])['totals.transactionRevenue'].sum().plot()
ax.set(title='Total transaction revenue by pageviews', xlabel='Pageviews in session', ylabel='Total transaction revenue');



In [None]:
#visualising date vs Revenue
# Sum the revenue per date; .count() would only give the number of sessions on each date.
ax = df.groupby(['date'])['totals.transactionRevenue'].sum().plot(figsize = (7,7))
ax.set(title='Total transaction revenue by date', xlabel='Date', ylabel='Total transaction revenue');


In [None]:
#visualising channel grouping vs Revenue
# Sum the revenue per channel; .count() would only give the number of sessions in each group.
ax = df.groupby(['channelGrouping'])['totals.transactionRevenue'].sum().plot(kind='barh')
ax.set(title='Total transaction revenue by channel grouping', xlabel='Total transaction revenue', ylabel='Channel grouping');


In [None]:
#visualising day of month vs Revenue
# Sum the revenue per day of month; .count() would only give the number of sessions in each group.
ax = df.groupby(['day'])['totals.transactionRevenue'].sum().plot()
ax.set(title='Total transaction revenue by day of month', xlabel='Day of month', ylabel='Total transaction revenue');


In [None]:

#visualising weekday vs Revenue
# Sum the revenue per weekday; .count() would only give the number of sessions in each group.
ax = df.groupby(['weekday'])['totals.transactionRevenue'].sum().plot()
ax.set(title='Total transaction revenue by weekday', xlabel='Weekday (0 = Monday)', ylabel='Total transaction revenue');


In [None]:
#visualising month vs Revenue
# Sum the revenue per month; .count() would only give the number of sessions in each group.
ax = df.groupby(['month'])['totals.transactionRevenue'].sum().plot()
ax.set(title='Total transaction revenue by month', xlabel='Month', ylabel='Total transaction revenue');


In [None]:
#visualising visit hour vs Revenue
# Sum the revenue per visit hour; .count() would only give the number of sessions in each group.
ax = df.groupby(['visitHour'])['totals.transactionRevenue'].sum().plot()
ax.set(title='Total transaction revenue by visit hour', xlabel='Visit hour', ylabel='Total transaction revenue');


In [None]:
# Distribution of the revenue indicator (1 = session has revenue > 0, 0 = none).
# Outlier handling is deferred until after encoding.
# histplot(..., stat="density", kde=True) replaces the deprecated distplot, and
# the indicator is passed as a 0/1 Series instead of a list wrapping a boolean Series.
sns.set_color_codes()
has_revenue = (df['totals.transactionRevenue'] > 0).astype(int)
ax = sns.histplot(has_revenue, color="y", bins=2, stat="density", kde=True)

In [None]:
from sklearn import model_selection, preprocessing, metrics
import lightgbm as lgb


# Impute 0 for missing target values.
# The filled column is assigned back: fillna(inplace=True) on df[...] acts on
# an intermediate object and is not guaranteed to update the frame.
df["totals.transactionRevenue"] = df["totals.transactionRevenue"].fillna(0)
train_y = df["totals.transactionRevenue"].values
train_id = df["fullVisitorId"].values


In [None]:
# List the remaining column names before choosing which ones to encode.
df.columns

In [None]:
# Label-encode the categorical variables: every value is converted to its
# string form and replaced by the integer code LabelEncoder assigns to it.
cat_cols = [
    'channelGrouping',
    'device.browser',
    'device.operatingSystem',
    'device.isMobile',
    'device.deviceCategory',
    'geoNetwork.continent',
    'geoNetwork.subContinent',
    'geoNetwork.country',
    'geoNetwork.region',
    'geoNetwork.metro',
    'geoNetwork.city',
    'geoNetwork.networkDomain',
    'trafficSource.campaign',
    'trafficSource.source',
    'trafficSource.medium',
    'trafficSource.isTrueDirect',
    'trafficSource.adwordsClickInfo.isVideoAd',
    'trafficSource.campaignCode',
]

for col in cat_cols:
    print(col)
    values_as_text = list(df[col].values.astype('str'))
    lbl = preprocessing.LabelEncoder()
    # fit_transform learns the classes and encodes in a single call.
    df[col] = lbl.fit_transform(values_as_text)



In [None]:
# Cast the numeric feature columns to float, one column at a time.
num_cols = [
    "totals.hits",
    "totals.pageviews",
    "visitNumber",
    "visitStartTime",
    'totals.bounces',
    'totals.newVisits',
]
for col, as_float in df[num_cols].astype(float).items():
    df[col] = as_float

In [None]:
# Check the dtypes of the columns after encoding and the numeric casts.
df.info()