In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



**The aim of this project is to define an easy framework for regression problems**

**Define a measurable objective:**

* Predict the transaction value (a continuous value)
* Metric: Root Mean Squared Error (RMSE), computed as mean_squared_error(y_true, y_pred, squared=False); squaring gives more weight to large errors
* Objective: RMSE < 2

**We start by reading the data. Some columns are in JSON format, so we reuse the loading code that almost all kernels use. For this work we only load 10% of the initial data so that we can experiment with many machine learning algorithms.**

In [None]:
# Necessary librarys
import random # random is to generate random values

# library of datetime
from datetime import datetime

import matplotlib.pyplot as plt # to graphics plot
import seaborn as sns # a good library to graphic plots

import json # to convert json in df
from pandas import json_normalize # to normalize the json file

In [None]:
#Code to transform the json format columns in table
def json_read(df, p=0.1, seed=0):
    """Read a random sample of a CSV file and flatten its JSON columns.

    Parameters
    ----------
    df : str
        Path of the CSV file to read (the parameter name is kept for
        backward compatibility; it is a path, not a DataFrame).
    p : float, default 0.1
        Probability of keeping each data row, i.e. the approximate fraction
        of the file that is loaded. Must satisfy 0 < p <= 1.
    seed : int, default 0
        Seed of the private random generator used to pick the rows, so the
        same rows are sampled on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, where each of the JSON columns ('device',
        'geoNetwork', 'totals', 'trafficSource') is replaced by one column
        per JSON key, named "<column>.<key>".

    Raises
    ------
    ValueError
        If ``p`` is not in the interval (0, 1].
    """
    if not 0 < p <= 1:
        raise ValueError(f"p must be in the interval (0, 1], got {p!r}")

    columns = ['device', 'geoNetwork', 'totals', 'trafficSource']  # columns that hold JSON documents

    csv_path = df

    # A private generator yields the same sequence as random.seed(seed) followed by
    # random.random(), but leaves the global random state of the notebook untouched.
    rng = random.Random(seed)

    frame = pd.read_csv(
        csv_path,
        converters={column: json.loads for column in columns},  # parse the JSON columns into dicts
        dtype={'fullVisitorId': 'str'},  # keep the id as text
        # row 0 is the header and is always kept; every other row is skipped with probability 1 - p
        skiprows=lambda i: i > 0 and rng.random() > p,
    )

    for column in columns:
        # Expand the dicts of this column into a table with one column per key
        column_as_df = json_normalize(frame[column])
        # Prefix every new column with the name of the JSON column it came from
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        # Swap the original JSON column for its flattened version (rows aligned on the index)
        frame = frame.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

    print(f"Loaded {os.path.basename(csv_path)}. Shape: {frame.shape}")
    return frame


# This function is to extract date features
def date_process(df):
    """Add calendar features derived from the 'date' and 'visitStartTime' columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column in YYYYMMDD form and a 'visitStartTime'
        column holding POSIX timestamps in seconds.

    Returns
    -------
    pandas.DataFrame
        The same frame with 'date' converted to datetime and the new columns
        '_weekday', '_day', '_month', '_year' and '_visitHour'.
    """
    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")  # YYYYMMDD -> pandas datetime
    date_parts = df["date"].dt
    df["_weekday"] = date_parts.weekday  # day of the week (Monday=0)
    df["_day"] = date_parts.day  # day of the month
    df["_month"] = date_parts.month  # month number
    df["_year"] = date_parts.year  # year

    # The hour is computed in UTC so the feature does not depend on the timezone of
    # the machine running the notebook (datetime.fromtimestamp uses local time).
    visit_start = pd.to_datetime(df["visitStartTime"], unit="s", utc=True)
    df["_visitHour"] = visit_start.dt.hour.astype(int)

    return df

def Read_data():
    """Load the competition training file and enrich it with date features.

    Returns the frame produced by json_read for the Kaggle train.csv file
    after it has been passed through date_process.
    """
    train_path = "/kaggle/input/ga-customer-revenue-prediction/train.csv"
    return date_process(json_read(train_path))

In [None]:
# Load the sampled training data (JSON columns flattened, date features added)
data = Read_data()
# Preview the first rows of the loaded frame
data.head()


**EDA (Exploratory Data Analysis)**

In [None]:
def details_missing_columns(df):
    """Print, for every column of ``df``, the count and percentage of missing values.

    The table has one row per column and two columns, 'Total' and 'Percent',
    ordered from the most to the least affected column.
    """
    null_mask = df.isna()
    null_counts = null_mask.sum()

    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)

    # Side-by-side table: absolute count next to the percentage
    state_missing_value = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    print("\nTotal missing value columns: ")
    print(state_missing_value)

def missing_values_columns(df):
    """Print how many columns of ``df`` are more than 50% missing, and their names."""
    hight_missing_value = []
    for name, values in df.items():
        null_flags = values.isna()
        missing_percent = null_flags.sum() / null_flags.count() * 100
        if missing_percent > 50:  # strictly more than half of the rows are missing
            hight_missing_value.append(name)

    print('\nNumber of high missing columns: ', len(hight_missing_value))
    print('high missing columns rate: ', hight_missing_value)

def missing_values_rows(df):
    '''
    Print how many rows of ``df`` have at least 50% of their values missing,
    followed by the index labels of those rows.

    The share of missing values is computed for all rows at once instead of
    looking rows up one by one, which is much faster on large frames and
    also works when the index contains duplicate labels.
    '''
    # Fraction of missing cells per row
    missing_share = df.isna().sum(axis=1) / df.shape[1]
    retain_row = df.index[(missing_share >= 0.5).to_numpy()].tolist()

    print('\nThe number of hight missing rows: ', len(retain_row))
    print('high missing rows rate: ', retain_row)

def unique_columns(df):
    """Print the number and names of the columns of ``df`` holding exactly one distinct value.

    Such constant columns carry no information for a model.
    """
    discovering_consts = []
    for name, values in df.items():
        if values.nunique() == 1:
            discovering_consts.append(name)

    print("\nNumber of columns with just one value: ", len(discovering_consts), "columns")
    print("Name of constant columns: \n", discovering_consts)


def Shape_size_columns(df):
    """Print the number and names of the columns with one distinct value per row.

    A column whose number of distinct values equals the number of rows
    behaves like an identifier and gives no information to a model.
    """
    n_rows = df.shape[0]
    discovering_shape = [name for name, values in df.items() if values.nunique() == n_rows]

    print("\nNumber of columns with shape size: ", len(discovering_shape), "columns")
    # The label used to say "constant columns", which described the wrong kind of column
    print("Name of shape size columns: \n", discovering_shape)

In [None]:
def Form_analysis(data):
    """Print a structural overview of ``data``.

    Reports, in this order: the shape, the number of columns per dtype, the
    constant columns, the id-like columns, the columns that are mostly
    missing, the rows that are mostly missing and the per-column
    missing-value table. The reports run on a copy of ``data``.
    """
    df = data.copy()
    n_rows, n_cols = df.shape

    print(f'Number of ligne : {n_rows} / number of columns : {n_cols}')

    print('\n types of variables :\n')
    print(df.dtypes.value_counts())

    # Each helper prints its own report; the order below fixes the order of the output
    reports = (
        unique_columns,
        Shape_size_columns,
        missing_values_columns,
        missing_values_rows,
        details_missing_columns,
    )
    for report in reports:
        report(df)
    

In [None]:
# Print the structural overview (shape, dtypes, constant / id-like columns, missing values) of the loaded data
Form_analysis(data)

Based on this first analysis, we can delete the columns that have a high missing rate or that add no information.

In [None]:
    # 1. Columns listed as holding a single value: they carry no information
    to_drop_unique_columns = ['socialEngagementType', 'device.browserVersion', 'device.browserSize', 'device.operatingSystemVersion',
     'device.mobileDeviceBranding', 'device.mobileDeviceModel', 'device.mobileInputSelector', 'device.mobileDeviceInfo',
     'device.mobileDeviceMarketingName', 'device.flashVersion', 'device.language', 'device.screenColors', 'device.screenResolution',
     'geoNetwork.cityId', 'geoNetwork.latitude', 'geoNetwork.longitude', 'geoNetwork.networkLocation', 'totals.visits', 'totals.bounces',
     'totals.newVisits', 'trafficSource.adwordsClickInfo.criteriaParameters', 'trafficSource.isTrueDirect',
     'trafficSource.adwordsClickInfo.adNetworkType', 'trafficSource.adwordsClickInfo.isVideoAd']

    # Only drop the columns that are still present, so that running this cell a second
    # time does not fail with a KeyError on columns that are already gone.
    present_unique_columns = [col for col in to_drop_unique_columns if col in data.columns]
    data.drop(present_unique_columns, axis=1, inplace=True)
    print("Total unique features dropped: ", len(present_unique_columns))

    # 2. Other columns removed by hand
    to_drop_columns = ['trafficSource.keyword', 'trafficSource.referralPath', 'trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot', 'trafficSource.adwordsClickInfo.gclId', 'trafficSource.adContent', 'fullVisitorId',
    'sessionId', 'visitId', 'visitStartTime', 'date', 'trafficSource.campaign', 'geoNetwork.metro', 'geoNetwork.region',
    'geoNetwork.networkDomain', 'geoNetwork.city', 'trafficSource.medium'  ]

    present_columns = [col for col in to_drop_columns if col in data.columns]
    data.drop(present_columns, axis=1, inplace=True)
    print("Total features dropped: ", len(present_columns))

    print('\n')

    print("Shape after dropping: ", data.shape)


Let's analyze our data one more time:

In [None]:
# Re-run the structural overview on the current state of `data`
Form_analysis(data)

We keep the 'totals.transactionRevenue' column because it is our target. It is normal for this column to have more than 98% missing values, because the conversion rate in e-commerce is very low. We will deal with its missing values later.

After this analysis, we look at the unique values of each column to better understand the data:

In [None]:
def knowningData(df, data_type=object, limit=10):
    """Print a short summary of every column of ``df`` that has the given dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    data_type : type or str, default object
        Dtype passed to ``select_dtypes(include=...)`` to choose the columns.
    limit : int, default 10
        Maximum number of distinct values shown per column.

    For each selected column the summary shows the first ``limit`` distinct
    values, the percentage of null values (printed under the label
    "Total nulls") and the number of distinct non-null values.
    """
    selected = df.select_dtypes(include=data_type)
    for column in selected.columns:
        values = df[column]
        null_percent = round(values.isnull().sum() / len(values) * 100, 2)
        # Count the distinct values of this column only; asking the whole frame for
        # its unique counts inside the loop repeated that work for every column.
        distinct_count = values.nunique()

        print("##############################################")
        print("Name of column ", column, ': \n', "Uniques: ", values.unique()[:limit], "\n",
              " | ## Total nulls: ", null_percent,
              " | ## Total unique values: ", distinct_count)
        print("#############################################")

In [None]:
# Print the per-column summary for each dtype family in turn
dtype_sections = (
    ("\n Object --------", object),
    ("\n int --------", int),
    ("\n float --------", float),
    ("\n bool --------", bool),
)
for banner, dtype in dtype_sections:
    print(banner)
    knowningData(data, data_type=dtype)

Some numerical values, such as page views and hits, should be integers. Missing transaction values will be set to 0 (NaN means no conversion, i.e. no transaction).

Some object features contain placeholder values such as '(not set)'.

Let's clean and replace some values to get a better view of our data:

In [None]:
def Filling_Replacing_Values(df):
    """Clean the numeric columns and blank out the placeholder categories.

    The frame is modified in place and also returned.

    - 'totals.pageviews': missing values are replaced by the most frequent
      value, then the column is cast to int.
    - 'totals.hits': cast to int.
    - 'totals.transactionRevenue': missing values become 0.0, then the
      column is cast to float.
    - 'channelGrouping' == '(Other)', 'device.operatingSystem' == '(not set)'
      and 'geoNetwork.continent' == '(not set)' are replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same frame, after the replacements.
    """
    # Numeric features.
    # The filled column is assigned back: calling fillna(..., inplace=True) on
    # df[col] is a chained write, which pandas warns about and which has no effect
    # on df under Copy-on-Write.
    most_frequent_pageviews = df['totals.pageviews'].value_counts().index[0]
    df['totals.pageviews'] = df['totals.pageviews'].fillna(most_frequent_pageviews).astype(int)
    df["totals.hits"] = df["totals.hits"].astype(int)
    df["totals.transactionRevenue"] = df["totals.transactionRevenue"].fillna(0.0).astype(float)

    # Object features: turn the "unknown" placeholders into real missing values
    placeholders = {
        'channelGrouping': '(Other)',
        'device.operatingSystem': '(not set)',
        'geoNetwork.continent': '(not set)',
    }
    for column, placeholder in placeholders.items():
        df.loc[df[column] == placeholder, column] = np.nan

    return df

In [None]:
    # Clean the numeric columns and placeholder categories, then print the overview of the result
    data = Filling_Replacing_Values(data)
    Form_analysis(data)

In [None]:
# To clean the data we drop every row that still contains at least one NaN
# (done in place, so `data` itself loses those rows).
data.dropna(axis = 0, inplace=True)
print("Shape after dropping: ", data.shape)


In [None]:
# Summary statistics of the target column
target = data["totals.transactionRevenue"]
target_statistics = (
    ("Target Min Value: ", target.min),
    ("Target Mean Value: ", target.mean),
    ("Target Median Value: ", target.median),
    ("Target Max Value: ", target.max),
)
for label, statistic in target_statistics:
    print(label, statistic())

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target from the features
target_column = 'totals.transactionRevenue'
y = data[target_column]
X = data.drop(columns=target_column)

# Hold out 20% of the rows as the test set (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)


Now that we have created our train and test sets (80% train, 20% test), we create our evaluation function:
- We mainly use the RMSE, where y hat is the natural log of the predicted revenue for a customer and y is the natural log of the actual summed revenue value plus one.
- RMSE is an appropriate metric here because fewer than 2% of the customers have a transaction, so we want to amplify the errors in order not to miss transactions.

In [None]:
def evaluation(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the log1p-transformed target and report its RMSE.

    The function prints the RMSE on the train and test sets, then draws the
    learning curve (4-fold cross-validation on the train set, ten train
    sizes from 10% to 100%).

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, X_test : array-like
        Feature matrices of the train and test sets.
    y_train, y_test : array-like
        Raw (untransformed) targets; ``np.log1p`` is applied here.

    Returns
    -------
    None
    """
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import learning_curve

    print('\n ----- RMSE Evaluation : \n')

    # All scores are computed on the log1p scale of the target
    y_test = np.log1p(y_test)
    y_train = np.log1p(y_train)

    model.fit(X_train, y_train)

    y_predict_test = model.predict(X_test)
    y_predict_train = model.predict(X_train)

    # RMSE is taken as the square root of the MSE: the `squared=False` argument of
    # mean_squared_error is deprecated in recent scikit-learn releases.
    train_rmse = np.sqrt(mean_squared_error(y_train, y_predict_train))
    print('train score =', train_rmse)

    test_rmse = np.sqrt(mean_squared_error(y_test, y_predict_test))
    print('test score =', test_rmse)

    # Learning curve visualisation
    train_sizes, train_scores, val_scores = learning_curve(
        model, X_train, y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=4,
        scoring='neg_root_mean_squared_error',
    )

    print(train_sizes)
    # The scorer returns the negated RMSE; flip the sign so the plot shows the RMSE itself
    plt.plot(train_sizes, -train_scores.mean(axis=1), label='train')
    plt.plot(train_sizes, -val_scores.mean(axis=1), label='validation')
    plt.xlabel('train_sizes')
    plt.ylabel('RMSE')
    plt.legend()
    plt.show()

    print('-----End Evaluation-----\n')


Let's run our first model to better understand the features (we use OrdinalEncoder and StandardScaler to prepare the data):

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder


# Encoding
# Keep references to the original train / test frames: their index and column labels
# are lost once X_train / X_test are replaced by the encoder output below.
# NOTE(review): X_train / X_test are rebound in this cell, so running it a second
# time would store the already-encoded data in X_copy / X_copy_test.
X_copy = X_train
X_copy_test = X_test
encoder = OrdinalEncoder()
# NOTE(review): the encoder is fitted on the full X (train + test rows), so it sees
# the test-set categories; confirm this is intended.
encoder.fit(X)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)


# Scaling step; this same `preprocessor` object is placed inside the DTR pipeline below
preprocessor = make_pipeline(StandardScaler())
DTR = make_pipeline(preprocessor, DecisionTreeRegressor(random_state = 0))

# Fit the decision tree and print its train / test scores and learning curve
evaluation(DTR, X_train, X_test, y_train, y_test)



We can see that the model overfits, but this step shows us the most interesting features to use in our final training:

In [None]:
# Feature selection: read the importances learned by the fitted decision tree
IFe = DTR.named_steps["decisiontreeregressor"].feature_importances_

# One row per feature, labelled with the original column names
importance_table = pd.DataFrame(IFe, index=X_copy.columns)
print(importance_table)
importance_table.plot.bar()
plt.show()

We can observe that there are some unimportant features that we can drop before training the model:

In [None]:
    # Select the features whose importance is below 0.01
    # NOTE(review): IFe is positional, so this relies on X.columns being in the same
    # order as the columns the tree was trained on — confirm.
    selection = X.columns[IFe<0.01]
    print(selection)


Let's drop these features and begin the training:

In [None]:
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Low-importance features removed before comparing the models
to_drop = ['device.browser', 'device.deviceCategory', 'geoNetwork.continent',
       'geoNetwork.subContinent', '_year']

# Put the encoded arrays back into labelled frames, then remove the unwanted columns
X_train = pd.DataFrame(data=X_train, index=X_copy.index, columns=X_copy.columns)
X_train.drop(columns=to_drop, inplace=True)
X_test = pd.DataFrame(data=X_test, index=X_copy_test.index, columns=X_copy_test.columns)
X_test.drop(columns=to_drop, inplace=True)

# Candidate models, each preceded by the shared scaling step
Ridge_algo = make_pipeline(preprocessor, Ridge(random_state=0, alpha=.5))
SVR = make_pipeline(preprocessor, LinearSVR(random_state=0))
NKR = make_pipeline(preprocessor, KNeighborsRegressor(n_jobs=-1))
BOOST = make_pipeline(preprocessor, GradientBoostingRegressor(random_state=0))

dict_of_models = dict(Ridge=Ridge_algo, SVR=SVR, NKR=NKR, DTR=DTR, BOOST=BOOST)

# Evaluate every candidate with the same procedure
for name, model in dict_of_models.items():
    print('Evaluation of -----', name)
    evaluation(model, X_train, X_test, y_train, y_test)


GradientBoostingRegressor seems to be the most promising algorithm, so let's focus on it:

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

# Scorer used by the grid search. greater_is_better=False tells scikit-learn that the
# metric is a loss: the scorer returns the negated MSE, so the "best" configuration
# is the one with the LOWEST error. With greater_is_better=True the search kept the
# configuration with the highest MSE.
mse = make_scorer(mean_squared_error, greater_is_better=False)

def FocusOn_BOOST(BOOST, X_train, X_test, y_train, y_test):
    """Tune the gradient boosting pipeline with a grid search and evaluate the winner.

    Parameters
    ----------
    BOOST : sklearn Pipeline
        Pipeline whose last step is named 'gradientboostingregressor'.
    X_train, X_test : array-like
        Feature matrices of the train and test sets.
    y_train, y_test : array-like
        Raw (untransformed) targets.

    The search runs a 4-fold cross-validation on the train set; the best
    parameters and the corresponding mean cross-validated MSE are printed,
    then the best estimator is passed to ``evaluation``.
    """
    # NOTE(review): the loss names 'ls' and 'lad' are the older scikit-learn spellings;
    # confirm they are accepted by the installed version.
    hyper_params = {'gradientboostingregressor__random_state': [0],
                    'gradientboostingregressor__loss': ['ls', 'lad', 'huber', 'quantile'],
                    'gradientboostingregressor__learning_rate': [0.05, 0.1, 0.2],
                    'gradientboostingregressor__n_estimators': [100, 200]}

    grid = GridSearchCV(BOOST, hyper_params, scoring=mse, cv=4, n_jobs = -1)

    # Tune on the log1p target, i.e. on the same scale evaluation() scores on
    grid.fit(X_train, np.log1p(y_train))

    # Best parameters and their mean cross-validated MSE
    # (best_score_ is the negated MSE, so the sign is flipped back for display)
    print(grid.best_params_)
    print(-grid.best_score_)

    model = grid.best_estimator_

    evaluation(model, X_train, X_test, y_train, y_test)

# Run the search on the reduced feature set
FocusOn_BOOST(BOOST, X_train, X_test, y_train, y_test)
    


**Conclusion:**

We get a small improvement on our train set and the same results on the test set.
Our objective is met, with an RMSE < 2.

The learning curve shows us that we can improve the results with more data.