### Imports

In [1]:
import json

# Parse config.json into `config`; later cells look up the CSV paths in it.
with open('config.json', 'r') as config_file:
    raw_config = config_file.read()
config = json.loads(raw_config)

In [2]:
import pandas as pd
import numpy as np

### Optimal Schema

In [3]:
# Column name -> dtype mapping passed to pd.read_csv for the cleaned training set.
# The capitalized names ('UInt32', 'UInt64', 'Float32') are pandas' nullable
# extension dtypes, not the plain numpy ones.
dtype_df_train_cleaned_treated = {
    "userId": "string",
    "history": "string",
    "numberOfClicksHistory": "UInt32",
    "timeOnPageHistory": "UInt64",
    "scrollPercentageHistory": "Float32",
    "userType": "category",
}

### Reading CSV with filtered outliers

In [None]:
# Load only the columns named in the schema. The CSV also carries a leading
# unnamed column (pandas labels it "Unnamed: 0"); selecting columns with
# `usecols` skips it at parse time, so no throwaway column is materialized
# and no in-place drop is needed afterwards.
df_train_cleaned = pd.read_csv(
    config["DF_TRAIN_CLEANED_CSV"],
    usecols=list(dtype_df_train_cleaned_treated),
    dtype=dtype_df_train_cleaned_treated,
)
df_train_cleaned.head(2)

Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,userType
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.580002,Non-Logged
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,0,115232,73.360001,Non-Logged


In [5]:
# Print the column dtypes and the frame's memory footprint after loading.
df_train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6349896 entries, 0 to 6349895
Data columns (total 6 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   userId                   string  
 1   history                  string  
 2   numberOfClicksHistory    UInt32  
 3   timeOnPageHistory        UInt64  
 4   scrollPercentageHistory  Float32 
 5   userType                 category
dtypes: Float32(1), UInt32(1), UInt64(1), category(1), string(2)
memory usage: 218.0 MB


In [6]:
# Summary statistics of the numeric columns, taken before the normalization cells below run.
df_train_cleaned.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory
count,6349896.0,6349896.0,6349896.0
mean,4.733215,60602.219329,41.4767
std,7.590275,48749.808355,21.818096
min,0.0,5000.0,0.02
25%,0.0,21235.0,24.620001
50%,1.0,47627.0,40.349998
75%,6.0,85138.0,56.27
max,30.0,234101.0,100.0


### Functions to normalize and linearize data

In [7]:
def normalize_min_max(x, xmin, xmax):
    """Min-max scale `x` relative to the interval [xmin, xmax].

    Args:
        x: Value to rescale.
        xmin: Value that should map to 0.
        xmax: Value that should map to 1.

    Returns:
        (x - xmin) / (xmax - xmin). No clipping is applied, so inputs outside
        the interval produce results outside [0, 1]. `xmax == xmin` is not
        handled specially.
    """
    offset = x - xmin
    span = xmax - xmin
    return offset / span

def treat_number_of_clicks_history(x):
    """Compress a click count with the transform log(1 + x).

    Uses `np.log1p`, which evaluates log(1 + x) directly: it stays accurate
    for very small `x` and never forms `x + 1` in the input's own integer
    type, so fixed-width unsigned inputs cannot wrap around before the log.

    Args:
        x: Non-negative click count (scalar, array or Series).

    Returns:
        log(1 + x), element-wise for array-like input; 0 maps to 0.
    """
    return np.log1p(x)

### numberOfClicksHistory
* First, we linearize it through a log transformation.
* We then apply min-max to obtain normalized values from 0 to 1.

In [8]:
# Log-transform, then min-max scale, the whole column with vectorized
# arithmetic instead of calling the helpers once per row through .apply().
# The nullable UInt32 column is cast to float64 first so the result is a
# plain float64 column.
# NOTE: this cell overwrites the column, so re-running it without reloading
# the CSV applies the transform to already-normalized values.
clicks_log = treat_number_of_clicks_history(
    df_train_cleaned['numberOfClicksHistory'].astype('float64')
)

# Bounds are taken on the log scale, i.e. after the transform.
min_num_of_clicks_history_log = clicks_log.min()
max_num_of_clicks_history_log = clicks_log.max()

# Assign once, so the frame never holds a half-transformed column.
df_train_cleaned['numberOfClicksHistory'] = normalize_min_max(
    clicks_log,
    xmin=min_num_of_clicks_history_log,
    xmax=max_num_of_clicks_history_log,
)

df_train_cleaned.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory
count,6349896.0,6349896.0,6349896.0
mean,0.2938048,60602.219329,41.4767
std,0.3365609,48749.808355,21.818096
min,0.0,5000.0,0.02
25%,0.0,21235.0,24.620001
50%,0.2018491,47627.0,40.349998
75%,0.566662,85138.0,56.27
max,1.0,234101.0,100.0


### timeOnPageHistory
* Apply min-max to obtain normalized values from 0 to 1.

In [9]:
# Min-max scale the whole column with vectorized arithmetic instead of a
# per-row .apply(). Bounds are read from the column as loaded; the values are
# cast to float64 before scaling so the result is a plain float64 column.
time_on_page = df_train_cleaned['timeOnPageHistory']
min_time_on_page_history = time_on_page.min()
max_time_on_page_history = time_on_page.max()
df_train_cleaned['timeOnPageHistory'] = normalize_min_max(
    time_on_page.astype('float64'),
    xmin=min_time_on_page_history,
    xmax=max_time_on_page_history,
)

df_train_cleaned.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory
count,6349896.0,6349896.0,6349896.0
mean,0.2938048,0.2426974,41.4767
std,0.3365609,0.2127874,21.818096
min,0.0,0.0,0.02
25%,0.0,0.07086394,24.620001
50%,0.2018491,0.186062,40.349998
75%,0.566662,0.3497933,56.27
max,1.0,1.0,100.0


### scrollPercentageHistory
* Divide the percentage value by 100 to obtain normalized values from 0 to 1.

In [10]:
# Rescale the scroll percentage to a fraction by dividing by 100.
scroll_percentage = df_train_cleaned['scrollPercentageHistory']
df_train_cleaned['scrollPercentageHistory'] = scroll_percentage.div(100)

df_train_cleaned.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory
count,6349896.0,6349896.0,6349896.0
mean,0.2938048,0.2426974,0.414767
std,0.3365609,0.2127874,0.218181
min,0.0,0.0,0.0002
25%,0.0,0.07086394,0.2462
50%,0.2018491,0.186062,0.4035
75%,0.566662,0.3497933,0.5627
max,1.0,1.0,1.0


### Save Results to CSV

In [11]:
# Write the normalized frame to the path configured under DF_TRAIN_FEATURES_CSV.
# to_csv keeps its default index=True here, so the index is written as an
# unnamed first column.
# NOTE(review): readers of this file may rely on that column being present;
# confirm before switching to index=False.
features_csv_path = config["DF_TRAIN_FEATURES_CSV"]
df_train_cleaned.to_csv(features_csv_path)