In [1]:
import json

# Load the notebook configuration; later cells read the CSV path from it
# (key "TREATED_TRAIN_CSV").
# JSON text is UTF-8, so the encoding is pinned instead of relying on the
# platform's locale default.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Column name -> pandas dtype, handed to `pd.read_csv(..., dtype=...)` when the
# treated training CSV is loaded.
# NOTE(review): historySize, timestampHistory and timestampHistory_new do not
# show up among the columns of the loaded frame -- presumably leftovers from an
# earlier CSV layout; confirm before removing them.
dtype_df_train_treated = dict(
    userId='string',
    userType='category',
    historySize='UInt16',
    history='string',
    timestampHistory='string',
    numberOfClicksHistory='UInt32',
    timeOnPageHistory='UInt64',
    scrollPercentageHistory='Float32',
    pageVisitsCountHistory='UInt32',
    timestampHistory_new='string',
)

In [4]:
# Row cap used while iterating on a sample; set to None to load the whole CSV.
TRAIN_SAMPLE_ROWS = 500_000

# The CSV has an unnamed first column that pandas labels "Unnamed: 0"
# (presumably an index written by an earlier export -- TODO confirm); it is
# dropped right away, without mutating in place.
df_train = pd.read_csv(
    config["TREATED_TRAIN_CSV"],
    dtype=dtype_df_train_treated,
    nrows=TRAIN_SAMPLE_ROWS,
).drop(columns=["Unnamed: 0"])
df_train.head(2)

Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,userType
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.580002,1,Non-Logged
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,0,115232,73.360001,1,Non-Logged


In [5]:
# Sanity check of the loaded sample: column dtypes, non-null counts and memory use.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   userId                   500000 non-null  string  
 1   history                  500000 non-null  string  
 2   numberOfClicksHistory    500000 non-null  UInt32  
 3   timeOnPageHistory        500000 non-null  UInt64  
 4   scrollPercentageHistory  500000 non-null  Float32 
 5   pageVisitsCountHistory   500000 non-null  UInt32  
 6   userType                 500000 non-null  category
dtypes: Float32(1), UInt32(2), UInt64(1), category(1), string(2)
memory usage: 19.6 MB


In [6]:
# Summary statistics of the numeric columns before any filtering.
# scrollPercentageHistory has a maximum far above 100, which is why a later
# cell caps it explicitly at 100.0.
df_train.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory
count,500000.0,500000.0,500000.0,500000.0
mean,12.15158,89593.384512,48.63739,1.149904
std,30.664431,110810.268458,1081.962036,1.219241
min,0.0,5000.0,0.72,1.0
25%,0.0,29764.75,27.1,1.0
50%,1.0,60000.0,43.52,1.0
75%,14.0,115901.5,60.540001,1.0
max,3732.0,12975738.0,466698.21875,211.0


In [7]:
# Filter by news article: an article is "rare" when it appears in at most
# THRESHOLD_RARE_NEWS rows of the loaded sample. With the threshold at 0 no
# article qualifies (every listed count is >= 1), so all rows are kept.
THRESHOLD_RARE_NEWS = 0

# Count occurrences once and reuse the result for both names below.
history_counts = df_train["history"].value_counts()
clicks_counts = pd.DataFrame(history_counts)
rare_news = history_counts[history_counts <= THRESHOLD_RARE_NEWS].index
# Explicit copy: later cells add columns to this frame, and assigning into a
# filtered selection of df_train would otherwise raise chained-assignment warnings.
common_news = df_train[~df_train["history"].isin(rare_news)].copy()

In [8]:
# Peek at the first rows that survived the rare-news filter.
common_news.head(3)

Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,userType
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.580002,1,Non-Logged
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,0,115232,73.360001,1,Non-Logged
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,19ba89fc-1e06-4c5d-9c57-4a3088dc0511,68,131495,51.740002,1,Non-Logged


In [9]:
# Summary statistics after the rare-news filter (identical to df_train's while
# THRESHOLD_RARE_NEWS is 0, because nothing is removed).
common_news.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory
count,500000.0,500000.0,500000.0,500000.0
mean,12.15158,89593.384512,48.63739,1.149904
std,30.664431,110810.268458,1081.962036,1.219241
min,0.0,5000.0,0.72,1.0
25%,0.0,29764.75,27.1,1.0
50%,1.0,60000.0,43.52,1.0
75%,14.0,115901.5,60.540001,1.0
max,3732.0,12975738.0,466698.21875,211.0


In [10]:
# Confirm row count and dtypes after the rare-news filter.
common_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   userId                   500000 non-null  string  
 1   history                  500000 non-null  string  
 2   numberOfClicksHistory    500000 non-null  UInt32  
 3   timeOnPageHistory        500000 non-null  UInt64  
 4   scrollPercentageHistory  500000 non-null  Float32 
 5   pageVisitsCountHistory   500000 non-null  UInt32  
 6   userType                 500000 non-null  category
dtypes: Float32(1), UInt32(2), UInt64(1), category(1), string(2)
memory usage: 19.6 MB


In [11]:
def get_outliers_bounds(df, column_name):
    """Return 1.5 x IQR outlier bounds for a column, clamped to its observed range.

    The raw bounds are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. A raw bound
    that falls outside the column's observed minimum / maximum is replaced by
    that minimum / maximum.

    Args:
        df: DataFrame that holds the column.
        column_name: Label of the column to inspect.

    Returns:
        Tuple ``(lower_bound, upper_bound)``.
    """
    values = df.loc[:, column_name]
    observed_min, observed_max = values.min(), values.max()
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)

    lower_fence = q1 - whisker
    upper_fence = q3 + whisker

    # Never report a bound beyond what the data actually contains.
    lower_bound = observed_min if lower_fence < observed_min else lower_fence
    upper_bound = observed_max if upper_fence > observed_max else upper_fence
    return (lower_bound, upper_bound)

def filter_outliers(df, column_name, lower_bound = None, upper_bound = None):
    """Keep only the rows whose `column_name` value lies within the bounds.

    Both bounds are inclusive. A bound left as ``None`` is taken from
    ``get_outliers_bounds(df, column_name)``, i.e. computed on the frame that
    is passed in. The bounds actually used are printed.

    Args:
        df: DataFrame to filter; it is not modified.
        column_name: Label of the column the bounds apply to.
        lower_bound: Inclusive lower limit, or ``None`` to compute it.
        upper_bound: Inclusive upper limit, or ``None`` to compute it.

    Returns:
        A new, independent DataFrame with the rows inside the bounds. It is an
        explicit copy so callers can add columns to it without triggering
        pandas chained-assignment warnings.
    """
    # Only compute the data-driven bounds when at least one is actually needed.
    if lower_bound is None or upper_bound is None:
        (lb, ub) = get_outliers_bounds(df, column_name)
        if lower_bound is None:
            lower_bound = lb
        if upper_bound is None:
            upper_bound = ub

    print(lower_bound, upper_bound)
    in_range = (df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)
    return df.loc[in_range].copy()

In [12]:
# Remove outliers one column at a time. Order matters: each step derives its
# default bounds from the rows that survived the previous step. The scroll
# percentage additionally gets a fixed upper limit of 100.0.
# NOTE: this cell rebinds `common_news`, so re-running it filters the already
# filtered frame again; re-run the cell that creates `common_news` first.
common_news = (
    common_news
    .pipe(filter_outliers, 'scrollPercentageHistory', upper_bound=100.0)
    .pipe(filter_outliers, 'pageVisitsCountHistory')
    .pipe(filter_outliers, 'timeOnPageHistory')
    .pipe(filter_outliers, 'numberOfClicksHistory')
)



0.72 100.0
1.0 1.0
5000 234069.5
0 30.0


In [13]:
# Summary statistics after outlier removal.
# NOTE(review): pageVisitsCountHistory is constant (1) at this point -- its
# computed bounds were 1.0 .. 1.0 -- so the column no longer carries information.
common_news.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory
count,389626.0,389626.0,389626.0,389626.0
mean,4.780656,60778.25285,41.867359,1.0
std,7.58837,48892.102704,22.162825,0.0
min,0.0,5000.0,0.72,1.0
25%,0.0,21310.5,24.709999,1.0
50%,1.0,47686.0,40.599998,1.0
75%,7.0,85721.0,56.827501,1.0
max,30.0,234068.0,100.0,1.0


In [14]:
# Confirm the remaining row count and dtypes after outlier removal.
common_news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 389626 entries, 0 to 499999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   userId                   389626 non-null  string  
 1   history                  389626 non-null  string  
 2   numberOfClicksHistory    389626 non-null  UInt32  
 3   timeOnPageHistory        389626 non-null  UInt64  
 4   scrollPercentageHistory  389626 non-null  Float32 
 5   pageVisitsCountHistory   389626 non-null  UInt32  
 6   userType                 389626 non-null  category
dtypes: Float32(1), UInt32(2), UInt64(1), category(1), string(2)
memory usage: 18.2 MB


In [None]:
# Log-transform the click counts. The 1e-4 offset keeps log() finite for rows
# with zero clicks. The column is cast to float64 and transformed in a single
# vectorized call instead of a per-row lambda; `assign` builds a new frame, so
# nothing is written into a filtered selection of another DataFrame.
common_news = common_news.assign(
    numberOfClicksHistoryLog=np.log(
        common_news['numberOfClicksHistory'].astype('float64') + 1e-4
    )
)

common_news.describe()

In [None]:
min_num_of_clicks_history_log = common_news.loc[:, 'numberOfClicksHistoryLog'].min()
max_num_of_clicks_history_log = common_news.loc[:, 'numberOfClicksHistoryLog'].max()

# Min-max scaling of the log-transformed clicks:
#   normalized = (x - min) / (max - min)
# Vectorized over the whole column. If every value were identical the
# denominator would be zero and the result NaN.
common_news = common_news.assign(
    numberOfClicksHistoryNormalized=(
        (common_news['numberOfClicksHistoryLog'] - min_num_of_clicks_history_log)
        / (max_num_of_clicks_history_log - min_num_of_clicks_history_log)
    )
)
common_news.describe()

In [None]:
# Log-transform the time on page, using the same 1e-4 offset as the click-count
# transform. The column is cast to float64 and transformed in a single
# vectorized call instead of a per-row lambda; `assign` builds a new frame, so
# nothing is written into a filtered selection of another DataFrame.
common_news = common_news.assign(
    timeOnPageHistoryLog=np.log(
        common_news['timeOnPageHistory'].astype('float64') + 1e-4
    )
)

common_news.describe()

In [None]:
min_time_on_page_history_log = common_news.loc[:, 'timeOnPageHistoryLog'].min()
max_time_on_page_history_log = common_news.loc[:, 'timeOnPageHistoryLog'].max()

# Min-max scaling of the log-transformed time on page:
#   normalized = (x - min) / (max - min)
# Vectorized over the whole column. If every value were identical the
# denominator would be zero and the result NaN.
common_news = common_news.assign(
    timeOnPageHistoryNormalized=(
        (common_news['timeOnPageHistoryLog'] - min_time_on_page_history_log)
        / (max_time_on_page_history_log - min_time_on_page_history_log)
    )
)
common_news.describe()