In [1]:
import json

# Load the notebook settings (the CSV paths are read from this mapping) from
# config.json. The encoding is pinned to UTF-8 so the file is decoded the same
# way on every platform instead of depending on the locale's default encoding.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

In [2]:
import pandas as pd

In [3]:
# Column name -> pandas dtype used when parsing the treated training CSV.
# Nullable extension dtypes (capitalised names) are used for numeric columns.
# NOTE(review): historySize, timestampHistory and timestampHistory_new do not
# appear among the columns of the loaded frame — confirm they are still needed.
dtype_df_train_treated = {
    # identifiers / categorical fields
    "userId": "string",
    "userType": "category",
    "historySize": "UInt16",
    "history": "string",
    "timestampHistory": "string",
    # per-interaction engagement measurements
    "numberOfClicksHistory": "UInt32",
    "timeOnPageHistory": "UInt64",
    "scrollPercentageHistory": "Float32",
    "pageVisitsCountHistory": "UInt32",
    "timestampHistory_new": "string",
}

In [4]:
# Read the treated training set with explicit dtypes. The CSV carries an
# "Unnamed: 0" column (presumably a saved index — verify against the writer);
# it is skipped at parse time so it is never materialised for every row and the
# frame does not have to be mutated in place afterwards.
# For quick experiments, add nrows=500000 to the call to load only a sample.
df_train = pd.read_csv(
    config["TREATED_TRAIN_CSV"],
    dtype=dtype_df_train_treated,
    usecols=lambda column: column != "Unnamed: 0",
)
df_train.head(2)

Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,userType
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.580002,1,Non-Logged
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,0,115232,73.360001,1,Non-Logged


In [5]:
# Show the row count, column dtypes and memory usage of the loaded frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8123951 entries, 0 to 8123950
Data columns (total 7 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   userId                   string  
 1   history                  string  
 2   numberOfClicksHistory    UInt32  
 3   timeOnPageHistory        UInt64  
 4   scrollPercentageHistory  Float32 
 5   pageVisitsCountHistory   UInt32  
 6   userType                 category
dtypes: Float32(1), UInt32(2), UInt64(1), category(1), string(2)
memory usage: 317.7 MB


In [6]:
# Summary statistics of the numeric columns before any filtering; used to spot
# implausible extremes in the raw values.
df_train.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory
count,8123951.0,8123951.0,8123951.0,8123951.0
mean,12.053646,88768.68892,54.005501,1.136566
std,37.3863,113932.180358,1832.254883,1.084772
min,0.0,5000.0,0.02,1.0
25%,0.0,29870.0,27.08,1.0
50%,1.0,60000.0,43.23,1.0
75%,14.0,114582.0,59.93,1.0
max,14562.0,46033049.0,526275.5625,654.0


In [7]:
# Filter by news article: discard rows whose article ("history") appears at
# most THRESHOLD_RARE_NEWS times in the training set. With the threshold at 0
# no article qualifies as rare, so every row is kept.
THRESHOLD_RARE_NEWS = 0

# Count the occurrences of each article once and reuse the result; the column
# has millions of rows, so recomputing value_counts() is wasted work.
history_counts = df_train["history"].value_counts()
# Kept as a DataFrame under its original name for any later cell that uses it.
clicks_counts = pd.DataFrame(history_counts)
rare_news = history_counts[history_counts <= THRESHOLD_RARE_NEWS].index
common_news = df_train[~df_train["history"].isin(rare_news)]

In [8]:
# Preview the first rows that remain after the rare-news filter.
common_news.head(3)

Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory,userType
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0,71998,81.580002,1,Non-Logged
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,0,115232,73.360001,1,Non-Logged
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,19ba89fc-1e06-4c5d-9c57-4a3088dc0511,68,131495,51.740002,1,Non-Logged


In [9]:
# Summary statistics after the rare-news filter, for comparison with the
# statistics of the unfiltered frame.
common_news.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory
count,8123951.0,8123951.0,8123951.0,8123951.0
mean,12.053646,88768.68892,54.005501,1.136566
std,37.3863,113932.180358,1832.254883,1.084772
min,0.0,5000.0,0.02,1.0
25%,0.0,29870.0,27.08,1.0
50%,1.0,60000.0,43.23,1.0
75%,14.0,114582.0,59.93,1.0
max,14562.0,46033049.0,526275.5625,654.0


In [10]:
# Row count, dtypes and memory usage after the rare-news filter.
common_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8123951 entries, 0 to 8123950
Data columns (total 7 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   userId                   string  
 1   history                  string  
 2   numberOfClicksHistory    UInt32  
 3   timeOnPageHistory        UInt64  
 4   scrollPercentageHistory  Float32 
 5   pageVisitsCountHistory   UInt32  
 6   userType                 category
dtypes: Float32(1), UInt32(2), UInt64(1), category(1), string(2)
memory usage: 317.7 MB


In [11]:
def get_outliers_bounds(df, column_name):
    """Return the (lower, upper) outlier bounds of one column.

    The bounds follow the 1.5 * IQR rule (Q1 - 1.5*IQR and Q3 + 1.5*IQR) and
    are then clamped to the column's observed minimum and maximum, so they
    never extend beyond values that are actually present.

    Parameters:
        df: DataFrame holding the column.
        column_name: label of the column to analyse.

    Returns:
        Tuple ``(lower_bound, upper_bound)``.
    """
    values = df.loc[:, column_name]
    smallest, largest = values.min(), values.max()
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)

    # Distance from each quartile to its bound.
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_fence = first_quartile - whisker
    upper_fence = third_quartile + whisker

    # Clamp the bounds to the observed range of the data.
    lower_bound = smallest if lower_fence < smallest else lower_fence
    upper_bound = largest if upper_fence > largest else upper_fence
    return (lower_bound, upper_bound)

def filter_outliers(df, column_name, lower_bound = None, upper_bound = None):
    """Keep only the rows whose value in ``column_name`` lies within bounds.

    Any bound left as ``None`` is taken from ``get_outliers_bounds``; an
    explicit bound overrides the computed one. Both bounds are inclusive, and
    the bounds actually applied are printed.

    Parameters:
        df: DataFrame to filter.
        column_name: label of the column the bounds apply to.
        lower_bound: optional inclusive minimum.
        upper_bound: optional inclusive maximum.

    Returns:
        The filtered DataFrame; the original row labels are preserved.
    """
    default_lower, default_upper = get_outliers_bounds(df, column_name)
    lower_bound = default_lower if lower_bound is None else lower_bound
    upper_bound = default_upper if upper_bound is None else upper_bound

    print(lower_bound, upper_bound)
    column = df[column_name]
    in_range = (column >= lower_bound) & (column <= upper_bound)
    return df.loc[in_range]

In [12]:
# Remove outliers column by column; each step filters the result of the
# previous one. Scroll percentage is capped at 100 explicitly, while the other
# columns use the bounds computed by get_outliers_bounds.
# NOTE(review): when a column's first and third quartiles are equal, the
# computed bounds collapse to a single value and only rows holding exactly that
# value survive — confirm this is intended for pageVisitsCountHistory.
common_news = (
    common_news
    .pipe(filter_outliers, 'scrollPercentageHistory', lower_bound=None, upper_bound=100.0)
    .pipe(filter_outliers, 'pageVisitsCountHistory')
    .pipe(filter_outliers, 'timeOnPageHistory')
    .pipe(filter_outliers, 'numberOfClicksHistory')
)



0.02 100.0
1.0 1.0
5000 234101.0
0 30.0


In [13]:
# Summary statistics after outlier removal, to check the resulting value ranges.
common_news.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory
count,6349896.0,6349896.0,6349896.0,6349896.0
mean,4.733215,60602.219329,41.4767,1.0
std,7.590275,48749.808355,21.818096,0.0
min,0.0,5000.0,0.02,1.0
25%,0.0,21235.0,24.620001,1.0
50%,1.0,47627.0,40.349998,1.0
75%,6.0,85138.0,56.27,1.0
max,30.0,234101.0,100.0,1.0


In [14]:
# Drop pageVisitsCountHistory from the cleaned frame. Rebinding the name
# (instead of inplace=True) avoids mutating a frame obtained by filtering, and
# errors="ignore" lets this cell be re-run without raising a KeyError once the
# column is already gone.
common_news = common_news.drop(columns=["pageVisitsCountHistory"], errors="ignore")

In [15]:
# Final check of row count, remaining columns, dtypes and memory usage.
common_news.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6349896 entries, 0 to 8123949
Data columns (total 6 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   userId                   string  
 1   history                  string  
 2   numberOfClicksHistory    UInt32  
 3   timeOnPageHistory        UInt64  
 4   scrollPercentageHistory  Float32 
 5   userType                 category
dtypes: Float32(1), UInt32(1), UInt64(1), category(1), string(2)
memory usage: 266.5 MB


In [16]:
# Persist the cleaned training set to the path configured under
# DF_TRAIN_CLEANED_CSV. The row index is written as well (the pandas default,
# spelled out here), so the file starts with an unnamed index column.
cleaned_csv_path = config["DF_TRAIN_CLEANED_CSV"]
common_news.to_csv(cleaned_csv_path, index=True)