In [1]:
import json

with open('config.json', 'r') as f:
    config = json.load(f)

In [2]:
import pandas as pd
import numpy as np

In [3]:
dtype_df_train_cleaned_treated = {
"userId" : 'string',
"history" : 'string',
"numberOfClicksHistory" : 'Float32',
"timeOnPageHistory" : 'Float32',
"scrollPercentageHistory" : 'Float32',
"userType" : 'category'
}

In [4]:
df_train_cleaned = pd.read_csv(config["DF_TRAIN_FEATURES_CSV"], dtype=dtype_df_train_cleaned_treated)
# df_train_cleaned = pd.read_csv(config["DF_TRAIN_FEATURES_CSV"], dtype=dtype_df_train_cleaned_treated, nrows=500000)
df_train_cleaned.drop(columns=["Unnamed: 0"],inplace=True)
df_train_cleaned.head(2)

Unnamed: 0,userId,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,userType
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,0.0,0.292439,0.8158,Non-Logged
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,0.0,0.48115,0.7336,Non-Logged


In [5]:
df_train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6349896 entries, 0 to 6349895
Data columns (total 6 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   userId                   string  
 1   history                  string  
 2   numberOfClicksHistory    Float32 
 3   timeOnPageHistory        Float32 
 4   scrollPercentageHistory  Float32 
 5   userType                 category
dtypes: Float32(3), category(1), string(2)
memory usage: 193.8 MB


In [6]:
df_train_cleaned.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory
count,6349896.0,6349896.0,6349896.0
mean,0.293805,0.242697,0.414767
std,0.336561,0.212787,0.218181
min,0.0,0.0,0.0002
25%,0.0,0.070864,0.2462
50%,0.201849,0.186062,0.4035
75%,0.566662,0.349793,0.5627
max,1.0,1.0,1.0


In [7]:
weight_n_clicks = 1
weight_time_on_page = 2
weight_scroll_percent = 2

df_train_cleaned['score'] = weight_n_clicks*df_train_cleaned['numberOfClicksHistory'] + weight_time_on_page*df_train_cleaned['timeOnPageHistory'] + weight_scroll_percent*df_train_cleaned['scrollPercentageHistory']
df_train_cleaned.describe()

Unnamed: 0,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,score
count,6349896.0,6349896.0,6349896.0,6349896.0
mean,0.293805,0.242697,0.414767,1.608734
std,0.336561,0.212787,0.218181,0.867295
min,0.0,0.0,0.0002,0.002373
25%,0.0,0.070864,0.2462,0.924991
50%,0.201849,0.186062,0.4035,1.56746
75%,0.566662,0.349793,0.5627,2.202917
max,1.0,1.0,1.0,4.990057


In [8]:
df_train_cleaned.drop(columns=["numberOfClicksHistory", "timeOnPageHistory", "scrollPercentageHistory"],inplace=True)

In [9]:
df_train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6349896 entries, 0 to 6349895
Data columns (total 4 columns):
 #   Column    Dtype   
---  ------    -----   
 0   userId    string  
 1   history   string  
 2   userType  category
 3   score     Float32 
dtypes: Float32(1), category(1), string(2)
memory usage: 133.2 MB


In [10]:
df_train_cleaned.describe()

Unnamed: 0,score
count,6349896.0
mean,1.608734
std,0.867295
min,0.002373
25%,0.924991
50%,1.56746
75%,2.202917
max,4.990057


In [11]:
dtype_df_items = {
"page" : 'string',
"url" : 'string',
"issued" : 'string',
"modified" : 'string',
"title" : 'string',
"body" : 'string',
"caption" : 'string',
"age_in_days" : 'UInt32',
"age_exp" : 'Float32',
"age_exp_normalized" : 'Float32',
"ageCategories" : 'category'
}

In [12]:
df_news = pd.read_csv(config["DF_ITEMS_FEATURE"], dtype=dtype_df_items)
df_news.drop(columns=["Unnamed: 0"],inplace=True)
df_news.head(3)

Unnamed: 0,page,age_exp_normalized,ageCategories
0,7371a9b5-5824-4c57-8704-00a74feebe79,0.151439,very-old
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,0.140788,very-old
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,0.12261,very-old


In [13]:
df_merged = pd.merge(df_train_cleaned, df_news, left_on='history', right_on='page', how='left')
df_merged.drop(columns=["page"],inplace=True) # 'page' is the same as 'history'
df_merged.head()

Unnamed: 0,userId,history,userType,score,age_exp_normalized,ageCategories
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477,0.980416,recent
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501,0.613061,mid
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861,0.880859,recent
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,1.68271,0.945895,recent
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,2.266852,0.13293,very-old


In [14]:
df_merged["check_hifens"] = df_merged["history"].str.split('-').apply(lambda x : len(x) == 5).astype("bool")
df_merged["check_size"] = df_merged["history"].str.replace(r'-', '', regex=True).apply(lambda x : len(x) == 32).astype("bool")
df_merged["check_chars"] = df_merged["history"].str.replace(r'-', '', regex=True).str.replace(r'[a-f0-9]', '', regex=True).apply(lambda x : x == '').astype("bool")
df_merged["check_history"] = (df_merged["check_hifens"] & df_merged["check_size"] & df_merged["check_chars"]).astype("bool")
df_merged["check_history"].value_counts()

check_history
True     6335315
False      14581
Name: count, dtype: int64

In [15]:
df_merged = df_merged[~df_merged["check_history"]==False]
df_merged

Unnamed: 0,userId,history,userType,score,age_exp_normalized,ageCategories,check_hifens,check_size,check_chars,check_history
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,2.216477,0.980416,recent,True,True,True,True
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,2.429501,0.613061,mid,True,True,True,True
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,1.794861,0.880859,recent,True,True,True,True
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,1.68271,0.945895,recent,True,True,True,True
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,2.266852,0.13293,very-old,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...
6349891,5889d6ebbf62e6c115e0a280063dc8189cca490cbfea56...,7a349b09-badc-40a9-a194-83d959aeb50c,Non-Logged,1.94055,0.966442,recent,True,True,True,True
6349892,5889d6ebbf62e6c115e0a280063dc8189cca490cbfea56...,6f344c45-e731-41b4-8c65-9967ebc03096,Non-Logged,3.361101,0.937478,recent,True,True,True,True
6349893,5889d6ebbf62e6c115e0a280063dc8189cca490cbfea56...,4c586bb4-f71d-4b39-9df8-e38ac3f632a0,Non-Logged,0.919598,0.939154,recent,True,True,True,True
6349894,5889d6ebbf62e6c115e0a280063dc8189cca490cbfea56...,855d20b7-53f2-4678-a10f-55402d085018,Non-Logged,1.990197,0.929145,recent,True,True,True,True


In [16]:
w_initial_score = 0.8
w_age_norm = 1-w_initial_score
df_merged["score_init"] = df_merged["score"]*w_initial_score + df_merged["age_exp_normalized"]*w_age_norm
df_merged["score_fn"] = df_merged["score_init"].apply(lambda x: np.power(x,0.33))
df_merged["score_norm"] = (df_merged["score_fn"]-df_merged["score_fn"].min())/(df_merged["score_fn"].max()-df_merged["score_fn"].min())
df_merged["score_1_to_5"] = df_merged["score_norm"]*4+1
df_merged["score_1_to_5_int"] = df_merged["score_1_to_5"].round(0).astype("UInt16")
df_merged.describe()

Unnamed: 0,score,age_exp_normalized,score_init,score_fn,score_norm,score_1_to_5,score_1_to_5_int
count,6335315.0,6335315.0,6335315.0,6335315.0,6335315.0,6335315.0,6335315.0
mean,1.608747,0.902338,1.467463,1.102514,0.5808968,3.323587,3.32936
std,0.867371,0.180707,0.697944,0.1925553,0.1608489,0.6433958,0.702031
min,0.002373,0.106769,0.065662,0.407111,0.0,1.0,1.0
25%,0.924811,0.934135,0.916307,0.9715689,0.4715136,2.886054,3.0
50%,1.567538,0.956106,1.434961,1.126567,0.6009899,3.40396,3.0
75%,2.203038,0.978657,1.946347,1.245784,0.7005761,3.802304,4.0
max,4.990057,1.0,4.188129,1.60423,1.0,5.0,5.0


In [17]:
column_score_to_use = "score_norm"
df_merged = df_merged[["userId","history","userType",column_score_to_use]]
df_merged.rename(columns={column_score_to_use:"score"},inplace=True)

df_merged.rename(columns={"ageCategories" : "historyFreshnessNormalized"},inplace=True)
df_merged.describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged.rename(columns={column_score_to_use:"score"},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged.rename(columns={"ageCategories" : "historyFreshnessNormalized"},inplace=True)


Unnamed: 0,score
count,6335315.0
mean,0.5808968
std,0.1608489
min,0.0
25%,0.4715136
50%,0.6009899
75%,0.7005761
max,1.0


In [18]:
df_merged.head()

Unnamed: 0,userId,history,userType,score
0,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,80aa7bb2-adce-4a55-9711-912c407927a1,Non-Logged,0.704604
1,fbb963d61eb8149e7f43b1bd905457ba5e106a830ddc27...,d9e5f15d-b441-4d8b-bee4-462b106d3916,Non-Logged,0.721303
2,17f1083e6079b0f28f7820a6803583d1c1b405c0718b11...,e273dba4-136c-45fb-bdd6-0cc57b13aaf0,Non-Logged,0.637834
3,528a8d7a2af73101da8d6709c1ec875b449a5a58749a99...,a0562805-c7d1-4ffd-b622-87c50ae006f4,Non-Logged,0.622225
4,2dd18b58a634a4e77181a202cf152df6169dfb3e4230ef...,233f8238-2ce0-470f-a9d5-0e0ac530382a,Non-Logged,0.68147


In [20]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6335315 entries, 0 to 6349895
Data columns (total 4 columns):
 #   Column    Dtype   
---  ------    -----   
 0   userId    string  
 1   history   string  
 2   userType  category
 3   score     float64 
dtypes: category(1), float64(1), string(2)
memory usage: 199.4 MB


In [21]:
df_merged.to_csv(config["DF_TRAIN_SCORES"])