### Imports and configuration

In [1]:
import json

# Load the notebook settings; the input and output CSV paths used by later
# cells are looked up in this dict.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale-dependent default encoding.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

In [2]:
import pandas as pd

### Optimized Schema for Reading CSV

In [3]:
# Column -> dtype mapping handed to pd.read_csv: every raw column of the items
# file is read with pandas' 'string' dtype.
dtype_df_items = dict.fromkeys(
    ["page", "url", "issued", "modified", "title", "body", "caption"],
    'string',
)

### Reading News (Items) DataFrame from CSV

In [4]:
# Load the treated news items with the explicit string schema.
# "Unnamed: 0" is presumably the row index written by an upstream to_csv call
# (confirm against the producer). errors="ignore" keeps this cell working when
# the file was exported without that column, instead of raising KeyError.
df_news = pd.read_csv(
    config["TREATED_ITEMS_CSV"], dtype=dtype_df_items
).drop(columns=["Unnamed: 0"], errors="ignore")

In [5]:
# Preview the first two rows to sanity-check the load.
df_news.head(2)

Unnamed: 0,page,url,issued,modified,title,body,caption
0,7371a9b5-5824-4c57-8704-00a74feebe79,http://g1.globo.com/al/alagoas/noticia/2018/09...,2018-09-13 14:52:55+00:00,2018-09-14 16:14:49+00:00,Corpo de motorista da Uber é encontrado em can...,Corpo de motorista de aplicativo desaparecido ...,"Segundo a polícia, Antônio Vitor foi solicitad..."
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,http://g1.globo.com/am/amazonas/noticia/detent...,2018-05-20 20:42:40+00:00,2018-05-20 20:42:40+00:00,Detento recapturado após fuga por túnel volta ...,Detento disse que passou nome falso ao dar ent...,Ele tinha registro em presídio com nome falso....


### Get datetime "data" column from "issued"

In [6]:
import numpy as np

# Parse the "issued" timestamp strings into datetimes.
# utc=True pins the result to one tz-aware datetime64[ns, UTC] column even if
# some rows carry a different UTC offset, so the datetime arithmetic and the
# .dt accessor used on this column later always apply.
df_news["data"] = pd.to_datetime(df_news["issued"], utc=True)

In [7]:
# Preview the first two rows to confirm the new "data" column was added.
df_news.head(2)

Unnamed: 0,page,url,issued,modified,title,body,caption,data
0,7371a9b5-5824-4c57-8704-00a74feebe79,http://g1.globo.com/al/alagoas/noticia/2018/09...,2018-09-13 14:52:55+00:00,2018-09-14 16:14:49+00:00,Corpo de motorista da Uber é encontrado em can...,Corpo de motorista de aplicativo desaparecido ...,"Segundo a polícia, Antônio Vitor foi solicitad...",2018-09-13 14:52:55+00:00
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,http://g1.globo.com/am/amazonas/noticia/detent...,2018-05-20 20:42:40+00:00,2018-05-20 20:42:40+00:00,Detento recapturado após fuga por túnel volta ...,Detento disse que passou nome falso ao dar ent...,Ele tinha registro em presídio com nome falso....,2018-05-20 20:42:40+00:00


### Compute the age of each news item, in days

In [8]:
# Reference point for "age": the newest publication timestamp in the dataset
# (not the wall-clock time at which the notebook runs).
most_recent_date = df_news["data"].max()

# Whole days elapsed between each item and that reference point.
elapsed_since_issue = most_recent_date - df_news["data"]
df_news["age_in_days"] = elapsed_since_issue.dt.days

In [9]:
# Show the reference timestamp that item ages are measured from.
most_recent_date

Timestamp('2022-08-15 02:57:16+0000', tz='UTC')

In [10]:
# Summary statistics of the item ages; the max is the oldest item, in days.
df_news["age_in_days"].describe()

count    255603.000000
mean        559.549724
std         546.845411
min           0.000000
25%         104.000000
50%         358.000000
75%         909.000000
max        2936.000000
Name: age_in_days, dtype: float64

### Defining a time constant to characterize the exponential behavior
* We model the attention a news item receives as a decaying exponential of its age: `age_exp = exp(-age/time_constant)`
* The `time_constant` is chosen so that `max(age_in_days)` corresponds to approximately `6*time_constant` (about 99.75% of the decay, since `exp(-6) ≈ 0.0025`)
* Assuming `max(age_in_days) ≈ 3000`, we get `time_constant = 3000/6 = 500`.


In [11]:
# Settling-time rule: the decay is treated as complete after
# `settling_time_taus_factor` time constants, and that point is pinned to the
# assumed maximum news age, in days.
settling_time_taus_factor = 6
max_news_age = 3000  # fixed round figure, not read from the data
time_constant = max_news_age / settling_time_taus_factor
print(f"{max_news_age} {time_constant}")

3000 500.0


### Apply exponential transformation to obtain `age_exp`

In [12]:
# Exponential decay weight: exp(-age / time_constant), i.e. 1.0 for an item of
# age 0 and smaller values for older items.
decay_exponent = -df_news["age_in_days"] / time_constant
df_news["age_exp"] = np.exp(decay_exponent).astype('Float32')

### Apply normalization to range from 0.1 to 1, obtaining `age_exp_normalized`

In [13]:
# Linearly rescale the decay weight so that it spans [new_min, new_max],
# i.e. [0.1, 1]: a weight of 0 maps to new_min and a weight of 1 to new_max.
new_min = 0.1
new_max = 1
df_news["age_exp_normalized"] = (
    new_min + (new_max - new_min) * df_news["age_exp"]
).astype('Float32')


In [14]:
# Preview the first five rows with the new age columns.
df_news.head(5)

Unnamed: 0,page,url,issued,modified,title,body,caption,data,age_in_days,age_exp,age_exp_normalized
0,7371a9b5-5824-4c57-8704-00a74feebe79,http://g1.globo.com/al/alagoas/noticia/2018/09...,2018-09-13 14:52:55+00:00,2018-09-14 16:14:49+00:00,Corpo de motorista da Uber é encontrado em can...,Corpo de motorista de aplicativo desaparecido ...,"Segundo a polícia, Antônio Vitor foi solicitad...",2018-09-13 14:52:55+00:00,1431,0.057154,0.151439
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,http://g1.globo.com/am/amazonas/noticia/detent...,2018-05-20 20:42:40+00:00,2018-05-20 20:42:40+00:00,Detento recapturado após fuga por túnel volta ...,Detento disse que passou nome falso ao dar ent...,Ele tinha registro em presídio com nome falso....,2018-05-20 20:42:40+00:00,1547,0.04532,0.140788
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,http://g1.globo.com/ap/amapa/noticia/audios-mo...,2017-07-30 00:37:17+00:00,2017-07-30 00:48:42+00:00,Áudios mostram conversa entre bandidos durante...,Áudios mostram possível conversa entre bandido...,Revista realizada na sexta-feira (28) no Iapen...,2017-07-30 00:37:17+00:00,1842,0.025122,0.12261
3,5cc3bd27-80c7-457d-a807-2e8e7fddf031,http://g1.globo.com/ap/amapa/noticia/2020/11/0...,2020-11-06 12:54:00+00:00,2020-11-12 21:22:52+00:00,FOTOS: Apagão no Amapá,"Moradores da capital do Amapá, em Macapá, faze...",Incêndio em subestação de energia deixa 13 dos...,2020-11-06 12:54:00+00:00,646,0.274721,0.347249
4,d6956177-db96-42f5-9f68-dd0d6e930661,http://g1.globo.com/ap/amapa/noticia/2019/05/2...,2019-05-27 13:43:03+00:00,2019-05-27 18:19:06+00:00,Profissionais da educação no AP paralisam ativ...,Profissionais da educação paralisam atividades...,Ato comprometeu aulas em escolas nesta segunda...,2019-05-27 13:43:03+00:00,1175,0.095369,0.185832


In [15]:
# Store the day counts as pandas' nullable unsigned 32-bit integer dtype.
# Ages are measured back from the newest timestamp, so they are never negative.
df_news["age_in_days"] = df_news["age_in_days"].astype("UInt32")

In [16]:
# Bucket the normalized score into four recency bands. Edges are
# right-inclusive: (0, 0.25], (0.25, 0.5], (0.5, 0.8], (0.8, 1.1];
# include_lowest=True additionally admits a score of exactly 0.
age_bin_edges = [0, 0.25, 0.5, 0.8, 1.1]
age_bin_labels = ['very-old', 'old', 'mid', 'recent']
df_news["ageCategories"] = pd.cut(
    df_news['age_exp_normalized'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    include_lowest=True,
)

In [17]:
# Preview the first five rows with the new "ageCategories" column.
df_news.head(5)

Unnamed: 0,page,url,issued,modified,title,body,caption,data,age_in_days,age_exp,age_exp_normalized,ageCategories
0,7371a9b5-5824-4c57-8704-00a74feebe79,http://g1.globo.com/al/alagoas/noticia/2018/09...,2018-09-13 14:52:55+00:00,2018-09-14 16:14:49+00:00,Corpo de motorista da Uber é encontrado em can...,Corpo de motorista de aplicativo desaparecido ...,"Segundo a polícia, Antônio Vitor foi solicitad...",2018-09-13 14:52:55+00:00,1431,0.057154,0.151439,very-old
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,http://g1.globo.com/am/amazonas/noticia/detent...,2018-05-20 20:42:40+00:00,2018-05-20 20:42:40+00:00,Detento recapturado após fuga por túnel volta ...,Detento disse que passou nome falso ao dar ent...,Ele tinha registro em presídio com nome falso....,2018-05-20 20:42:40+00:00,1547,0.04532,0.140788,very-old
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,http://g1.globo.com/ap/amapa/noticia/audios-mo...,2017-07-30 00:37:17+00:00,2017-07-30 00:48:42+00:00,Áudios mostram conversa entre bandidos durante...,Áudios mostram possível conversa entre bandido...,Revista realizada na sexta-feira (28) no Iapen...,2017-07-30 00:37:17+00:00,1842,0.025122,0.12261,very-old
3,5cc3bd27-80c7-457d-a807-2e8e7fddf031,http://g1.globo.com/ap/amapa/noticia/2020/11/0...,2020-11-06 12:54:00+00:00,2020-11-12 21:22:52+00:00,FOTOS: Apagão no Amapá,"Moradores da capital do Amapá, em Macapá, faze...",Incêndio em subestação de energia deixa 13 dos...,2020-11-06 12:54:00+00:00,646,0.274721,0.347249,old
4,d6956177-db96-42f5-9f68-dd0d6e930661,http://g1.globo.com/ap/amapa/noticia/2019/05/2...,2019-05-27 13:43:03+00:00,2019-05-27 18:19:06+00:00,Profissionais da educação no AP paralisam ativ...,Profissionais da educação paralisam atividades...,Ato comprometeu aulas em escolas nesta segunda...,2019-05-27 13:43:03+00:00,1175,0.095369,0.185832,very-old


In [18]:
# Summary statistics of the numeric columns, to check the value ranges.
df_news.describe()

Unnamed: 0,age_in_days,age_exp,age_exp_normalized
count,255603.0,255603.0,255603.0
mean,559.549724,0.492212,0.542991
std,546.845411,0.329121,0.296209
min,0.0,0.002817,0.102536
25%,104.0,0.16235,0.246115
50%,358.0,0.488703,0.539833
75%,909.0,0.812207,0.830986
max,2936.0,1.0,1.0


In [19]:
# List column dtypes and non-null counts before trimming the frame.
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255603 entries, 0 to 255602
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   page                255603 non-null  string             
 1   url                 255603 non-null  string             
 2   issued              255603 non-null  string             
 3   modified            255603 non-null  string             
 4   title               255603 non-null  string             
 5   body                255603 non-null  string             
 6   caption             255603 non-null  string             
 7   data                255603 non-null  datetime64[ns, UTC]
 8   age_in_days         255603 non-null  UInt32             
 9   age_exp             255603 non-null  Float32            
 10  age_exp_normalized  255603 non-null  Float32            
 11  ageCategories       255603 non-null  category           
dtypes: Float32(2), U

In [20]:
# Drop the raw item fields and the intermediate age columns from the frame.
# Re-running this cell without reloading raises KeyError, because the columns
# are already gone.
unused_items_columns = [
    # raw fields loaded from the CSV
    "url", "issued", "modified", "title", "body", "caption",
    # intermediate columns derived in this notebook
    "data", "age_in_days", "age_exp",
]
df_news.drop(columns=unused_items_columns, inplace=True)

In [21]:
# Display the trimmed feature table.
# NOTE(review): this renders the whole frame; `df_news.head()` would be enough
# for a sanity check.
df_news

Unnamed: 0,page,age_exp_normalized,ageCategories
0,7371a9b5-5824-4c57-8704-00a74feebe79,0.151439,very-old
1,7a5ea08f-4583-49e2-ba52-a71999443f7b,0.140788,very-old
2,6afc8bbb-4f36-43d5-8a44-a2917df5621a,0.12261,very-old
3,5cc3bd27-80c7-457d-a807-2e8e7fddf031,0.347249,old
4,d6956177-db96-42f5-9f68-dd0d6e930661,0.185832,very-old
...,...,...,...
255598,943d6ee8-9daa-4a26-9e42-763339bedba6,0.841292,recent
255599,d21c1bfc-6a90-4e2d-8c4c-ff1daee1b4f2,0.21939,very-old
255600,abc5ecd9-81e1-40cf-b706-0b5fa7bea3be,0.145168,very-old
255601,a7cf134e-0bb0-4363-9224-4d68cf8c7a53,0.145897,very-old


### Save to CSV

In [22]:
# Persist the feature table to the path configured under "DF_ITEMS_FEATURE".
# With the default index=True the row index is written as an unnamed first
# column, which pd.read_csv later exposes as "Unnamed: 0".
# NOTE(review): consider index=False, but only after confirming that no
# downstream reader drops or relies on that column.
df_news.to_csv(config["DF_ITEMS_FEATURE"])