In [14]:
import pandas
import json
import h2o
import os
import random
from pathlib import Path
from h2o.automl import H2OAutoML
from naive_model import NaiveModel

In [15]:
def load_articles() -> list:
    """Load every article stored as JSON under the ``./data`` directory.

    Each file is expected to contain a JSON array of article records; the
    arrays are concatenated into one flat list. Files are visited in sorted
    name order so that the resulting row order does not depend on the
    platform-specific ordering of ``os.listdir``.

    Loading is best-effort: a file that cannot be opened, is not valid JSON,
    or does not hold a JSON array is reported on stdout and skipped.

    Returns:
        list: All article records from the files that loaded successfully.

    Raises:
        OSError: If the ``./data`` directory itself cannot be listed.
    """
    articles = []
    for file_path in sorted(os.listdir("./data")):
        try:
            # The context manager guarantees the handle is closed even when
            # parsing fails; JSON text is read explicitly as UTF-8 instead of
            # relying on the locale's default encoding.
            with open(Path("data", file_path), encoding="utf-8") as data_page:
                articles_by_page = json.load(data_page)
            # Only JSON arrays can be appended to the article list; any other
            # top-level value is treated as a file that failed to load.
            if not isinstance(articles_by_page, list):
                raise TypeError("top-level JSON value is not a list")
            articles.extend(articles_by_page)
        except (OSError, ValueError, TypeError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"No se pudo cargar el arcivo: {file_path}")

    return articles

In [16]:
def preprocess_text(text: str):
    """Placeholder for text preprocessing.

    The ``text`` argument is currently ignored; a new empty list is returned
    on every call.
    """
    return list()

# LOAD DATASET

In [17]:
# Build the working DataFrame from every article record returned by load_articles().
dataframe = pandas.DataFrame(data=load_articles())

# ADD NEW COLUMNS TO DATAFRAME 

In [18]:
# Parse the raw 'date' column; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
dataframe['date'] = pandas.to_datetime(arg=dataframe['date'], errors="coerce")
# Calendar date without the time component.
dataframe['only_date'] = dataframe['date'].dt.date
# NOTE: despite its name, 'only_weekday' stores the ISO week number of the
# year (the 'week' field of isocalendar()), not the day of the week.
dataframe['only_weekday'] = dataframe['date'].dt.isocalendar()['week']

In [19]:
# Number of articles for each value of 'only_weekday'.
dataframe.groupby('only_weekday')['only_weekday'].count()

only_weekday
31    21
32    22
33    69
34    64
Name: only_weekday, dtype: int64

Se decide tomar N registros al azar (N = 5 en el código siguiente) por cada semana de artículos.

In [20]:
def get_random_articles_by_weekday(df: pandas.DataFrame, n: int = 5, seed=None):
    """Randomly sample up to ``n`` rows for each distinct ``only_weekday`` value.

    Note that in this notebook the ``only_weekday`` column holds the ISO week
    number, so the sample is effectively taken per week.

    Args:
        df: Frame containing an ``only_weekday`` column.
        n: Maximum number of rows to draw per group. Groups with fewer than
            ``n`` rows contribute all of their rows instead of raising.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` the global ``random`` module state is
            used, so a prior ``random.seed(...)`` call is still honoured.

    Returns:
        pandas.DataFrame: The sampled rows with their original index labels
        and the same columns as ``df``. Rows whose ``only_weekday`` is missing
        are never sampled.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    rng = random if seed is None else random.Random(seed)
    week_column = df['only_weekday']

    sampled_indexes = []
    # dropna(): a missing week (e.g. from an unparseable date) matches no rows
    # and would otherwise lead to sampling from an empty population.
    for week in week_column.dropna().unique().tolist():
        in_week = (week_column == week).fillna(False).to_numpy(dtype=bool)
        week_indexes = df.index[in_week].tolist()
        # Cap the sample size so small groups do not make random.sample raise.
        sample_size = min(n, len(week_indexes))
        sampled_indexes.extend(rng.sample(week_indexes, sample_size))

    # Select all sampled rows at once instead of growing a frame with concat
    # on every iteration.
    return df.loc[sampled_indexes]

In [21]:
def index_for_train(test_indexes, all_indexes):
    """Return the labels of ``all_indexes`` that are not in ``test_indexes``.

    Args:
        test_indexes: Iterable of hashable index labels reserved for testing.
        all_indexes: Iterable of all index labels.

    Returns:
        list: Labels from ``all_indexes`` that are absent from
        ``test_indexes``, in their original order.
    """
    # Materialise the excluded labels once: membership checks become O(1) and
    # a one-shot iterable is not consumed by the first lookup.
    excluded = set(test_indexes)
    return [train_index for train_index in all_indexes if train_index not in excluded]

In [22]:
# Seed the global RNG so the random hold-out selection below, and every
# result derived from it, is the same on each Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Hold-out (test) set: a random sample of articles from each week.
dataframe_article = get_random_articles_by_weekday(dataframe)

dataframe_article_index_list = dataframe_article.index.to_list()
dataframe_index_list = dataframe.index.to_list()

# Training set: every article that was not sampled into the hold-out set.
index_train = index_for_train(dataframe_article_index_list, dataframe_index_list)

dataframe_article_train = dataframe.filter(items=index_train, axis=0)

In [23]:
# Training split: 'description' is used as the input (X) and 'category' as the label (y).
train_X, train_y = dataframe_article_train['description'], dataframe_article_train['category']

# Show both shapes, one per line.
print(train_X.shape, train_y.shape, sep="\n")

(156,)
(156,)


In [24]:
# Hold-out split: 'description' is used as the input (X) and 'category' as the label (y).
test_X, test_y = dataframe_article['description'], dataframe_article['category']

# Show both shapes, one per line.
print(test_X.shape, test_y.shape, sep="\n")

(20,)
(20,)


In [27]:
# Build the project's NaiveModel from the training frame and the hold-out frame.
naive_model = NaiveModel(dataframe_article_train, dataframe_article)
# NOTE(review): prepare_train() presumably prepares/fits the model on the
# training data — confirm against naive_model.py.
naive_model.prepare_train()
# Keep a reference to whatever object get_model() returns.
naive_bayes_classifier = naive_model.get_model()
# Bare last expression so the notebook displays the returned object.
naive_bayes_classifier


train time: 0.002s
test time:  0.009s
accuracy:   0.900
              precision    recall  f1-score   support

    Positive       0.93      0.93      0.93        14
    Negative       0.83      0.83      0.83         6

    accuracy                           0.90        20
   macro avg       0.88      0.88      0.88        20
weighted avg       0.90      0.90      0.90        20

confusion matrix:
[[13  1]
 [ 1  5]]
------------------------------


In [None]:
# example source https://iq.opengenus.org/naive-bayes-on-tf-idf-vectorized-matrix/