In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from app.utilities.utils import print_dataset_info, save_dataset

In [None]:
# Load the raw items file, reduce each domain id to its second dash-separated
# token, and give the two columns short working names.
data = (
    pd.read_csv("data/dataset_items.csv")
    .assign(
        ITE_ITEM_DOM_DOMAIN_ID=lambda frame: frame["ITE_ITEM_DOM_DOMAIN_ID"].map(
            lambda raw_id: raw_id.split("-")[1]
        )
    )
    .rename(columns={"ITE_ITEM_TITLE": "item", "ITE_ITEM_DOM_DOMAIN_ID": "domain"})
)
data.head(5)

In [None]:
# Summary statistics restricted to the non-numeric columns
# (exclude="number" removes numeric dtypes from the report).
data.describe(exclude="number")

In [None]:
# Inspect every row that carries one specific title, to see how a repeated
# title shows up in the dataset.
duplicated_title = "Kit Make Bloquinho De Carnaval Jequiti 1 Un"
duplicates = data.loc[data["item"] == duplicated_title]
duplicates

In [None]:
# Remove repeated titles. keep=False discards *every* row whose "item" value
# occurs more than once (no representative copy is kept); inplace=True mutates
# `data` directly, so later cells see the de-duplicated frame.
data.drop_duplicates(subset=["item"], keep=False, inplace=True)

# Outlier

In [None]:
# Length of each title measured in whitespace-separated tokens.
longitudes = []
for title in data["item"]:
    longitudes.append(len(title.split()))

# Report max / mean / median / standard deviation of the token counts.
length_stats = (
    ("Longitud maxima:", np.max),
    ("Longitud media:", np.mean),
    ("Longitud mediana:", np.median),
    ("Desviación estándar:", np.std),
)
for stat_label, stat_fn in length_stats:
    print(stat_label, stat_fn(longitudes))

In [None]:
# Histogram of the per-title token counts computed in `longitudes`.
fig, ax = plt.subplots(figsize=(20, 12))

ax.hist(longitudes)
ax.set_xlabel("Longitud de la secuencia")
ax.set_ylabel("Frecuencia")
plt.show()

In [None]:
# Hold out a fixed-size random sample of rows per domain.
# - random_state makes the draw repeatable, so the hold-out set (and everything
#   derived from the remaining rows) is the same on every Restart & Run All.
# - The native GroupBy.sample keeps all columns, including "domain", and avoids
#   groupby.apply operating on the grouping column (deprecated in recent pandas).
# - Sampling is without replacement, so a domain with fewer rows than requested
#   still raises, as the original per-group `d.sample(50)` did.
samples_per_domain = 50
sample_seed = 42
sample = data.groupby(by="domain", group_keys=False).sample(
    n=samples_per_domain, random_state=sample_seed
)

In [None]:
# Number of sampled rows per domain.
sample["domain"].value_counts()

In [None]:
# Extra keyword arguments handed to save_dataset for every CSV written below.
# NOTE(review): presumably forwarded to the CSV writer so the index is not
# stored — confirm in app.utilities.utils.
csv_kwargs = dict(index=False)

# Persist the per-domain sample.
save_dataset(
    dataframe=sample,
    filepath="data/inference.csv",
    file_type="csv",
    **csv_kwargs,
)

In [None]:
# Remove the sampled rows from `data` so they are not part of the later split.
# Filtering by index membership (rather than an in-place drop) keeps this cell
# re-runnable: executing it a second time is a no-op instead of raising
# KeyError for labels that were already removed.
data = data.loc[~data.index.isin(sample.index)]

In [None]:
label_col = "domain"
split_seed = 42  # fixed seed so the three splits are identical on every run

# First split: 70% train / 30% held out.
# Second split: the held-out part is divided again 70/30 into test and
# validation (about 21% and 9% of the rows in `data`, respectively).
# NOTE(review): neither split is stratified by `label_col` — confirm intended.
train_data, holdout_data = train_test_split(
    data, test_size=0.3, random_state=split_seed
)
test_data, validation_data = train_test_split(
    holdout_data, test_size=0.3, random_state=split_seed
)

# Report the size and the label column summary of each split.
for split_name, split_frame in (
    ("entrenamiento", train_data),
    ("validacion", validation_data),
    ("test", test_data),
):
    print(f"El set de {split_name} tiene {len(split_frame)} observaciones.")
    print_dataset_info(data=split_frame, column=label_col)
    print("--" * 50)

In [None]:
# Write each split to its own CSV, in train → validation → test order,
# reusing the shared `csv_kwargs`.
split_outputs = {
    "data/train.csv": train_data,
    "data/validation.csv": validation_data,
    "data/test.csv": test_data,
}
for split_path, split_frame in split_outputs.items():
    save_dataset(dataframe=split_frame, filepath=split_path, file_type="csv", **csv_kwargs)