**MANIPULACIÓN Y LIMPIEZA DE DATOS FICTICIOS**

In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Colab-only: open the browser upload widget; the chosen file is saved into
# the runtime's working directory under its original name.
from google.colab import files
uploaded = files.upload()

# Load the dataset (the uploaded file must be named exactly like this)
df = pd.read_csv("data_emociones_faker.csv")

# Preview the first rows
print(df.head())

# Column data types
print(df.dtypes)

# General summary. DataFrame.info() writes its report to stdout itself and
# returns None, so it must not be wrapped in print() (that would emit a
# stray "None" line after the report).
df.info()

# Count of missing values per column
print(df.isnull().sum())

# Number of fully duplicated rows
print(df.duplicated().sum())

# Descriptive statistics for the numeric columns
print(df.describe())

Saving data_emociones_faker.csv to data_emociones_faker.csv
                                user_id            timestamp  \
0  1433c352-f2d7-45d3-bb9a-db6f2b10507a  2025-10-03 18:43:07   
1  b2d701a8-818e-4801-b2a3-4d3b1b640014  2024-11-12 08:55:02   
2  70952dab-222d-4a7d-91de-c5c761423796  2025-04-19 12:54:04   
3  ff0656ec-d60c-4369-9b30-dbd595aa8c24  2025-08-15 08:53:57   
4  6fc6ff65-102b-466a-8676-4fc23474c5a4  2026-01-11 17:59:58   

                                                text      emotion  age  \
0                              Class ahead key meet.        feliz   33   
1  Might exactly seat art book choice trouble sho...        feliz   35   
2    If along argue food shoulder report technology.  sorprendido   46   
3  And authority concern usually religious claim ...  sorprendido   55   
4  Thank day get ago drug draw month especially R...        feliz   24   

       gender region  
0  no binario  Oeste  
1    femenino    Sur  
2   masculino    Sur  
3  no binario  Nor

In [4]:
# Drop exact duplicate rows
df = df.drop_duplicates()

# Convert the timestamp column to datetime
df["timestamp"] = pd.to_datetime(df["timestamp"])

# Keep only rows with a plausible age (optional sanity filter).
# Rows removed above and here leave gaps in the index; reset it so the frame
# has a clean 0..n-1 index again. Later steps that build new frames from
# NumPy arrays (which get a default 0..n-1 index) then line up row-for-row
# instead of being misaligned on index labels.
df = df[(df['age'] >= 0) & (df['age'] <= 120)].reset_index(drop=True)

In [6]:
# Categorical columns to encode
categorical_cols = ["gender", "region"]

# One-hot encoding (drops the first category of each column to avoid
# multicollinearity)
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_cats = encoder.fit_transform(df[categorical_cols])

# Wrap the encoded array in a DataFrame. The array rows are in the same order
# as df's rows, so reuse df's index: pd.concat(axis=1) aligns on index labels,
# and a default 0..n-1 index would not match df once rows have been dropped
# (duplicates / age filter), yielding NaN-filled and spurious extra rows.
encoded_df = pd.DataFrame(
    encoded_cats,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Replace the original categorical columns with their encoded counterparts
df = pd.concat([df.drop(categorical_cols, axis=1), encoded_df], axis=1)

In [7]:
# Text-cleaning helper
def clean_text(text):
    """Normalize a piece of free text.

    Lowercases the input, removes every character that is not a lowercase
    letter (a-z plus á é í ó ú ñ ü) or whitespace, collapses runs of
    whitespace into a single space and strips both ends.

    Args:
        text: The raw text. Non-string values (e.g. None or NaN from a
            missing cell) are treated as empty text.

    Returns:
        The cleaned string; "" for non-string input.
    """
    if not isinstance(text, str):
        # Missing cells arrive as None/NaN, which have no .lower()
        return ""
    text = text.lower()                             # lowercase
    text = re.sub(r'[^a-záéíóúñü\s]', '', text)    # drop punctuation, digits and special characters
    text = re.sub(r'\s+', ' ', text).strip()       # collapse extra whitespace
    return text

# Apply the cleaning. These statements must live at cell level: indented under
# the function they sit after `return` and never run. The source is the raw
# "text" column; "text_clean" is the column being created.
df["text_clean"] = df["text"].apply(clean_text)

# Compare raw vs. cleaned text
print(df[["text", "text_clean"]].head())

In [8]:
# Standardize the numeric features
num_cols = ['age']
scaler = StandardScaler()
# Fit the scaler on the numeric columns, then overwrite those columns with
# their scaled values.
df[num_cols] = scaler.fit(df[num_cols]).transform(df[num_cols])

In [11]:
# Write the processed frame to a CSV file on the Colab runtime, without the
# index column
df.to_csv(path_or_buf='data_clean_emotions.csv', index=False)

# Send the saved file to the local machine through the browser (Colab-only)
from google.colab import files
files.download('data_clean_emotions.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>