In [0]:
# ============================================
# 02 SILVER CLEANING - PREPARAR DATOS PARA EMBEDDINGS
# ============================================
# Proyecto: Startup Death Oracle
# Autor: Sergio Rincon
# Fecha: 2025-12-02
# ============================================

import pandas as pd

# Ruta de datos
silver_path = "/data/silver/"

# Cargar datasets
df_failures = pd.read_csv(silver_path + "failures_clean.csv")
df_success = pd.read_csv(silver_path + "startups_data_clean.csv")

print("DATOS CARGADOS:")
print("=" * 50)
print(f"Failures: {len(df_failures)} registros")
print(f"Success:  {len(df_success)} registros")

DATOS CARGADOS:
Failures: 409 registros
Success:  923 registros


In [0]:
# Combinar campos de texto en una sola columna
df_failures['text_combined'] = (
    "Company: " + df_failures['Name']. fillna('') + ".  " +
    "Sector: " + df_failures['Sector'].fillna('') + ". " +
    "What they did: " + df_failures['What They Did'].fillna('') + ". " +
    "Why they failed: " + df_failures['Why They Failed'].fillna('') + ". " +
    "Lesson: " + df_failures['Takeaway'].fillna('')
)

print("TEXTO COMBINADO - EJEMPLO:")
print("=" * 50)
print(df_failures['text_combined']. iloc[0])
print("\n" + "=" * 50)
print(f"Longitud promedio: {df_failures['text_combined'].str.len().mean():.0f} caracteres")

TEXTO COMBINADO - EJEMPLO:
Company: Aira Health.  Sector: Health Care. What they did: Personalized asthma/allergy app. Why they failed: Small user base and cash shortage. Lesson: Niche apps need big audiences

Longitud promedio: 168 caracteres


Limpiar y normalizar texto

In [0]:
import re

def limpiar_texto(texto):
    if pd.isna(texto):
        return ""
    # Convertir a minusculas
    texto = texto.lower()
    # Eliminar caracteres especiales (mantener letras, numeros, espacios)
    texto = re.sub(r'[^a-z0-9\s\.\,]', '', texto)
    # Eliminar espacios multiples
    texto = re.sub(r'\s+', ' ', texto)
    return texto.strip()

# Aplicar limpieza
df_failures['text_clean'] = df_failures['text_combined']. apply(limpiar_texto)

print("TEXTO LIMPIO - EJEMPLO:")
print("=" * 50)
print(df_failures['text_clean'].iloc[0])
print("\n" + "=" * 50)
print("ANTES vs DESPUES:")
print(f"Original:  {len(df_failures['text_combined'].iloc[0])} caracteres")
print(f"Limpio:    {len(df_failures['text_clean'].iloc[0])} caracteres")

TEXTO LIMPIO - EJEMPLO:
company aira health. sector health care. what they did personalized asthmaallergy app. why they failed small user base and cash shortage. lesson niche apps need big audiences

ANTES vs DESPUES:
Original:  181 caracteres
Limpio:    174 caracteres


Ahora agregamos causas de muerte como texto

In [0]:
# Columnas de causas (flags 0/1)
causa_cols = [
    'Giants', 'No Budget', 'Competition', 'Poor Market Fit',
    'Acquisition Stagnation', 'Platform Dependency', 'Monetization Failure',
    'Niche Limits', 'Execution Flaws', 'Trend Shifts',
    'Toxicity/Trust Issues', 'Regulatory Pressure', 'Overhype',
    'High Operational Costs'
]

# Funcion para extraer causas activas
def extraer_causas(row):
    causas = []
    for col in causa_cols:
        if col in row.index and row[col] == 1:
            causas.append(col. lower(). replace(' ', '_'))
    if causas:
        return "causes: " + ", ".join(causas)
    return ""

# Aplicar
df_failures['causes_text'] = df_failures. apply(extraer_causas, axis=1)

# Combinar texto limpio + causas
df_failures['text_final'] = df_failures['text_clean'] + ". " + df_failures['causes_text']

print("TEXTO FINAL - EJEMPLO:")
print("=" * 50)
print(df_failures['text_final']. iloc[0])
print("\n" + "=" * 50)
print(f"Registros con causas: {(df_failures['causes_text'] != '').sum()}")

TEXTO FINAL - EJEMPLO:
company aira health. sector health care. what they did personalized asthmaallergy app. why they failed small user base and cash shortage. lesson niche apps need big audiences. causes: no_budget, competition, poor_market_fit, monetization_failure, niche_limits

Registros con causas: 409


In [0]:
gold_path = "/data/gold/"

# Seleccionar columnas relevantes para embeddings
df_embeddings = df_failures[[
    'Name', 
    'Sector', 
    'Years of Operation',
    'How Much They Raised',
    'text_final'
]].copy()

# Renombrar columnas
df_embeddings. columns = ['name', 'sector', 'years', 'funding', 'text']

# Guardar
df_embeddings. to_csv(gold_path + "failures_for_embeddings.csv", index=False)

print("GUARDADO EN GOLD LAYER:")
print("=" * 50)
print(f"Archivo: failures_for_embeddings.csv")
print(f"Registros: {len(df_embeddings)}")
print(f"Columnas: {list(df_embeddings.columns)}")
print("\n" + "=" * 50)
print("PREVIEW:")
df_embeddings.head(3)

GUARDADO EN GOLD LAYER:
Archivo: failures_for_embeddings.csv
Registros: 409
Columnas: ['name', 'sector', 'years', 'funding', 'text']

PREVIEW:


Unnamed: 0,name,sector,years,funding,text
0,Aira Health,Health Care,2015-2019,$12M,company aira health. sector health care. what ...
1,Amino,Health Care,2013-2021,$45M,company amino. sector health care. what they d...
2,Arivale,Health Care,2015-2019,$50M,company arivale. sector health care. what they...


# 02_Silver_Cleaning

## Proceso realizado

1. **Texto combinado**: Unificamos Name, Sector, What They Did, Why They Failed y Takeaway en un solo campo
2. **Limpieza**: Minusculas, eliminacion de caracteres especiales, espacios multiples
3. **Enriquecimiento**: Convertimos los flags de causas de muerte (columnas con 0/1) a texto legible

## Flags de causas de muerte

El CSV original tiene columnas como `Giants`, `No Budget`, `Competition`, etc. con valores 0 o 1.

| Name        | No Budget | Competition | Poor Market Fit |
|-------------|-----------|-------------|-----------------|
| Aira Health | 1         | 1           | 1               |

**Lo convertimos a texto para que el modelo de embeddings lo entienda:**

`"causes: no_budget, competition, poor_market_fit"`

## Ejemplo de texto final

company aira health. sector health care. what they did personalized asthmaallergy app. why they failed small user base and cash shortage. lesson niche apps need big audiences.
causes: no_budget, competition, poor_market_fit, monetization_failure, niche_limits

Code

## Output
`/data/gold/failures_for_embeddings.csv` - 409 registros listos para embeddings