In [2]:
# Environment check: report the installed library versions and confirm that
# the scikit-learn model-selection helpers import cleanly.
import sklearn
import matplotlib
import numpy as np

print(f"sklearn: {sklearn.__version__}")
print(f"numpy: {np.__version__}")

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

print("✅ OK imports model_selection")


sklearn: 1.7.1
numpy: 1.26.4
✅ OK imports model_selection


In [3]:
import pandas as pd

# Read with pandas' default "," separator. The printed shape has a single
# column, which shows that this file is not comma-delimited.
df = pd.read_csv("telecom_churn_dataset.csv")
print(df.shape)
df.head()

(1500, 1)


Unnamed: 0,customer_id;signup_date;last_interaction_date;tenure_months;monthly_charge;total_charges;contract_type;payment_method;internet_service;support_tickets_30d;num_services;promo_applied;region;device_type;avg_download_mbps;downtime_hrs_30d;late_payments_12m;nps_text;churn
0,CUST01117;5/07/2019;;75;74.1875996;5541.11;Mes...
1,CUST01369;6/07/2022;6/09/2024;38;47.76425832;1...
2,CUST00423;18/04/2020;16/01/2025;63;64.81747361...
3,CUST00414;1/01/2021;7/01/2024;57;59.82235691;3...
4,CUST00452;13/12/2020;18/01/2022;57;45.99929277...


In [4]:
import pandas as pd

# The file is semicolon-delimited, so pass the separator explicitly.
df = pd.read_csv("telecom_churn_dataset.csv", sep=";")

print(df.shape)
df.head()


(1500, 19)


Unnamed: 0,customer_id,signup_date,last_interaction_date,tenure_months,monthly_charge,total_charges,contract_type,payment_method,internet_service,support_tickets_30d,num_services,promo_applied,region,device_type,avg_download_mbps,downtime_hrs_30d,late_payments_12m,nps_text,churn
0,CUST01117,5/07/2019,,75,74.1876,5541.11,Mes a mes,Efectivo,Cable,0,3,No,Centro,Router,148.49254,2.263775,0,Muy satisfecho,0
1,CUST01369,6/07/2022,6/09/2024,38,47.764258,1794.29,Mes a mes,Efectivo,Cable,0,2,No,Centro,Router,185.686865,1.737182,1,Podría mejorar,0
2,CUST00423,18/04/2020,16/01/2025,63,64.817474,4175.53,Mes a mes,Débito,Cable,1,5,No,Centro,ONT,127.55469,0.352482,1,Cobro incorrecto,0
3,CUST00414,1/01/2021,7/01/2024,57,59.822357,3469.83,2 años,Tarjeta,Fibra,1,2,No,Lima Metropolitana,Modem,269.204635,2.523062,0,Todo bien,0
4,CUST00452,13/12/2020,18/01/2022,57,45.999293,2621.87,Mes a mes,Efectivo,DSL,1,2,Sí,Lima Metropolitana,Router,42.82315,3.089018,2,Rápido y estable,1


In [5]:
import pandas as pd
import numpy as np

# `df` is expected to be in memory already; it is not re-read here.
# Parse the date columns as day-first (dd/mm/yyyy); values that cannot be
# parsed are coerced to NaT.
for col in ("signup_date", "last_interaction_date"):
    if col not in df.columns:
        continue
    df[col] = pd.to_datetime(df[col], dayfirst=True, errors="coerce")

# Names of the numeric columns, used for the descriptive statistics below.
num_cols = list(df.select_dtypes(include=[np.number]).columns)

print(f"Shape: {df.shape}")
print(f"\nDTypes:\n {df.dtypes}")
print(f"\nNulos por columna:\n {df.isna().sum()}")
print(f"\nEstadísticos numéricos:\n {df[num_cols].describe().T.head(10)}")


Shape: (1500, 19)

DTypes:
 customer_id                      object
signup_date              datetime64[ns]
last_interaction_date    datetime64[ns]
tenure_months                     int64
monthly_charge                  float64
total_charges                   float64
contract_type                    object
payment_method                   object
internet_service                 object
support_tickets_30d               int64
num_services                      int64
promo_applied                    object
region                           object
device_type                      object
avg_download_mbps               float64
downtime_hrs_30d                float64
late_payments_12m                 int64
nps_text                         object
churn                             int64
dtype: object

Nulos por columna:
 customer_id               0
signup_date               0
last_interaction_date    40
tenure_months             0
monthly_charge           50
total_charges             0
contract_

In [19]:
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

# Named function (instead of a lambda) used to pick the text column
def select_text_col(X: pd.DataFrame) -> pd.Series:
    """Return the ``nps_text`` column of ``X`` with missing values replaced by ``""``."""
    text_column = X["nps_text"]
    return text_column.fillna("")

# Wrap the column selector in a transformer so it can be used as a pipeline step
selector = FunctionTransformer(select_text_col, validate=False)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Text branch: pick the text column via `selector`, then build TF-IDF features
# from unigrams and bigrams, capped at 300 terms.
text_pipeline = Pipeline([
    ("selector", selector),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])


In [22]:
# === 1) Reconstruir el ColumnTransformer sin lambdas y reentrenar ===
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib, re

# --- named function instead of a lambda ---
def select_text_col(X: pd.DataFrame) -> pd.Series:
    """Return the ``nps_text`` column of ``X`` with missing values replaced by ``""``."""
    text_column = X["nps_text"]
    return text_column.fillna("")

# Per-type preprocessing pipelines
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

text_pipeline = Pipeline([
    ("selector", FunctionTransformer(select_text_col, validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

# Rebuilt ColumnTransformer (no lambdas). The feature lists, X_train and
# y_train must already exist from earlier cells.
preprocess_fixed = ColumnTransformer(
    transformers=[
        ("num",  num_pipeline, numeric_features),
        ("cat",  cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# Reuse the previously selected winner when an earlier cell defined
# `best_name`; otherwise fall back to logistic regression. The optional
# variable is looked up explicitly instead of relying on a swallowed NameError.
if globals().get("best_name") == "RandomForest":
    clf = RandomForestClassifier(
        n_estimators=300, random_state=42, class_weight="balanced_subsample", n_jobs=-1
    )
else:
    clf = LogisticRegression(max_iter=1000)

# New pipeline, then TRAIN
best_pipe = Pipeline([("preprocess", preprocess_fixed), ("clf", clf)])
best_pipe.fit(X_train, y_train)

# --- Sanity check: no lambda must remain in the pipeline ---
# The fitted text selector is inspected directly because the estimator repr is
# abbreviated for long pipelines, so a substring search on it can miss a lambda.
text_selector = (
    best_pipe.named_steps["preprocess"]
    .named_transformers_["text"]
    .named_steps["selector"]
)
print("¿Queda lambda?:", getattr(text_selector.func, "__name__", "") == "<lambda>")

# === 2) Save with joblib ===
joblib.dump(best_pipe, "model.joblib")
print("✅ Modelo guardado correctamente en model.joblib")


¿Queda lambda?: False
✅ Modelo guardado correctamente en model.joblib


In [27]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from datetime import date

# -------------------------------------------------
# 1) Functions required to load the model
#    (the pickled pipeline references this function by name)
# -------------------------------------------------
def select_text_col(X: pd.DataFrame) -> pd.Series:
    """Return the ``nps_text`` column of ``X`` with missing values replaced by ``""``.

    Must keep the same name as the function used when the pipeline was trained.
    """
    text_column = X["nps_text"]
    return text_column.fillna("")

# -------------------------------------------------
# 2) Load the trained pipeline
# -------------------------------------------------
@st.cache_resource
def load_model(path="model.joblib"):
    """Load the serialized pipeline stored at ``path`` using joblib.

    ``select_text_col`` is defined above so that the pipeline's reference to
    it can be resolved while deserializing.

    NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    code; only load model files you trust.
    """
    model = joblib.load(path)
    return model

# -------------------------------------------------
# 3) Streamlit UI
#    Mirrors the FEATURES used for training
#    (numeric, categorical and text)
# -------------------------------------------------
# Page configuration is issued before any other Streamlit call (including the
# cached model load below), as the Streamlit documentation requires.
st.set_page_config(page_title="Churn Telco", page_icon="📶", layout="centered")

# Load the pipeline; show a readable error instead of a traceback when the
# artifact is missing.
try:
    pipe = load_model()
except FileNotFoundError:
    st.error("No existe model.joblib")
    st.stop()

st.title("📶 Predicción de Churn (Telco)")

st.caption("Modelo cargado desde **model.joblib**")

with st.form("form_inputs"):
    st.subheader("Datos del cliente")

    # Dates used to compute the 'days_since_*' features
    signup_date = st.date_input("signup_date", value=date(2023, 1, 15))
    last_interaction_date = st.date_input("last_interaction_date", value=date.today())

    # Numeric inputs
    tenure_months = st.number_input("tenure_months", min_value=0, max_value=240, value=24, step=1)
    monthly_charge = st.number_input("monthly_charge", min_value=0.0, value=65.0, step=0.1)
    total_charges = st.number_input("total_charges", min_value=0.0, value=1560.0, step=0.1)
    support_tickets_30d = st.number_input("support_tickets_30d", min_value=0, max_value=30, value=1, step=1)
    num_services = st.number_input("num_services", min_value=1, max_value=5, value=3, step=1)
    avg_download_mbps = st.number_input("avg_download_mbps", min_value=1.0, value=180.0, step=1.0)
    downtime_hrs_30d = st.number_input("downtime_hrs_30d", min_value=0.0, value=1.5, step=0.1)
    late_payments_12m = st.number_input("late_payments_12m", min_value=0, max_value=24, value=0, step=1)

    # Categorical inputs (example values; use the categories present in your dataset)
    contract_type = st.selectbox("contract_type", ["Mes a mes", "1 año", "2 años"], index=0)
    payment_method = st.selectbox("payment_method", ["Tarjeta", "Débito", "Efectivo", "Billetera", "Cheque"], index=0)
    internet_service = st.selectbox("internet_service", ["Fibra", "Cable", "DSL", "Satélite"], index=0)
    promo_applied = st.selectbox("promo_applied", ["Sí", "No"], index=1)
    region = st.selectbox("region", ["Norte", "Centro", "Sur", "Oriente", "Lima Metropolitana"], index=4)
    device_type = st.selectbox("device_type", ["Modem", "Router", "Combo", "ONT", "Otro"], index=1)

    # Free text
    nps_text = st.text_input("nps_text (comentario breve)", "Todo bien")

    # Decision threshold
    threshold = st.slider("Umbral de decisión (churn si prob ≥ umbral)",
                          0.05, 0.95, 0.50, 0.01)

    submitted = st.form_submit_button("Predecir")

# -------------------------------------------------
# 4) Feature construction, mirroring training
#    (includes the derived days_since_signup / days_since_last_interaction)
# -------------------------------------------------
def build_features() -> pd.DataFrame:
    """Build the single-row feature frame from the current form values.

    Reads the widget values from module-level variables and derives the two
    ``days_since_*`` columns relative to today's date.

    NOTE(review): the notebook's training cell also used "today" at training
    time as the reference date, so these two features shift as time passes
    after training; confirm this is acceptable.
    """
    today = pd.Timestamp.today().normalize()

    def _days_since(form_date):
        # Whole days between today and the given date; None when it is not a valid date.
        parsed = pd.to_datetime(str(form_date))
        return (today - parsed).days if pd.notna(parsed) else None

    numeric_part = {
        "tenure_months": tenure_months,
        "monthly_charge": monthly_charge,
        "total_charges": total_charges,
        "support_tickets_30d": support_tickets_30d,
        "num_services": num_services,
        "avg_download_mbps": avg_download_mbps,
        "downtime_hrs_30d": downtime_hrs_30d,
        "late_payments_12m": late_payments_12m,
    }
    date_part = {
        "days_since_signup": _days_since(signup_date),
        "days_since_last_interaction": _days_since(last_interaction_date),
    }
    categorical_part = {
        "contract_type": contract_type,
        "payment_method": payment_method,
        "internet_service": internet_service,
        "promo_applied": promo_applied,
        "region": region,
        "device_type": device_type,
    }
    # Key order (numeric, derived, categorical, text) determines the column order.
    row = {**numeric_part, **date_part, **categorical_part, "nps_text": nps_text}
    return pd.DataFrame([row])

# -------------------------------------------------
# 5) Prediction
# -------------------------------------------------
if submitted:
    features = build_features()
    # Probability of the positive (churn) class for the single submitted row.
    churn_scores = pipe.predict_proba(features)[:, 1]
    proba = float(churn_scores[0])
    is_churn = proba >= threshold
    label = "Churn" if is_churn else "No churn"

    st.markdown("### Resultado")
    st.write(f"**Probabilidad de churn:** {proba:.3f}")
    st.write(f"**Predicción (umbral {threshold:.2f}):** {label}")

    st.markdown("### Recomendaciones")
    if is_churn:
        recs = [
            "- Ofrecer contrato de 1–2 años con beneficios (upgrade, descuento 3–6 meses).",
            "- Reducir downtime y TMR de soporte; ticket preventivo si hubo caídas.",
            "- Ajustar tarifa/paquete (bundle) y promociones personalizadas.",
        ]
        # Extra suggestions driven by the submitted inputs.
        if late_payments_12m > 0:
            recs.append("- Recordatorios y fraccionamiento; incentivar pago automático.")
        if avg_download_mbps < 80:
            recs.append("- Proponer upgrade de plan/tecnología.")
    else:
        recs = [
            "- Cliente estable: habilitar cross-sell suave (upgrade de velocidad).",
            "- Mantener NPS con comunicaciones proactivas y estabilidad del servicio.",
        ]

    for line in recs:
        st.write(line)

st.caption("© Telco Churn Demo – Streamlit")


Overwriting app.py


In [5]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from datetime import date

def select_text_col(X: pd.DataFrame) -> pd.Series:
    """Return the ``nps_text`` column of ``X`` with missing values replaced by ``""``."""
    text_column = X["nps_text"]
    return text_column.fillna("")

@st.cache_resource
def load_model(path="model.joblib"):
    """Load the serialized pipeline stored at ``path`` using joblib.

    NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    code; only load model files you trust.
    """
    model = joblib.load(path)
    return model

# Page configuration is issued before any other Streamlit call (including the
# cached model load below), as the Streamlit documentation requires.
st.set_page_config(page_title="Churn Telco", page_icon="📶", layout="centered")

# Load the pipeline; show a readable error instead of a traceback when the
# artifact is missing.
try:
    pipe = load_model()
except FileNotFoundError:
    st.error("No existe model.joblib")
    st.stop()

st.title("📶 Predicción de Churn (Telco)")

# Input form: the values are read later by build_features() and the prediction step.
with st.form("form_inputs"):
    signup_date = st.date_input("signup_date", value=date(2023, 1, 15))
    last_interaction_date = st.date_input("last_interaction_date", value=date.today())

    tenure_months = st.number_input("tenure_months", 0, 240, 24, 1)
    monthly_charge = st.number_input("monthly_charge", 0.0, step=0.1, value=65.0)
    total_charges = st.number_input("total_charges", 0.0, step=0.1, value=1560.0)
    support_tickets_30d = st.number_input("support_tickets_30d", 0, 30, 1, 1)
    num_services = st.number_input("num_services", 1, 5, 3, 1)
    avg_download_mbps = st.number_input("avg_download_mbps", 1.0, step=1.0, value=180.0)
    downtime_hrs_30d = st.number_input("downtime_hrs_30d", 0.0, step=0.1, value=1.5)
    late_payments_12m = st.number_input("late_payments_12m", 0, 24, 0, 1)

    contract_type = st.selectbox("contract_type", ["Mes a mes","1 año","2 años"])
    payment_method = st.selectbox("payment_method", ["Tarjeta","Débito","Efectivo","Billetera","Cheque"])
    internet_service = st.selectbox("internet_service", ["Fibra","Cable","DSL","Satélite"])
    promo_applied = st.selectbox("promo_applied", ["Sí","No"], index=1)
    region = st.selectbox("region", ["Norte","Centro","Sur","Oriente","Lima Metropolitana"], index=4)
    device_type = st.selectbox("device_type", ["Modem","Router","Combo","ONT","Otro"], index=1)

    nps_text = st.text_input("nps_text (comentario breve)", "Todo bien")
    threshold = st.slider("Umbral de decisión (churn si prob ≥ umbral)", 0.05, 0.95, 0.50, 0.01)

    submitted = st.form_submit_button("Predecir")

def build_features() -> pd.DataFrame:
    """Build the single-row feature frame from the current form values.

    Reads the widget values from module-level variables and derives the two
    ``days_since_*`` columns relative to today's date. The reference date is
    computed once so both derived columns use the same "today".
    """
    REFDATE = pd.Timestamp.today().normalize()
    s = pd.to_datetime(str(signup_date))
    li = pd.to_datetime(str(last_interaction_date))
    return pd.DataFrame([{
        "tenure_months": tenure_months,
        "monthly_charge": monthly_charge,
        "total_charges": total_charges,
        "support_tickets_30d": support_tickets_30d,
        "num_services": num_services,
        "avg_download_mbps": avg_download_mbps,
        "downtime_hrs_30d": downtime_hrs_30d,
        "late_payments_12m": late_payments_12m,
        # Whole days elapsed between each date and the shared reference date.
        "days_since_signup": (REFDATE - s).days,
        "days_since_last_interaction": (REFDATE - li).days,
        "contract_type": contract_type,
        "payment_method": payment_method,
        "internet_service": internet_service,
        "promo_applied": promo_applied,
        "region": region,
        "device_type": device_type,
        "nps_text": nps_text
    }])

# Prediction and recommendations, shown only after the form is submitted.
if submitted:
    features = build_features()
    # Probability of the positive (churn) class for the single submitted row.
    proba = float(pipe.predict_proba(features)[:, 1][0])
    is_churn = proba >= threshold
    label = "Churn" if is_churn else "No churn"

    st.markdown("### Resultado")
    st.write(f"**Probabilidad de churn:** {proba:.3f}")
    st.write(f"**Predicción (umbral {threshold:.2f}):** {label}")

    st.markdown("### Recomendaciones")
    if is_churn:
        recs = [
            "- Ofrecer contrato 1–2 años con beneficios.",
            "- Reducir downtime y TMR de soporte; acción preventiva.",
            "- Ajustar tarifa/paquete y promociones personalizadas.",
        ]
        # Extra suggestions driven by the submitted inputs.
        if late_payments_12m > 0:
            recs.append("- Recordatorios / fraccionamiento; pago automático.")
        if avg_download_mbps < 80:
            recs.append("- Proponer upgrade de plan/tecnología.")
    else:
        recs = [
            "- Cliente estable: cross-sell suave (upgrade de velocidad).",
            "- Mantener NPS con comunicaciones proactivas.",
        ]
    for line in recs:
        st.write(line)

st.caption("© Telco Churn Demo – Streamlit")


Writing app.py


In [9]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from datetime import date

# -------------------------------------------------
# 1) Functions required to load the model
#    (the pickled pipeline references this function by name)
# -------------------------------------------------
def select_text_col(X: pd.DataFrame) -> pd.Series:
    """Return the ``nps_text`` column of ``X`` with missing values replaced by ``""``.

    Must keep the same name as the function used when the pipeline was trained.
    """
    text_column = X["nps_text"]
    return text_column.fillna("")

# -------------------------------------------------
# 2) Load the trained pipeline
# -------------------------------------------------
@st.cache_resource
def load_model(path="model.joblib"):
    """Load the serialized pipeline stored at ``path`` using joblib.

    ``select_text_col`` is defined above so that the pipeline's reference to
    it can be resolved while deserializing.

    NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    code; only load model files you trust.
    """
    model = joblib.load(path)
    return model

# -------------------------------------------------
# 3) Streamlit UI
#    Mirrors the FEATURES used for training
#    (numeric, categorical and text)
# -------------------------------------------------
# Page configuration is issued before any other Streamlit call (including the
# cached model load below), as the Streamlit documentation requires.
st.set_page_config(page_title="Churn Telco", page_icon="📶", layout="centered")

# Load the pipeline; show a readable error instead of a traceback when the
# artifact is missing.
try:
    pipe = load_model()
except FileNotFoundError:
    st.error("No existe model.joblib")
    st.stop()

st.title("📶 Predicción de Churn (Telco)")

st.caption("Modelo cargado desde **model.joblib**")

with st.form("form_inputs"):
    st.subheader("Datos del cliente")

    # Dates used to compute the 'days_since_*' features
    signup_date = st.date_input("signup_date", value=date(2023, 1, 15))
    last_interaction_date = st.date_input("last_interaction_date", value=date.today())

    # Numeric inputs
    tenure_months = st.number_input("tenure_months", min_value=0, max_value=240, value=24, step=1)
    monthly_charge = st.number_input("monthly_charge", min_value=0.0, value=65.0, step=0.1)
    total_charges = st.number_input("total_charges", min_value=0.0, value=1560.0, step=0.1)
    support_tickets_30d = st.number_input("support_tickets_30d", min_value=0, max_value=30, value=1, step=1)
    num_services = st.number_input("num_services", min_value=1, max_value=5, value=3, step=1)
    avg_download_mbps = st.number_input("avg_download_mbps", min_value=1.0, value=180.0, step=1.0)
    downtime_hrs_30d = st.number_input("downtime_hrs_30d", min_value=0.0, value=1.5, step=0.1)
    late_payments_12m = st.number_input("late_payments_12m", min_value=0, max_value=24, value=0, step=1)

    # Categorical inputs (example values; use the categories present in your dataset)
    contract_type = st.selectbox("contract_type", ["Mes a mes", "1 año", "2 años"], index=0)
    payment_method = st.selectbox("payment_method", ["Tarjeta", "Débito", "Efectivo", "Billetera", "Cheque"], index=0)
    internet_service = st.selectbox("internet_service", ["Fibra", "Cable", "DSL", "Satélite"], index=0)
    promo_applied = st.selectbox("promo_applied", ["Sí", "No"], index=1)
    region = st.selectbox("region", ["Norte", "Centro", "Sur", "Oriente", "Lima Metropolitana"], index=4)
    device_type = st.selectbox("device_type", ["Modem", "Router", "Combo", "ONT", "Otro"], index=1)

    # Free text
    nps_text = st.text_input("nps_text (comentario breve)", "Todo bien")

    # Decision threshold
    threshold = st.slider("Umbral de decisión (churn si prob ≥ umbral)",
                          0.05, 0.95, 0.50, 0.01)

    submitted = st.form_submit_button("Predecir")

# -------------------------------------------------
# 4) Feature construction, mirroring training
#    (includes the derived days_since_signup / days_since_last_interaction)
# -------------------------------------------------
def build_features() -> pd.DataFrame:
    """Build the single-row feature frame from the current form values.

    Reads the widget values from module-level variables and derives the two
    ``days_since_*`` columns relative to today's date.

    NOTE(review): the notebook's training cell also used "today" at training
    time as the reference date, so these two features shift as time passes
    after training; confirm this is acceptable.
    """
    today = pd.Timestamp.today().normalize()

    def _days_since(form_date):
        # Whole days between today and the given date; None when it is not a valid date.
        parsed = pd.to_datetime(str(form_date))
        return (today - parsed).days if pd.notna(parsed) else None

    numeric_part = {
        "tenure_months": tenure_months,
        "monthly_charge": monthly_charge,
        "total_charges": total_charges,
        "support_tickets_30d": support_tickets_30d,
        "num_services": num_services,
        "avg_download_mbps": avg_download_mbps,
        "downtime_hrs_30d": downtime_hrs_30d,
        "late_payments_12m": late_payments_12m,
    }
    date_part = {
        "days_since_signup": _days_since(signup_date),
        "days_since_last_interaction": _days_since(last_interaction_date),
    }
    categorical_part = {
        "contract_type": contract_type,
        "payment_method": payment_method,
        "internet_service": internet_service,
        "promo_applied": promo_applied,
        "region": region,
        "device_type": device_type,
    }
    # Key order (numeric, derived, categorical, text) determines the column order.
    row = {**numeric_part, **date_part, **categorical_part, "nps_text": nps_text}
    return pd.DataFrame([row])

# -------------------------------------------------
# 5) Prediction
# -------------------------------------------------
if submitted:
    features = build_features()
    # Probability of the positive (churn) class for the single submitted row.
    churn_scores = pipe.predict_proba(features)[:, 1]
    proba = float(churn_scores[0])
    is_churn = proba >= threshold
    label = "Churn" if is_churn else "No churn"

    st.markdown("### Resultado")
    st.write(f"**Probabilidad de churn:** {proba:.3f}")
    st.write(f"**Predicción (umbral {threshold:.2f}):** {label}")

    st.markdown("### Recomendaciones")
    if is_churn:
        recs = [
            "- Ofrecer contrato de 1–2 años con beneficios (upgrade, descuento 3–6 meses).",
            "- Reducir downtime y TMR de soporte; ticket preventivo si hubo caídas.",
            "- Ajustar tarifa/paquete (bundle) y promociones personalizadas.",
        ]
        # Extra suggestions driven by the submitted inputs.
        if late_payments_12m > 0:
            recs.append("- Recordatorios y fraccionamiento; incentivar pago automático.")
        if avg_download_mbps < 80:
            recs.append("- Proponer upgrade de plan/tecnología.")
    else:
        recs = [
            "- Cliente estable: habilitar cross-sell suave (upgrade de velocidad).",
            "- Mantener NPS con comunicaciones proactivas y estabilidad del servicio.",
        ]

    for line in recs:
        st.write(line)

st.caption("© Telco Churn Demo – Streamlit")


Overwriting app.py


In [13]:
# Colab-only helper: prompts for a file upload and keeps the result in `uploaded`.
from google.colab import files
uploaded = files.upload()


Saving telecom_churn_dataset.csv to telecom_churn_dataset.csv


In [14]:
# === (Re)entrenar modelo y guardar model.joblib — listo para la app ===
import os, pandas as pd, numpy as np, joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 0) Load the CSV (read_any_sep, defined below, tries to detect the separator)
csv_path = "telecom_churn_dataset.csv"  # change this if your file has a different name
assert os.path.exists(csv_path), f"No encuentro {csv_path}. Sube el CSV y vuelve a ejecutar."

def read_any_sep(path):
    """Read a CSV whose delimiter is not known in advance.

    First lets pandas sniff the separator (``sep=None`` with the Python
    engine). If sniffing raises, or the result has a single column (everything
    glued together), the file is read again with ``;`` as the separator.
    """
    try:
        sniffed = pd.read_csv(path, sep=None, engine="python")
    except Exception:
        sniffed = None

    needs_semicolon = sniffed is None or sniffed.shape[1] == 1
    if needs_semicolon:
        return pd.read_csv(path, sep=";")
    return sniffed

df = read_any_sep(csv_path)
print("Shape:", df.shape)
print("Columnas:", list(df.columns))

# 1) Date conversion + derived features
# Dates are parsed day-first; unparseable values become NaT.
for col in ["signup_date","last_interaction_date"]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce", dayfirst=True)
# NOTE(review): the reference date is "today" at training time, while app.py
# recomputes these features with its own "today" at prediction time, so the
# days_since_* inputs drift as the saved model ages — confirm this is
# acceptable or persist the reference date with the model.
REFDATE = pd.Timestamp.today().normalize()
df["days_since_signup"] = (REFDATE - df["signup_date"]).dt.days
df["days_since_last_interaction"] = (REFDATE - df["last_interaction_date"]).dt.days

# 2) Define target and features
assert "churn" in df.columns, "Tu CSV debe tener la columna target 'churn'."
y = df["churn"].astype(int)

# Column groups consumed by the ColumnTransformer below.
numeric_features = [
    "tenure_months","monthly_charge","total_charges","support_tickets_30d","num_services",
    "avg_download_mbps","downtime_hrs_30d","late_payments_12m",
    "days_since_signup","days_since_last_interaction"
]
categorical_features = [
    "contract_type","payment_method","internet_service","promo_applied","region","device_type"
]
text_feature = "nps_text"

# 3) Preprocessing pipeline (named function instead of an anonymous lambda)
def select_text_col(X: pd.DataFrame) -> pd.Series:
    """Return the ``text_feature`` column of ``X`` with missing values replaced by ``""``.

    The app defines a function with this same name and signature.
    """
    text_column = X[text_feature]
    return text_column.fillna("")

# Numeric branch: median imputation followed by robust scaling.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])
# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
# Text branch: select the text column, then TF-IDF over unigrams and bigrams
# capped at 300 terms.
text_pipeline = Pipeline([
    ("selector", FunctionTransformer(select_text_col, validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

# Route each column group to its branch; columns not listed are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num",  num_pipeline, numeric_features),
        ("cat",  cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# 4) Train/test split and model
X = df[numeric_features + categorical_features + [text_feature]].copy()
# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

clf = LogisticRegression(max_iter=1000)
pipe = Pipeline([("preprocess", preprocess), ("clf", clf)])
pipe.fit(X_train, y_train)

# 5) Quick metrics on the held-out split (0.50 decision threshold)
proba = pipe.predict_proba(X_test)[:,1]
pred = (proba >= 0.5).astype(int)
print("\nClassification report (umbral=0.50):\n", classification_report(y_test, pred, digits=3))
print("AUC test:", roc_auc_score(y_test, proba))

# 6) Save the artifact
joblib.dump(pipe, "model.joblib")
print("\n✅ Guardado: model.joblib")



Shape: (1500, 19)
Columnas: ['customer_id', 'signup_date', 'last_interaction_date', 'tenure_months', 'monthly_charge', 'total_charges', 'contract_type', 'payment_method', 'internet_service', 'support_tickets_30d', 'num_services', 'promo_applied', 'region', 'device_type', 'avg_download_mbps', 'downtime_hrs_30d', 'late_payments_12m', 'nps_text', 'churn']

Classification report (umbral=0.50):
               precision    recall  f1-score   support

           0      0.605     0.635     0.620       200
           1      0.558     0.526     0.541       175

    accuracy                          0.584       375
   macro avg      0.581     0.580     0.580       375
weighted avg      0.583     0.584     0.583       375

AUC test: 0.6202571428571428

✅ Guardado: model.joblib


In [15]:
# === Lanzar Streamlit con ngrok (URL directa) ===
!pip -q install streamlit pyngrok

import os, subprocess, time, socket
from pyngrok import ngrok

# The ngrok authtoken is a credential: take it from the NGROK_TOKEN environment
# variable, or prompt for it without echo, instead of storing it in the notebook.
# NOTE(review): an authtoken was previously committed in this cell; treat it as
# leaked and revoke it in the ngrok dashboard.
if not os.environ.get("NGROK_TOKEN"):
    from getpass import getpass
    os.environ["NGROK_TOKEN"] = getpass("NGROK_TOKEN: ")
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

# Close tunnels left over from a previous run (best effort: a failure is
# reported but does not abort the launch).
try:
    for t in ngrok.get_tunnels():
        ngrok.disconnect(t.public_url)
except Exception as exc:
    print("⚠️ No se pudieron cerrar túneles previos:", exc)
# Stop any Streamlit process still running from a previous launch.
subprocess.run(["pkill","-f","streamlit run app.py"], check=False)

# Sanity checks: both files must exist before launching
assert os.path.exists("app.py"), "No existe app.py"
assert os.path.exists("model.joblib"), "No existe model.joblib"

# Launch Streamlit in the background. The child process keeps its own handle
# to the log file, so the notebook's handle is closed as soon as it has started.
with open("/tmp/streamlit.log","w") as log:
    proc = subprocess.Popen(
        ["streamlit","run","app.py","--server.port","8501","--server.headless","true"],
        stdout=log, stderr=subprocess.STDOUT, text=True
    )

# Wait for the port to accept connections
def wait_port(port=8501, timeout=60, host="127.0.0.1", interval=1.0):
    """Poll until a TCP connection to ``host:port`` succeeds.

    Args:
        port: TCP port to probe.
        timeout: Maximum number of seconds to keep trying.
        host: Address to connect to.
        interval: Seconds to sleep after a failed attempt.

    Returns:
        True as soon as a connection succeeds, False once ``timeout`` elapses.
    """
    # A monotonic clock keeps the timeout correct even if the wall clock changes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1):
                return True
        except OSError:
            time.sleep(interval)
    return False

if not wait_port():
    print("❌ No abrió el puerto 8501. Log:\n")
    # Pure-Python equivalent of `tail -n 200`, so this cell does not depend on
    # an IPython shell escape or on the `tail` binary.
    with open("/tmp/streamlit.log", errors="replace") as log_file:
        print("".join(log_file.readlines()[-200:]), end="")
else:
    # The app is listening: expose it through an ngrok HTTP tunnel.
    public_url = ngrok.connect(8501,"http").public_url
    print("✅ Streamlit + ngrok listos")
    print("🌍 URL pública:", public_url)




✅ Streamlit + ngrok listos
🌍 URL pública: https://252a874ec40c.ngrok-free.app
