In [1]:
# === (Re)entrenar modelo y guardar model.joblib — listo para la app ===
import os, pandas as pd, numpy as np, joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 0) Locate the input CSV (the delimiter is detected later, when the file is read)
csv_path = "telecom_churn_dataset.csv"  # change this if your file has a different name
if not os.path.exists(csv_path):
    # Stop right away with a readable message instead of failing inside pandas.
    raise AssertionError(f"No encuentro {csv_path}. Sube el CSV y vuelve a ejecutar.")

def read_any_sep(path):
    """Read a CSV whose delimiter is not known in advance.

    The file is first read with pandas' delimiter auto-detection
    (``sep=None`` with the Python engine). If that attempt raises, or if it
    yields a single column (a sign that the delimiter was not recognised),
    the file is read again using ``;`` as the separator.

    Parameters
    ----------
    path : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with the parsed contents.
    """
    def _read_semicolon():
        # Fallback reader shared by both recovery paths.
        return pd.read_csv(path, sep=";")

    try:
        frame = pd.read_csv(path, sep=None, engine="python")
        everything_in_one_column = frame.shape[1] == 1
        if everything_in_one_column:
            frame = _read_semicolon()
    except Exception:
        frame = _read_semicolon()
    return frame

# Load the dataset and print its shape/columns so the schema can be checked by eye.
df = read_any_sep(csv_path)
print("Shape:", df.shape)
print("Columnas:", list(df.columns))

# 1) Date conversion + derived "days since" features
# Maps each raw date column to the numeric feature derived from it.
date_feature_map = {
    "signup_date": "days_since_signup",
    "last_interaction_date": "days_since_last_interaction",
}
# The derived features are mandatory model inputs, so a missing date column is
# reported explicitly here instead of surfacing later as a bare KeyError.
missing_dates = [col for col in date_feature_map if col not in df.columns]
assert not missing_dates, f"Faltan columnas de fecha en el CSV: {missing_dates}"

# NOTE(review): REFDATE is the execution date, so the derived features change
# every day this cell is re-run; consider a fixed snapshot date if the
# training run must be reproducible.
REFDATE = pd.Timestamp.today().normalize()
for col, derived_col in date_feature_map.items():
    # Unparseable dates become NaT (errors="coerce") and therefore NaN day
    # counts, which the numeric imputer handles downstream.
    # NOTE(review): dayfirst=True assumes day-first dates in the CSV — confirm.
    df[col] = pd.to_datetime(df[col], errors="coerce", dayfirst=True)
    df[derived_col] = (REFDATE - df[col]).dt.days

# 2) Define target and features
assert "churn" in df.columns, "Tu CSV debe tener la columna target 'churn'."
y = df["churn"].astype(int)

numeric_features = [
    "tenure_months","monthly_charge","total_charges","support_tickets_30d","num_services",
    "avg_download_mbps","downtime_hrs_30d","late_payments_12m",
    "days_since_signup","days_since_last_interaction"
]
categorical_features = [
    "contract_type","payment_method","internet_service","promo_applied","region","device_type"
]
text_feature = "nps_text"

# Fail fast if any model input is absent from the loaded frame.
missing_features = [
    col for col in numeric_features + categorical_features + [text_feature]
    if col not in df.columns
]
assert not missing_features, f"Faltan columnas de features en el CSV: {missing_features}"
# 3) Pipeline de preprocesamiento (sin lambdas anónimas)
def select_text_col(X: pd.DataFrame) -> pd.Series:
    """Extract the free-text column from ``X`` with missing values as "".

    The column name comes from the module-level ``text_feature``. The function
    is a named, module-level callable (not a lambda) and the app defines one
    with the same name and signature so the saved pipeline can be loaded.
    """
    text_series = X[text_feature]
    return text_series.fillna("")

# Numeric branch: fill missing values with the column median, then scale with RobustScaler.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])
# Categorical branch: fill missing values with the most frequent category, then one-hot
# encode as a dense array; handle_unknown="ignore" means categories not seen during
# fit do not raise at prediction time.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
# Text branch: pull the text column out as a Series (via the named function
# select_text_col, so no anonymous lambda ends up inside the saved pipeline), then
# build TF-IDF features over unigrams and bigrams, capped at 300 terms.
text_pipeline = Pipeline([
    ("selector", FunctionTransformer(select_text_col, validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

# Route each group of columns to its branch; columns not listed are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num",  num_pipeline, numeric_features),
        ("cat",  cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# 4) Train/test split and model
# Only the declared feature columns are used; the split is stratified on the target
# and seeded (random_state=42) so it is repeatable.
X = df[numeric_features + categorical_features + [text_feature]].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Preprocessing and classifier are fitted together as one pipeline, so the saved
# artifact accepts raw feature columns directly.
clf = LogisticRegression(max_iter=1000)
pipe = Pipeline([("preprocess", preprocess), ("clf", clf)])
pipe.fit(X_train, y_train)

# 5) Quick metrics on the held-out test split
proba = pipe.predict_proba(X_test)[:,1]
# Hard labels at a fixed 0.50 threshold; the AUC below is computed from the raw probabilities.
pred = (proba >= 0.5).astype(int)
print("\nClassification report (umbral=0.50):\n", classification_report(y_test, pred, digits=3))
print("AUC test:", roc_auc_score(y_test, proba))

# 6) Save the artifact (the whole fitted pipeline) for the app to load
joblib.dump(pipe, "model.joblib")
print("\n✅ Guardado: model.joblib")


Shape: (1500, 19)
Columnas: ['customer_id', 'signup_date', 'last_interaction_date', 'tenure_months', 'monthly_charge', 'total_charges', 'contract_type', 'payment_method', 'internet_service', 'support_tickets_30d', 'num_services', 'promo_applied', 'region', 'device_type', 'avg_download_mbps', 'downtime_hrs_30d', 'late_payments_12m', 'nps_text', 'churn']

Classification report (umbral=0.50):
               precision    recall  f1-score   support

           0      0.605     0.635     0.620       200
           1      0.558     0.526     0.541       175

    accuracy                          0.584       375
   macro avg      0.581     0.580     0.580       375
weighted avg      0.583     0.584     0.583       375

AUC test: 0.6202571428571428

✅ Guardado: model.joblib


In [2]:
!pip install streamlit pyngrok

Collecting streamlit
  Downloading streamlit-1.48.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.48.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m108.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (7

In [3]:
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from datetime import date

# -------------------------------------------------
# 1) Funciones necesarias para cargar el modelo
#    (esta función existía cuando entrenaste el pipeline)
# -------------------------------------------------
def select_text_col(X: pd.DataFrame) -> pd.Series:
    """Return the ``nps_text`` column of ``X`` with nulls replaced by "".

    Must keep the same name and signature as the function used at training
    time, because the saved pipeline refers to it when it is loaded.
    """
    comments = X["nps_text"]
    return comments.fillna("")

# -------------------------------------------------
# 2) Cargar el pipeline entrenado
# -------------------------------------------------
@st.cache_resource
def load_model(path="model.joblib"):
    """Load the trained pipeline from ``path`` (cached by Streamlit).

    ``select_text_col`` has to be defined in this script before this runs,
    otherwise joblib cannot deserialize the pipeline. joblib files are
    pickle-based, so only load artifacts from a trusted source.
    """
    trained_pipeline = joblib.load(path)
    return trained_pipeline

# Page configuration goes first: older Streamlit releases require set_page_config to be
# the first Streamlit command of the script, and the cached model load below may already
# emit an element (its spinner) on a cache miss.
st.set_page_config(page_title="Churn Telco", page_icon="📶", layout="centered")

pipe = load_model()

# -------------------------------------------------
# 3) Streamlit UI
#    Matches the FEATURES used for training
#    (numeric, categorical and text)
# -------------------------------------------------
st.title("📶 Predicción de Churn (Telco)")

st.caption("Modelo cargado desde **model.joblib**")

with st.form("form_inputs"):
    st.subheader("Datos del cliente")

    # Dates used to compute the 'days_since_*' features
    signup_date = st.date_input("signup_date", value=date(2023, 1, 15))
    last_interaction_date = st.date_input("last_interaction_date", value=date.today())

    # Numeric inputs
    tenure_months = st.number_input("tenure_months", min_value=0, max_value=240, value=24, step=1)
    monthly_charge = st.number_input("monthly_charge", min_value=0.0, value=65.0, step=0.1)
    total_charges = st.number_input("total_charges", min_value=0.0, value=1560.0, step=0.1)
    support_tickets_30d = st.number_input("support_tickets_30d", min_value=0, max_value=30, value=1, step=1)
    num_services = st.number_input("num_services", min_value=1, max_value=5, value=3, step=1)
    avg_download_mbps = st.number_input("avg_download_mbps", min_value=1.0, value=180.0, step=1.0)
    downtime_hrs_30d = st.number_input("downtime_hrs_30d", min_value=0.0, value=1.5, step=0.1)
    late_payments_12m = st.number_input("late_payments_12m", min_value=0, max_value=24, value=0, step=1)

    # Categorical inputs (example values; use the ones present in your dataset)
    # NOTE(review): the training encoder uses handle_unknown="ignore", so an option here
    # that does not match a training category exactly would not raise — verify these
    # labels against the dataset.
    contract_type = st.selectbox("contract_type", ["Mes a mes", "1 año", "2 años"], index=0)
    payment_method = st.selectbox("payment_method", ["Tarjeta", "Débito", "Efectivo", "Billetera", "Cheque"], index=0)
    internet_service = st.selectbox("internet_service", ["Fibra", "Cable", "DSL", "Satélite"], index=0)
    promo_applied = st.selectbox("promo_applied", ["Sí", "No"], index=1)
    region = st.selectbox("region", ["Norte", "Centro", "Sur", "Oriente", "Lima Metropolitana"], index=4)
    device_type = st.selectbox("device_type", ["Modem", "Router", "Combo", "ONT", "Otro"], index=1)

    # Free text
    nps_text = st.text_input("nps_text (comentario breve)", "Todo bien")

    # Decision threshold
    threshold = st.slider("Umbral de decisión (churn si prob ≥ umbral)",
                          0.05, 0.95, 0.50, 0.01)

    submitted = st.form_submit_button("Predecir")

# -------------------------------------------------
# 4) Construcción de features como en el entrenamiento
#    (incluye derivadas days_since_signup / days_since_last_interaction)
# -------------------------------------------------
def build_features() -> pd.DataFrame:
    """Assemble a one-row DataFrame of model inputs from the current form values.

    Reads the module-level variables filled in by the form and adds the two
    derived features ``days_since_signup`` and ``days_since_last_interaction``,
    computed against today's date (normalized to midnight).

    Returns
    -------
    pandas.DataFrame with a single row; numeric columns first, then the
    derived day counts, the categorical columns and the text column.
    """
    today = pd.Timestamp.today().normalize()

    def _days_before_today(raw_date):
        # The form hands back date objects; go through str() to get a Timestamp.
        parsed = pd.to_datetime(str(raw_date))
        if pd.notna(parsed):
            return (today - parsed).days
        return None

    row = dict(
        # Original numeric inputs
        tenure_months=tenure_months,
        monthly_charge=monthly_charge,
        total_charges=total_charges,
        support_tickets_30d=support_tickets_30d,
        num_services=num_services,
        avg_download_mbps=avg_download_mbps,
        downtime_hrs_30d=downtime_hrs_30d,
        late_payments_12m=late_payments_12m,
        # Date-derived inputs
        days_since_signup=_days_before_today(signup_date),
        days_since_last_interaction=_days_before_today(last_interaction_date),
        # Categorical inputs
        contract_type=contract_type,
        payment_method=payment_method,
        internet_service=internet_service,
        promo_applied=promo_applied,
        region=region,
        device_type=device_type,
        # Text input
        nps_text=nps_text,
    )
    return pd.DataFrame([row])

# -------------------------------------------------
# 5) Predicción
# -------------------------------------------------
if submitted:
    # Score the single customer row built from the form.
    X_infer = build_features()
    proba = float(pipe.predict_proba(X_infer)[0, 1])
    pred = int(proba >= threshold)
    is_churn = pred == 1

    st.markdown("### Resultado")
    st.write(f"**Probabilidad de churn:** {proba:.3f}")
    label = "Churn" if is_churn else "No churn"
    st.write(f"**Predicción (umbral {threshold:.2f}):** {label}")

    st.markdown("### Recomendaciones")
    if is_churn:
        # Generic retention actions, plus extras triggered by the customer's inputs.
        recs = [
            "- Ofrecer contrato de 1–2 años con beneficios (upgrade, descuento 3–6 meses).",
            "- Reducir downtime y TMR de soporte; ticket preventivo si hubo caídas.",
            "- Ajustar tarifa/paquete (bundle) y promociones personalizadas.",
        ]
        if late_payments_12m > 0:
            recs.append("- Recordatorios y fraccionamiento; incentivar pago automático.")
        if avg_download_mbps < 80:
            recs.append("- Proponer upgrade de plan/tecnología.")
    else:
        recs = [
            "- Cliente estable: habilitar cross-sell suave (upgrade de velocidad).",
            "- Mantener NPS con comunicaciones proactivas y estabilidad del servicio.",
        ]

    for recommendation in recs:
        st.write(recommendation)

st.caption("© Telco Churn Demo – Streamlit")


2025-08-17 16:02:57.517 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-08-17 16:02:57.806 Session state does not function when running a script without `streamlit run`


DeltaGenerator(_form_data=FormData(form_id='form_inputs'))

In [4]:
# === Lanzar Streamlit con ngrok (URL directa) ===
!pip -q install streamlit pyngrok

import os, subprocess, time, socket
from pyngrok import ngrok

import getpass

# The ngrok auth token is a secret and must never be stored in the notebook: take it
# from the NGROK_TOKEN environment variable, or prompt for it without echoing.
# NOTE: a token that was ever saved in a shared notebook should be revoked and reissued.
ngrok_token = os.environ.get("NGROK_TOKEN") or getpass.getpass("NGROK_TOKEN: ")
if not ngrok_token:
    raise RuntimeError("Falta el token de ngrok (variable de entorno NGROK_TOKEN).")
os.environ["NGROK_TOKEN"] = ngrok_token
ngrok.set_auth_token(ngrok_token)

# Close leftovers from a previous run (best effort: report the problem, keep going)
try:
    for t in ngrok.get_tunnels():
        ngrok.disconnect(t.public_url)
except Exception as exc:
    print(f"⚠️ No se pudieron cerrar túneles previos: {exc}")
subprocess.run(["pkill","-f","streamlit run app.py"], check=False)

# Sanity checks
# NOTE(review): nothing visible in this notebook writes app.py; it has to exist already
# (e.g. the app cell saved with %%writefile app.py) — confirm.
assert os.path.exists("app.py"), "No existe app.py"
assert os.path.exists("model.joblib"), "No existe model.joblib"

# Launch Streamlit in the background, sending its output to a log file. The child
# process keeps its own copy of the file descriptor, so this process can close its
# handle as soon as the child has been spawned.
with open("/tmp/streamlit.log","w") as log:
    proc = subprocess.Popen(
        ["streamlit","run","app.py","--server.port","8501","--server.headless","true"],
        stdout=log, stderr=subprocess.STDOUT, text=True
    )

# esperar puerto
def wait_port(port=8501, timeout=60, host="127.0.0.1", interval=1.0):
    """Wait until a TCP port accepts connections.

    Parameters
    ----------
    port : TCP port to probe.
    timeout : maximum number of seconds to keep trying.
    host : address to connect to (defaults to the local loopback).
    interval : seconds to sleep after a failed attempt.

    Returns
    -------
    True as soon as a connection succeeds, False if ``timeout`` elapses first
    (including immediately when ``timeout`` is 0 or negative).
    """
    # A monotonic clock keeps the deadline correct even if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1):
                return True
        except OSError:
            # Nothing is listening yet (or the attempt timed out); retry shortly.
            time.sleep(interval)
    return False

if not wait_port():
    print("❌ No abrió el puerto 8501. Log:\n")
    !tail -n 200 /tmp/streamlit.log
else:
    public_url = ngrok.connect(8501,"http").public_url
    print("✅ Streamlit + ngrok listos")
    print("🌍 URL pública:", public_url)


✅ Streamlit + ngrok listos
🌍 URL pública: https://20b3040f39ad.ngrok-free.app
