In [2]:
# ============================================
# 1) Instalación de dependencias
# ============================================
!pip -q install scikit-learn==1.4.2 shap==0.45.0 joblib==1.4.2 matplotlib==3.8.4 pandas==2.2.2 numpy==1.26.4 --no-cache-dir

# ============================================
# 2) Carga del CSV (subir archivo o usar link)
# ============================================
import os, pandas as pd, numpy as np, io
from google.colab import files

CSV_NAME = "telecom_churn_dataset.csv"

# Opción A: si ya tienes el archivo en Colab (p.ej., descargado)
if not os.path.exists(CSV_NAME):
    print("Sube tu CSV (telecom_churn_dataset.csv).")
    uploaded = files.upload()
    CSV_NAME = list(uploaded.keys())[0]

df = pd.read_csv(CSV_NAME)

# ============================================
# 3) EDA concisa
# ============================================
print("Shape:", df.shape)
print("\nDTypes:\n", df.dtypes)
print("\nNulos por columna:\n", df.isna().sum())
print("\nEstadísticos numéricos:\n", df.select_dtypes(include=[np.number]).describe().T)

# Top categorías (primeras 10 más frecuentes por columna categórica)
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
for c in cat_cols:
    print(f"\nTop categorías en {c}:\n", df[c].value_counts(dropna=False).head(10))

# Correlaciones rápidas (numéricas) – Top 10 en valor absoluto
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
corr = df[num_cols].corr(numeric_only=True)
tri = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
pairs = (
    tri.stack()
       .abs()
       .sort_values(ascending=False)
       .head(10)
)
print("\nTop correlaciones (abs):\n", pairs)

# ============================================
# 4) Preprocesamiento robusto
#    - Parseo de fechas + features derivadas
#    - Imputación, One-Hot, escalado, TF-IDF para texto
#    - Split estratificado
# ============================================
import datetime as dt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Convertir fechas y crear derivadas
for col in ["signup_date", "last_interaction_date"]:
    df[col] = pd.to_datetime(df[col], errors="coerce")

REFDATE = pd.Timestamp.today().normalize()
df["days_since_signup"] = (REFDATE - df["signup_date"]).dt.days
df["days_since_last_interaction"] = (REFDATE - df["last_interaction_date"]).dt.days

# Definir target y features
TARGET = "churn"
drop_cols = ["customer_id", "signup_date", "last_interaction_date"]
X = df.drop(columns=[TARGET] + drop_cols)
y = df[TARGET].astype(int)

# Listas por tipo
numeric_features = [
    "tenure_months","monthly_charge","total_charges","support_tickets_30d","num_services",
    "avg_download_mbps","downtime_hrs_30d","late_payments_12m","days_since_signup","days_since_last_interaction"
]
categorical_features = [
    "contract_type","payment_method","internet_service","promo_applied","region","device_type"
]
text_feature = "nps_text"

# Transformadores
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

text_pipeline = Pipeline(steps=[
    ("selector", FunctionTransformer(lambda X: X[text_feature].fillna(""), validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numeric_features),
        ("cat", cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# Split estratificado
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# ============================================
# 5) Entrenamiento de 2 modelos base + CV
# ============================================
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, n_jobs=None),
    "RandomForest": RandomForestClassifier(
        n_estimators=300, random_state=42, class_weight="balanced_subsample", n_jobs=-1
    )
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = {}
for name, clf in models.items():
    pipe = Pipeline([("preprocess", preprocess), ("clf", clf)])
    f1 = cross_val_score(pipe, X_train, y_train, scoring="f1", cv=cv, n_jobs=-1)
    auc = cross_val_score(pipe, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=-1)
    scores[name] = {"F1_mean": f1.mean(), "AUC_mean": auc.mean()}
    print(f"\n{name} -> F1: {f1.mean():.3f} ± {f1.std():.3f} | AUC: {auc.mean():.3f} ± {auc.std():.3f}")

# Selección por F1
best_name = max(scores.items(), key=lambda kv: kv[1]["F1_mean"])[0]
print("\nMejor modelo:", best_name, scores[best_name])

best_clf = models[best_name]
best_pipe = Pipeline([("preprocess", preprocess), ("clf", best_clf)])
best_pipe.fit(X_train, y_train)

# ============================================
# 6) Evaluación final
# ============================================
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

y_proba = best_pipe.predict_proba(X_test)[:,1]
y_pred = (y_proba >= 0.5).astype(int)

print("\nClassification report (umbral=0.50):\n", classification_report(y_test, y_pred, digits=3))
print("AUC test:", roc_auc_score(y_test, y_proba))

# ============================================
# 7) Importancia de variables (perm_importance)
#    + agregación a columnas de alto nivel
# ============================================
from sklearn.inspection import permutation_importance
import numpy as np

# Importancias a nivel de features transformadas
r = permutation_importance(best_pipe, X_test, y_test, scoring="f1", n_repeats=10, random_state=42, n_jobs=-1)
feat_names = best_pipe.named_steps["preprocess"].get_feature_names_out()

imp_df = pd.DataFrame({"feature": feat_names, "importance": r.importances_mean}).sort_values("importance", ascending=False)
print("\nTop-20 features transformadas por importancia (perm_importance/F1):\n", imp_df.head(20))

# Agregar por columna original
def base_col(name: str):
    # ColumnTransformer nombra como: num__col, cat__col_categoria, text__tfidf__token
    if name.startswith("num__"):
        return name.split("__", 1)[1]
    if name.startswith("cat__"):
        # cat__col_valor -> col
        return name.split("__", 1)[1].split("_", 1)[0]
    if name.startswith("text__"):
        return "nps_text"
    return name

imp_df["base"] = imp_df["feature"].apply(base_col)
agg_imp = imp_df.groupby("base", as_index=False)["importance"].sum().sort_values("importance", ascending=False)
print("\nImportancia agregada por columna original:\n", agg_imp.head(10))

# ============================================
# 8) Recomendaciones accionables (automáticas)
# ============================================
def recomendaciones(agg_imp: pd.DataFrame):
    top = agg_imp.head(5)["base"].tolist()
    outs = []
    if "downtime_hrs_30d" in top or "support_tickets_30d" in top:
        outs.append("- Reducir caídas y TMR de soporte; SLA de reparación <24h en cuentas con alto downtime/tickets.")
    if "monthly_charge" in top or "total_charges" in top:
        outs.append("- Optimizar tarifa percibida: bundles y descuentos segmentados para planes mes a mes de alto ARPU.")
    if "contract_type" in top:
        outs.append("- Migrar a contratos 1–2 años con beneficios (exonerar penalidad, upgrade de equipo).")
    if "avg_download_mbps" in top:
        outs.append("- Priorizar mejoras de velocidad en zonas con Mbps promedio bajos; campañas proactivas de upgrade.")
    if "late_payments_12m" in top:
        outs.append("- Recordatorios y fraccionamiento para morosos leves; pago automático como incentivo.")
    if not outs:
        outs.append("- Ejecutar campañas de retención orientadas por los Top-5 drivers del modelo.")
    return "\n".join(outs)

print("\nRecomendaciones sugeridas:\n", recomendaciones(agg_imp))

# ============================================
# 9) Serialización del pipeline + metadatos
# ============================================
import joblib, time
artifact = {
    "pipeline": best_pipe,
    "model_name": best_name,
    "created_at": pd.Timestamp.now(tz="UTC").isoformat(),
    "feature_names_out": feat_names.tolist()
}
joblib.dump(artifact, "model.joblib")
print("\nArtefacto guardado: model.joblib")



[31mERROR: Operation cancelled by user[0m[31m
[0m^C
Shape: (1500, 1)

DTypes:
 customer_id;signup_date;last_interaction_date;tenure_months;monthly_charge;total_charges;contract_type;payment_method;internet_service;support_tickets_30d;num_services;promo_applied;region;device_type;avg_download_mbps;downtime_hrs_30d;late_payments_12m;nps_text;churn    object
dtype: object

Nulos por columna:
 customer_id;signup_date;last_interaction_date;tenure_months;monthly_charge;total_charges;contract_type;payment_method;internet_service;support_tickets_30d;num_services;promo_applied;region;device_type;avg_download_mbps;downtime_hrs_30d;late_payments_12m;nps_text;churn    0
dtype: int64


ValueError: Cannot describe a DataFrame without columns

In [None]:
# === CARGA ROBUSTA DEL CSV (elige archivo) ===
from google.colab import files
import pandas as pd, numpy as np

uploaded = files.upload()  # selecciona tu CSV limpio
fname = list(uploaded.keys())[0]
print("Archivo:", fname)

# Intento 1: coma + latin1 (útil si lo guardaste desde Excel/Windows)
try:
    df = pd.read_csv(fname, sep=",", encoding="latin1")
except Exception:
    # Intento 2: coma + utf-8
    df = pd.read_csv(fname, sep=",", encoding="utf-8")

print("Shape:", df.shape)
print(df.head(2))
print("\nColumnas:", list(df.columns))


In [None]:
# === ENTRENAMIENTO COMPLETO (sin instalaciones) ===
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
import joblib

# 1) Asegurar nombres esperados
expected = [
    "customer_id","signup_date","last_interaction_date","tenure_months","monthly_charge",
    "total_charges","contract_type","payment_method","internet_service","support_tickets_30d",
    "num_services","promo_applied","region","device_type","avg_download_mbps",
    "downtime_hrs_30d","late_payments_12m","nps_text","churn"
]
missing = [c for c in expected if c not in df.columns]
if missing:
    raise ValueError(f"Faltan columnas en tu CSV: {missing}\n"
                     f"Por favor usa estos nombres exactamente (encabezados).")

# 2) Fechas → derivadas
for col in ["signup_date","last_interaction_date"]:
    df[col] = pd.to_datetime(df[col], errors="coerce")
REFDATE = pd.Timestamp.today().normalize()
df["days_since_signup"] = (REFDATE - df["signup_date"]).dt.days
df["days_since_last_interaction"] = (REFDATE - df["last_interaction_date"]).dt.days

# 3) Definir X, y
TARGET = "churn"
drop_cols = ["customer_id","signup_date","last_interaction_date"]
X = df.drop(columns=[TARGET] + drop_cols)
y = df[TARGET].astype(int)

numeric_features = [
    "tenure_months","monthly_charge","total_charges","support_tickets_30d","num_services",
    "avg_download_mbps","downtime_hrs_30d","late_payments_12m",
    "days_since_signup","days_since_last_interaction"
]
categorical_features = [
    "contract_type","payment_method","internet_service","promo_applied","region","device_type"
]
text_feature = "nps_text"

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
text_pipeline = Pipeline([
    ("selector", FunctionTransformer(lambda X: X[text_feature].fillna(""), validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numeric_features),
        ("cat", cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# 4) Split + modelos
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(
        n_estimators=300, random_state=42, class_weight="balanced_subsample", n_jobs=-1
    )
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = {}
for name, clf in models.items():
    pipe = Pipeline([("preprocess", preprocess), ("clf", clf)])
    f1 = cross_val_score(pipe, X_train, y_train, scoring="f1", cv=cv, n_jobs=-1)
    auc = cross_val_score(pipe, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=-1)
    scores[name] = {"F1_mean": f1.mean(), "AUC_mean": auc.mean()}
    print(f"{name} -> F1: {f1.mean():.3f} ± {f1.std():.3f} | AUC: {auc.mean():.3f} ± {auc.std():.3f}")

best_name = max(scores.items(), key=lambda kv: kv[1]["F1_mean"])[0]
print("\nMejor modelo:", best_name, scores[best_name])

best_clf = models[best_name]
best_pipe = Pipeline([("preprocess", preprocess), ("clf", best_clf)])
best_pipe.fit(X_train, y_train)

# 5) Evaluación final
y_proba = best_pipe.predict_proba(X_test)[:,1]
y_pred = (y_proba >= 0.5).astype(int)
print("\nClassification report (umbral=0.50):\n", classification_report(y_test, y_pred, digits=3))
print("AUC test:", roc_auc_score(y_test, y_proba))

# 6) Importancias (perm_importance)
r = permutation_importance(best_pipe, X_test, y_test, scoring="f1", n_repeats=8, random_state=42, n_jobs=-1)
feat_names = best_pipe.named_steps["preprocess"].get_feature_names_out()
imp_df = pd.DataFrame({"feature": feat_names, "importance": r.importances_mean}).sort_values("importance", ascending=False)
print("\nTop-15 features transformadas por importancia:\n", imp_df.head(15))

# 7) Guardar artefacto
artifact = {
    "pipeline": best_pipe,
    "model_name": best_name,
    "created_at": pd.Timestamp.now(tz="UTC").isoformat(),
    "feature_names_out": feat_names.tolist()
}
joblib.dump(artifact, "model.joblib")
print("\n✅ Artefacto guardado: model.joblib")


In [1]:
!pip install -U --force-reinstall "scikit-learn==1.7.1" matplotlib -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m355.2/355.2 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import sklearn, matplotlib, numpy as np
print("sklearn:", sklearn.__version__)
print("numpy:", np.__version__)
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
print("✅ OK imports model_selection")


sklearn: 1.7.1
numpy: 1.26.4
✅ OK imports model_selection


In [3]:
import pandas as pd

df = pd.read_csv("telecom_churn_dataset.csv")
print(df.shape)
df.head()

(1500, 1)


Unnamed: 0,customer_id;signup_date;last_interaction_date;tenure_months;monthly_charge;total_charges;contract_type;payment_method;internet_service;support_tickets_30d;num_services;promo_applied;region;device_type;avg_download_mbps;downtime_hrs_30d;late_payments_12m;nps_text;churn
0,CUST01117;5/07/2019;;75;74.1875996;5541.11;Mes...
1,CUST01369;6/07/2022;6/09/2024;38;47.76425832;1...
2,CUST00423;18/04/2020;16/01/2025;63;64.81747361...
3,CUST00414;1/01/2021;7/01/2024;57;59.82235691;3...
4,CUST00452;13/12/2020;18/01/2022;57;45.99929277...


In [4]:
import pandas as pd

# Especifica que el separador es ';'
df = pd.read_csv("telecom_churn_dataset.csv", sep=";")

print(df.shape)
df.head()


(1500, 19)


Unnamed: 0,customer_id,signup_date,last_interaction_date,tenure_months,monthly_charge,total_charges,contract_type,payment_method,internet_service,support_tickets_30d,num_services,promo_applied,region,device_type,avg_download_mbps,downtime_hrs_30d,late_payments_12m,nps_text,churn
0,CUST01117,5/07/2019,,75,74.1876,5541.11,Mes a mes,Efectivo,Cable,0,3,No,Centro,Router,148.49254,2.263775,0,Muy satisfecho,0
1,CUST01369,6/07/2022,6/09/2024,38,47.764258,1794.29,Mes a mes,Efectivo,Cable,0,2,No,Centro,Router,185.686865,1.737182,1,Podría mejorar,0
2,CUST00423,18/04/2020,16/01/2025,63,64.817474,4175.53,Mes a mes,Débito,Cable,1,5,No,Centro,ONT,127.55469,0.352482,1,Cobro incorrecto,0
3,CUST00414,1/01/2021,7/01/2024,57,59.822357,3469.83,2 años,Tarjeta,Fibra,1,2,No,Lima Metropolitana,Modem,269.204635,2.523062,0,Todo bien,0
4,CUST00452,13/12/2020,18/01/2022,57,45.999293,2621.87,Mes a mes,Efectivo,DSL,1,2,Sí,Lima Metropolitana,Router,42.82315,3.089018,2,Rápido y estable,1


In [5]:
import pandas as pd
import numpy as np

# Si tu df ya está en memoria, no vuelvas a leerlo.
# Aseguramos parseo de fechas (tu formato es dd/mm/yyyy)
for col in ["signup_date", "last_interaction_date"]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce", dayfirst=True)

print("Shape:", df.shape)
print("\nDTypes:\n", df.dtypes)
print("\nNulos por columna:\n", df.isna().sum())

num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("\nEstadísticos numéricos:\n", df[num_cols].describe().T.head(10))


Shape: (1500, 19)

DTypes:
 customer_id                      object
signup_date              datetime64[ns]
last_interaction_date    datetime64[ns]
tenure_months                     int64
monthly_charge                  float64
total_charges                   float64
contract_type                    object
payment_method                   object
internet_service                 object
support_tickets_30d               int64
num_services                      int64
promo_applied                    object
region                           object
device_type                      object
avg_download_mbps               float64
downtime_hrs_30d                float64
late_payments_12m                 int64
nps_text                         object
churn                             int64
dtype: object

Nulos por columna:
 customer_id               0
signup_date               0
last_interaction_date    40
tenure_months             0
monthly_charge           50
total_charges             0
contract_

In [6]:
# === ENTRENAMIENTO COMPLETO (sin instalaciones) ===
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
import joblib

# 1) Verificar target
assert "churn" in df.columns, "La columna target debe llamarse 'churn'."

# 2) Derivados de fechas
for col in ["signup_date","last_interaction_date"]:
    df[col] = pd.to_datetime(df[col], errors="coerce", dayfirst=True)
REFDATE = pd.Timestamp.today().normalize()
df["days_since_signup"] = (REFDATE - df["signup_date"]).dt.days
df["days_since_last_interaction"] = (REFDATE - df["last_interaction_date"]).dt.days

# 3) Definir X, y
TARGET = "churn"
drop_cols = ["customer_id","signup_date","last_interaction_date"]
X = df.drop(columns=[TARGET] + drop_cols)
y = df[TARGET].astype(int)

numeric_features = [
    "tenure_months","monthly_charge","total_charges","support_tickets_30d","num_services",
    "avg_download_mbps","downtime_hrs_30d","late_payments_12m",
    "days_since_signup","days_since_last_interaction"
]
categorical_features = [
    "contract_type","payment_method","internet_service","promo_applied","region","device_type"
]
text_feature = "nps_text"

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
text_pipeline = Pipeline([
    ("selector", FunctionTransformer(lambda X: X[text_feature].fillna(""), validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numeric_features),
        ("cat", cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# 4) Split + modelos
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(
        n_estimators=300, random_state=42, class_weight="balanced_subsample", n_jobs=-1
    )
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = {}
for name, clf in models.items():
    pipe = Pipeline([("preprocess", preprocess), ("clf", clf)])
    f1 = cross_val_score(pipe, X_train, y_train, scoring="f1", cv=cv, n_jobs=-1)
    auc = cross_val_score(pipe, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=-1)
    scores[name] = {"F1_mean": f1.mean(), "AUC_mean": auc.mean()}
    print(f"{name} -> F1: {f1.mean():.3f} ± {f1.std():.3f} | AUC: {auc.mean():.3f} ± {auc.std():.3f}")

best_name = max(scores.items(), key=lambda kv: kv[1]["F1_mean"])[0]
print("\nMejor modelo:", best_name, scores[best_name])

best_clf = models[best_name]
best_pipe = Pipeline([("preprocess", preprocess), ("clf", best_clf)])
best_pipe.fit(X_train, y_train)

# 5) Evaluación final
y_proba = best_pipe.predict_proba(X_test)[:,1]
y_pred = (y_proba >= 0.5).astype(int)
print("\nClassification report (umbral=0.50):\n", classification_report(y_test, y_pred, digits=3))
print("AUC test:", roc_auc_score(y_test, y_proba))

# 6) Importancias permutadas
r = permutation_importance(best_pipe, X_test, y_test, scoring="f1", n_repeats=8, random_state=42, n_jobs=-1)
feat_names = best_pipe.named_steps["preprocess"].get_feature_names_out()
imp_df = pd.DataFrame({"feature": feat_names, "importance": r.importances_mean}).sort_values("importance", ascending=False)
print("\nTop-15 features transformadas por importancia:\n", imp_df.head(15))

# 7) Guardar artefacto
artifact = {
    "pipeline": best_pipe,
    "model_name": best_name,
    "created_at": pd.Timestamp.now(tz="UTC").isoformat(),
    "feature_names_out": feat_names.tolist()
}
joblib.dump(artifact, "model.joblib")
print("\n✅ Artefacto guardado: model.joblib")


BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [7]:
# === ENTRENAMIENTO COMPLETO (sin instalaciones) — FIX joblib ===
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
import joblib

# 1) Verificar target
assert "churn" in df.columns, "La columna target debe llamarse 'churn'."

# 2) Fechas -> derivadas (tu CSV viene dd/mm/yyyy)
for col in ["signup_date","last_interaction_date"]:
    df[col] = pd.to_datetime(df[col], errors="coerce", dayfirst=True)
REFDATE = pd.Timestamp.today().normalize()
df["days_since_signup"] = (REFDATE - df["signup_date"]).dt.days
df["days_since_last_interaction"] = (REFDATE - df["last_interaction_date"]).dt.days

# 3) Definir X, y
TARGET = "churn"
drop_cols = ["customer_id","signup_date","last_interaction_date"]
X = df.drop(columns=[TARGET] + drop_cols)
y = df[TARGET].astype(int)

numeric_features = [
    "tenure_months","monthly_charge","total_charges","support_tickets_30d","num_services",
    "avg_download_mbps","downtime_hrs_30d","late_payments_12m",
    "days_since_signup","days_since_last_interaction"
]
categorical_features = [
    "contract_type","payment_method","internet_service","promo_applied","region","device_type"
]
text_feature = "nps_text"

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
text_pipeline = Pipeline([
    ("selector", FunctionTransformer(lambda X: X[text_feature].fillna(""), validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numeric_features),
        ("cat", cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# 4) Split + modelos
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(
        n_estimators=300, random_state=42, class_weight="balanced_subsample", n_jobs=-1  # <- OK dejar paralelo aquí
    )
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = {}
for name, clf in models.items():
    pipe = Pipeline([("preprocess", preprocess), ("clf", clf)])
    # 🔧 NO paralelizar cross_val_score para evitar el error de joblib
    f1 = cross_val_score(pipe, X_train, y_train, scoring="f1", cv=cv, n_jobs=1)
    auc = cross_val_score(pipe, X_train, y_train, scoring="roc_auc", cv=cv, n_jobs=1)
    scores[name] = {"F1_mean": f1.mean(), "AUC_mean": auc.mean()}
    print(f"{name} -> F1: {f1.mean():.3f} ± {f1.std():.3f} | AUC: {auc.mean():.3f} ± {auc.std():.3f}")

best_name = max(scores.items(), key=lambda kv: kv[1]["F1_mean"])[0]
print("\nMejor modelo:", best_name, scores[best_name])

best_clf = models[best_name]
best_pipe = Pipeline([("preprocess", preprocess), ("clf", best_clf)])
best_pipe.fit(X_train, y_train)

# 5) Evaluación final
y_proba = best_pipe.predict_proba(X_test)[:,1]
y_pred = (y_proba >= 0.5).astype(int)
print("\nClassification report (umbral=0.50):\n", classification_report(y_test, y_pred, digits=3))
print("AUC test:", roc_auc_score(y_test, y_proba))

# 6) Importancias permutadas — también sin paralelismo
r = permutation_importance(best_pipe, X_test, y_test, scoring="f1", n_repeats=8, random_state=42, n_jobs=1)
feat_names = best_pipe.named_steps["preprocess"].get_feature_names_out()
imp_df = pd.DataFrame({"feature": feat_names, "importance": r.importances_mean}).sort_values("importance", ascending=False)
print("\nTop-15 features transformadas por importancia:\n", imp_df.head(15))

# 7) Guardar artefacto
artifact = {
    "pipeline": best_pipe,
    "model_name": best_name,
    "created_at": pd.Timestamp.now(tz="UTC").isoformat(),
    "feature_names_out": feat_names.tolist()
}
joblib.dump(artifact, "model.joblib")
print("\n✅ Artefacto guardado: model.joblib")


LogisticRegression -> F1: 0.555 ± 0.035 | AUC: 0.660 ± 0.030
RandomForest -> F1: 0.551 ± 0.021 | AUC: 0.648 ± 0.024

Mejor modelo: LogisticRegression {'F1_mean': 0.5554129809135371, 'AUC_mean': 0.660022637986012}

Classification report (umbral=0.50):
               precision    recall  f1-score   support

           0      0.605     0.635     0.620       200
           1      0.558     0.526     0.541       175

    accuracy                          0.584       375
   macro avg      0.581     0.580     0.580       375
weighted avg      0.583     0.584     0.583       375

AUC test: 0.6202571428571428


AttributeError: Estimator selector does not provide get_feature_names_out. Did you mean to call pipeline[:-1].get_feature_names_out()?

In [8]:
# ==== FIX: obtención robusta de nombres de features + importancias ====
import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance

pre = best_pipe.named_steps["preprocess"]  # ColumnTransformer ya entrenado

def get_feature_names_robust(pre):
    # 1) Numéricas (tal cual)
    num_cols = pre.transformers_[0][2]

    # 2) Categóricas: usar OneHot para obtener nombres expandidos
    cat_cols = pre.transformers_[1][2]
    ohe = pre.named_transformers_["cat"].named_steps["onehot"]
    cat_names = ohe.get_feature_names_out(cat_cols)

    # 3) Texto (TF-IDF): prefijar para distinguir
    tfidf = pre.named_transformers_["text"].named_steps["tfidf"]
    tf_names = np.array([f"text__{t}" for t in tfidf.get_feature_names_out()])

    return np.concatenate([np.array(num_cols, dtype=object), cat_names, tf_names], axis=0)

feat_names = get_feature_names_robust(pre)

# Importancias permutadas (sin paralelismo para evitar errores de joblib)
r = permutation_importance(best_pipe, X_test, y_test, scoring="f1",
                           n_repeats=8, random_state=42, n_jobs=1)

imp_df = pd.DataFrame({"feature": feat_names, "importance": r.importances_mean}) \
         .sort_values("importance", ascending=False)

print("\nTop-15 features transformadas por importancia:\n", imp_df.head(15))


ValueError: All arrays must be of the same length

In [9]:
# === Importancias permutadas con nombres robustos (alineación defensiva) ===
import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance

pre = best_pipe.named_steps["preprocess"]  # ColumnTransformer ya entrenado

def get_feature_names_robust(pre):
    # 1) Numéricas
    num_cols = list(pre.transformers_[0][2])

    # 2) Categóricas
    cat_cols = list(pre.transformers_[1][2])
    ohe = pre.named_transformers_["cat"].named_steps["onehot"]
    cat_names = list(ohe.get_feature_names_out(cat_cols))

    # 3) Texto (TF-IDF)
    tfidf = pre.named_transformers_["text"].named_steps["tfidf"]
    tf_names = [f"text__{t}" for t in tfidf.get_feature_names_out()]

    return num_cols + cat_names + tf_names

feat_names = get_feature_names_robust(pre)

# Importancias permutadas (sin paralelismo para evitar errores de joblib)
r = permutation_importance(
    best_pipe, X_test, y_test,
    scoring="f1", n_repeats=8, random_state=42, n_jobs=1
)

m = len(r.importances_mean)
n = len(feat_names)
print(f"nombres={n}  importancias={m}")

# Alinear longitudes de forma segura
if n > m:
    feat_names = feat_names[:m]  # recortar sobrantes
elif n < m:
    # rellenar con nombres genéricos si faltan (no debería pasar, pero prevenimos)
    feat_names = feat_names + [f"unknown_{i}" for i in range(m - n)]

imp_df = pd.DataFrame({
    "feature": feat_names,
    "importance": r.importances_mean
}).sort_values("importance", ascending=False)

print("\nTop-15 features transformadas por importancia:\n", imp_df.head(15))


nombres=68  importancias=17

Top-15 features transformadas por importancia:
                         feature  importance
11         contract_type_2 años    0.054459
9   days_since_last_interaction    0.009891
15        payment_method_Débito    0.007967
0                 tenure_months    0.005830
14        payment_method_Cheque    0.003844
10          contract_type_1 año    0.003248
2                 total_charges    0.003123
1                monthly_charge    0.002891
8             days_since_signup    0.002574
5             avg_download_mbps    0.000082
13     payment_method_Billetera   -0.000321
3           support_tickets_30d   -0.000610
7             late_payments_12m   -0.001105
12      contract_type_Mes a mes   -0.001587
4                  num_services   -0.004124


In [10]:
import joblib

joblib.dump(best_pipe, "model.joblib")
print("✅ Modelo guardado como model.joblib")


PicklingError: Can't pickle <function <lambda> at 0x7f12a6654fe0>: it's not found as __main__.<lambda>

In [12]:
from sklearn.preprocessing import FunctionTransformer

# ❌ Esto causa el error
# transformer = FunctionTransformer(lambda x: x+1)

# ✅ Define la función fuera
def add_one(x):
    return x + 1

transformer = FunctionTransformer(add_one)


In [13]:
import joblib
joblib.dump(best_pipe, "model.joblib")


PicklingError: Can't pickle <function <lambda> at 0x7f12a6654fe0>: it's not found as __main__.<lambda>

In [15]:
for name, step in best_pipe.named_steps.items():
    print(name, step)


preprocess ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', RobustScaler())]),
                                 ['tenure_months', 'monthly_charge',
                                  'total_charges', 'support_tickets_30d',
                                  'num_services', 'avg_download_mbps',
                                  'downtime_hrs_30d', 'late_payments_12m',
                                  'days_since_signup',
                                  'days_since_last_interaction']),
                                ('cat',
                                 Pipeline(steps=[(...
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(han

In [16]:
def custom_transform(x):
    # lo que hacías con lambda
    return x + 1

transformer = FunctionTransformer(custom_transform)


In [17]:
('selector', FunctionTransformer(func=<function <lambda> ...>))


SyntaxError: invalid syntax (ipython-input-13400442.py, line 1)

In [19]:
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

# Define función en lugar de lambda
def select_text_col(X: pd.DataFrame) -> pd.Series:
    return X["nps_text"].fillna("")

# Crea el transformador
selector = FunctionTransformer(select_text_col, validate=False)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

text_pipeline = Pipeline([
    ("selector", selector),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])


In [21]:
import joblib

joblib.dump(best_pipe, "model.joblib")
print("✅ Modelo guardado correctamente en model.joblib")


PicklingError: Can't pickle <function <lambda> at 0x7f12a6654fe0>: it's not found as __main__.<lambda>

In [22]:
# === 1) Reconstruir el ColumnTransformer sin lambdas y reentrenar ===
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib, re

# --- función en lugar de lambda ---
def select_text_col(X: pd.DataFrame) -> pd.Series:
    return X["nps_text"].fillna("")

# Pipelines por tipo
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

text_pipeline = Pipeline([
    ("selector", FunctionTransformer(select_text_col, validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

# ColumnTransformer nuevo (SIN lambda)
preprocess_fixed = ColumnTransformer(
    transformers=[
        ("num",  num_pipeline, numeric_features),
        ("cat",  cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# Elegir el mismo clasificador ganador si existe, sino LR
clf = LogisticRegression(max_iter=1000)
try:
    if best_name == "RandomForest":
        clf = RandomForestClassifier(
            n_estimators=300, random_state=42, class_weight="balanced_subsample", n_jobs=-1
        )
except NameError:
    pass

# Pipeline nuevo y ENTRENAR
best_pipe = Pipeline([("preprocess", preprocess_fixed), ("clf", clf)])
best_pipe.fit(X_train, y_train)

# --- Sanidad: verificar que NO queden lambdas en la representación ---
repr_text = repr(best_pipe)
print("¿Queda lambda?:", "<lambda>" in repr_text)

# === 2) Guardar con joblib (debe funcionar ahora) ===
joblib.dump(best_pipe, "model.joblib")
print("✅ Modelo guardado correctamente en model.joblib")


¿Queda lambda?: False
✅ Modelo guardado correctamente en model.joblib


In [23]:
%%writefile app.py
import streamlit as st
import joblib
import pandas as pd

@st.cache_resource
def load_model():
    return joblib.load("model.joblib")

pipe = load_model()

st.title("📊 Predicción de Churn con NPS Text")

tenure = st.number_input("Meses de antigüedad (tenure_months)", min_value=0, step=1)
monthly_charge = st.number_input("Cargo mensual (monthly_charge)", min_value=0.0, step=0.1)
total_charges = st.number_input("Cargos totales (total_charges)", min_value=0.0, step=0.1)
contract_type = st.selectbox("Tipo de contrato", ["Month-to-month", "One year", "Two year"])
payment_method = st.selectbox("Método de pago", ["Electronic check", "Mailed check", "Bank transfer", "Credit card"])
internet_service = st.selectbox("Servicio de internet", ["DSL", "Fiber optic", "No"])
region = st.selectbox("Región", ["North", "South", "East", "West"])
nps_text = st.text_area("Comentario del cliente (nps_text)", "")

if st.button("Predecir"):
    data = pd.DataFrame([{
        "tenure_months": tenure,
        "monthly_charge": monthly_charge,
        "total_charges": total_charges,
        "contract_type": contract_type,
        "payment_method": payment_method,
        "internet_service": internet_service,
        "region": region,
        "nps_text": nps_text
    }])

    pred = pipe.predict(data)[0]
    prob = pipe.predict_proba(data)[0][1]

    st.success(f"✅ Predicción: {'Churn' if pred == 1 else 'No Churn'}")
    st.info(f"📌 Probabilidad de churn: {prob:.2%}")


Writing app.py


In [24]:
!pip install streamlit pyngrok -q


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [28]:
# === 1) Instalar dependencias ===
!pip install streamlit pyngrok -q

# === 2) Conectar con tu token de ngrok ===
from pyngrok import ngrok
!ngrok authtoken 31NHe4eJPI5qac9m7Nf7HjCJYD8_5ZtQFBUtPhq5vDu8GPaXd

# === 3) Lanzar Streamlit y exponerlo con ngrok ===
!streamlit run app.py --server.port 8501 & npx localtunnel --port 8501


[31mERROR: Operation cancelled by user[0m[31m
[0m

KeyboardInterrupt: 

In [27]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from datetime import date

# -------------------------------------------------
# 1) Funciones necesarias para cargar el modelo
#    (esta función existía cuando entrenaste el pipeline)
# -------------------------------------------------
def select_text_col(X: pd.DataFrame) -> pd.Series:
    # Mismo nombre/función que usaste al entrenar
    # Devuelve la columna de texto 'nps_text' con nulos como cadena vacía
    return X["nps_text"].fillna("")

# -------------------------------------------------
# 2) Cargar el pipeline entrenado
# -------------------------------------------------
@st.cache_resource
def load_model(path="model.joblib"):
    # Gracias a que definimos select_text_col arriba,
    # joblib puede deserializar el pipeline sin error.
    return joblib.load(path)

pipe = load_model()

# -------------------------------------------------
# 3) UI de Streamlit
#    Coincide con las FEATURES usadas al entrenar
#    (numéricas, categóricas y texto)
# -------------------------------------------------
st.set_page_config(page_title="Churn Telco", page_icon="📶", layout="centered")
st.title("📶 Predicción de Churn (Telco)")

st.caption("Modelo cargado desde **model.joblib**")

with st.form("form_inputs"):
    st.subheader("Datos del cliente")

    # Fechas para calcular 'days_since_*'
    signup_date = st.date_input("signup_date", value=date(2023, 1, 15))
    last_interaction_date = st.date_input("last_interaction_date", value=date.today())

    # Numéricas
    tenure_months = st.number_input("tenure_months", min_value=0, max_value=240, value=24, step=1)
    monthly_charge = st.number_input("monthly_charge", min_value=0.0, value=65.0, step=0.1)
    total_charges = st.number_input("total_charges", min_value=0.0, value=1560.0, step=0.1)
    support_tickets_30d = st.number_input("support_tickets_30d", min_value=0, max_value=30, value=1, step=1)
    num_services = st.number_input("num_services", min_value=1, max_value=5, value=3, step=1)
    avg_download_mbps = st.number_input("avg_download_mbps", min_value=1.0, value=180.0, step=1.0)
    downtime_hrs_30d = st.number_input("downtime_hrs_30d", min_value=0.0, value=1.5, step=0.1)
    late_payments_12m = st.number_input("late_payments_12m", min_value=0, max_value=24, value=0, step=1)

    # Categóricas (valores ejemplo; usa los que tengas en tu dataset)
    contract_type = st.selectbox("contract_type", ["Mes a mes", "1 año", "2 años"], index=0)
    payment_method = st.selectbox("payment_method", ["Tarjeta", "Débito", "Efectivo", "Billetera", "Cheque"], index=0)
    internet_service = st.selectbox("internet_service", ["Fibra", "Cable", "DSL", "Satélite"], index=0)
    promo_applied = st.selectbox("promo_applied", ["Sí", "No"], index=1)
    region = st.selectbox("region", ["Norte", "Centro", "Sur", "Oriente", "Lima Metropolitana"], index=4)
    device_type = st.selectbox("device_type", ["Modem", "Router", "Combo", "ONT", "Otro"], index=1)

    # Texto
    nps_text = st.text_input("nps_text (comentario breve)", "Todo bien")

    # Umbral
    threshold = st.slider("Umbral de decisión (churn si prob ≥ umbral)",
                          0.05, 0.95, 0.50, 0.01)

    submitted = st.form_submit_button("Predecir")

# -------------------------------------------------
# 4) Construcción de features como en el entrenamiento
#    (incluye derivadas days_since_signup / days_since_last_interaction)
# -------------------------------------------------
def build_features() -> pd.DataFrame:
    REFDATE = pd.Timestamp.today().normalize()
    s = pd.to_datetime(str(signup_date))
    li = pd.to_datetime(str(last_interaction_date))
    data = {
        # Numéricas originales
        "tenure_months": tenure_months,
        "monthly_charge": monthly_charge,
        "total_charges": total_charges,
        "support_tickets_30d": support_tickets_30d,
        "num_services": num_services,
        "avg_download_mbps": avg_download_mbps,
        "downtime_hrs_30d": downtime_hrs_30d,
        "late_payments_12m": late_payments_12m,
        # Derivadas de fecha
        "days_since_signup": (REFDATE - s).days if pd.notna(s) else None,
        "days_since_last_interaction": (REFDATE - li).days if pd.notna(li) else None,
        # Categóricas
        "contract_type": contract_type,
        "payment_method": payment_method,
        "internet_service": internet_service,
        "promo_applied": promo_applied,
        "region": region,
        "device_type": device_type,
        # Texto
        "nps_text": nps_text
    }
    return pd.DataFrame([data])

# -------------------------------------------------
# 5) Predicción
# -------------------------------------------------
if submitted:
    X_infer = build_features()
    proba = float(pipe.predict_proba(X_infer)[:, 1][0])
    pred = int(proba >= threshold)

    st.markdown("### Resultado")
    st.write(f"**Probabilidad de churn:** {proba:.3f}")
    st.write(f"**Predicción (umbral {threshold:.2f}):** {'Churn' if pred==1 else 'No churn'}")

    st.markdown("### Recomendaciones")
    recs = []
    if pred == 1:
        recs.append("- Ofrecer contrato de 1–2 años con beneficios (upgrade, descuento 3–6 meses).")
        recs.append("- Reducir downtime y TMR de soporte; ticket preventivo si hubo caídas.")
        recs.append("- Ajustar tarifa/paquete (bundle) y promociones personalizadas.")
        if late_payments_12m > 0:
            recs.append("- Recordatorios y fraccionamiento; incentivar pago automático.")
        if avg_download_mbps < 80:
            recs.append("- Proponer upgrade de plan/tecnología.")
    else:
        recs.append("- Cliente estable: habilitar cross-sell suave (upgrade de velocidad).")
        recs.append("- Mantener NPS con comunicaciones proactivas y estabilidad del servicio.")

    for r in recs:
        st.write(r)

st.caption("© Telco Churn Demo – Streamlit")


Overwriting app.py


In [29]:
# === Lanzar Streamlit con ngrok (URL directa) ===
!pip -q install streamlit pyngrok

import os, subprocess, time
from pyngrok import ngrok

# Tu token
os.environ["NGROK_TOKEN"] = "31NHe4eJPI5qac9m7Nf7HjCJYD8_5ZtQFBUtPhq5vDu8GPaXd"
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

# Cerrar procesos/túneles previos
try:
  for t in ngrok.get_tunnels():
      ngrok.disconnect(t.public_url)
except:
  pass

try:
  subprocess.run(["pkill","-f","streamlit run app.py"], check=False)
except:
  pass

# Iniciar Streamlit en background
proc = subprocess.Popen(
    ["streamlit", "run", "app.py", "--server.port", "8501", "--server.headless", "true"],
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)

# Abrir túnel ngrok al puerto 8501
time.sleep(3)
public_url = ngrok.connect(8501, "http").public_url
print("\n✅ Tu app está lista:")
print("URL pública:", public_url)
print("Si no carga, vuelve a ejecutar esta celda para reiniciar el túnel.")



✅ Tu app está lista:
URL pública: https://53740e9c7118.ngrok-free.app
Si no carga, vuelve a ejecutar esta celda para reiniciar el túnel.


In [1]:
# === Lanzar Streamlit con ngrok (URL directa) ===
!pip -q install streamlit pyngrok

import os, subprocess, time
from pyngrok import ngrok

# Tu token
os.environ["NGROK_TOKEN"] = "31NHe4eJPI5qac9m7Nf7HjCJYD8_5ZtQFBUtPhq5vDu8GPaXd"
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

# Cerrar procesos/túneles previos
try:
  for t in ngrok.get_tunnels():
      ngrok.disconnect(t.public_url)
except:
  pass

try:
  subprocess.run(["pkill","-f","streamlit run app.py"], check=False)
except:
  pass

# Iniciar Streamlit en background
proc = subprocess.Popen(
    ["streamlit", "run", "app.py", "--server.port", "8501", "--server.headless", "true"],
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)

# Abrir túnel ngrok al puerto 8501
time.sleep(3)
public_url = ngrok.connect(8501, "http").public_url
print("\n✅ Tu app está lista:")
print("URL pública:", public_url)
print("Si no carga, vuelve a ejecutar esta celda para reiniciar el túnel.")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m

✅ Tu app está lista:
URL pública: https://8dc6d867ae07.ngrok-free.app
Si no carga, vuelve a ejecutar esta celda para reiniciar el túnel.


In [2]:
# 🔁 REINICIAR STREAMLIT + NGROK (todo en uno)
!pip -q install streamlit pyngrok

import os, subprocess, time, sys
from pyngrok import ngrok

# 1) Configurar token (si ya lo pusiste antes no pasa nada)
os.environ["NGROK_TOKEN"] = "31NHe4eJPI5qac9m7Nf7HjCJYD8_5ZtQFBUtPhq5vDu8GPaXd"
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

# 2) Cerrar túneles/procesos previos
try:
    for t in ngrok.get_tunnels():
        ngrok.disconnect(t.public_url)
except Exception:
    pass

try:
    subprocess.run(["pkill","-f","streamlit run app.py"], check=False)
except Exception:
    pass

# 3) Lanzar Streamlit en background
log = open("/tmp/streamlit.log", "w")
proc = subprocess.Popen(
    ["streamlit", "run", "app.py", "--server.port", "8501", "--server.headless", "true"],
    stdout=log, stderr=subprocess.STDOUT, text=True
)

# 4) Esperar a que arranque y abrir túnel
time.sleep(4)
public_url = ngrok.connect(8501, "http").public_url
print("✅ Streamlit + ngrok listos")
print("🌍 URL pública:", public_url)
print("📜 Log en /tmp/streamlit.log  (usa: !tail -n 100 /tmp/streamlit.log)")




✅ Streamlit + ngrok listos
🌍 URL pública: https://4743fe043502.ngrok-free.app
📜 Log en /tmp/streamlit.log  (usa: !tail -n 100 /tmp/streamlit.log)


In [3]:
# 🔁 Relanzar Streamlit + ngrok con verificación de salud
!pip -q install streamlit pyngrok

import os, subprocess, time, socket, sys
from pyngrok import ngrok

# 1) Token ngrok
os.environ["NGROK_TOKEN"] = "31NHe4eJPI5qac9m7Nf7HjCJYD8_5ZtQFBUtPhq5vDu8GPaXd"
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

# 2) Cerrar túneles/procesos previos
try:
    for t in ngrok.get_tunnels():
        ngrok.disconnect(t.public_url)
except Exception:
    pass
subprocess.run(["pkill","-f","streamlit run app.py"], check=False)

# 3) Validaciones básicas
assert os.path.exists("app.py"), "No existe app.py en /content"
assert os.path.exists("model.joblib"), "No existe model.joblib en /content (vuelve a entrenar/guardar)"

# 4) Arrancar Streamlit en background y logear
log_path = "/tmp/streamlit.log"
log = open(log_path, "w")
proc = subprocess.Popen(
    ["streamlit","run","app.py","--server.port","8501","--server.headless","true"],
    stdout=log, stderr=subprocess.STDOUT, text=True
)

# 5) Esperar a que el puerto esté abierto (máx ~60s)
def wait_port(port=8501, timeout=60):
    t0 = time.time()
    while time.time() - t0 < timeout:
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return True
        except OSError:
            time.sleep(1)
    return False

ok = wait_port(8501, 60)

# 6) Si no abrió, mostrar log y abortar
if not ok:
    print("❌ Streamlit no abrió el puerto 8501 en 60s. Últimas líneas del log:\n")
    !tail -n 200 /tmp/streamlit.log
    raise SystemExit

# 7) Abrir túnel y mostrar URL
public_url = ngrok.connect(8501, "http").public_url
print("✅ Streamlit + ngrok listos")
print("🌍 URL pública:", public_url)
print("📜 Log: tail -n 100 /tmp/streamlit.log")




AssertionError: No existe app.py en /content

In [4]:
!pwd
!ls -lha


/content
total 16K
drwxr-xr-x 1 root root 4.0K Aug 14 13:36 .
drwxr-xr-x 1 root root 4.0K Aug 16 20:02 ..
drwxr-xr-x 4 root root 4.0K Aug 14 13:35 .config
drwxr-xr-x 1 root root 4.0K Aug 14 13:36 sample_data


In [5]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from datetime import date

def select_text_col(X: pd.DataFrame) -> pd.Series:
    return X["nps_text"].fillna("")

@st.cache_resource
def load_model(path="model.joblib"):
    return joblib.load(path)

pipe = load_model()

st.set_page_config(page_title="Churn Telco", page_icon="📶", layout="centered")
st.title("📶 Predicción de Churn (Telco)")

with st.form("form_inputs"):
    signup_date = st.date_input("signup_date", value=date(2023, 1, 15))
    last_interaction_date = st.date_input("last_interaction_date", value=date.today())

    tenure_months = st.number_input("tenure_months", 0, 240, 24, 1)
    monthly_charge = st.number_input("monthly_charge", 0.0, step=0.1, value=65.0)
    total_charges = st.number_input("total_charges", 0.0, step=0.1, value=1560.0)
    support_tickets_30d = st.number_input("support_tickets_30d", 0, 30, 1, 1)
    num_services = st.number_input("num_services", 1, 5, 3, 1)
    avg_download_mbps = st.number_input("avg_download_mbps", 1.0, step=1.0, value=180.0)
    downtime_hrs_30d = st.number_input("downtime_hrs_30d", 0.0, step=0.1, value=1.5)
    late_payments_12m = st.number_input("late_payments_12m", 0, 24, 0, 1)

    contract_type = st.selectbox("contract_type", ["Mes a mes","1 año","2 años"])
    payment_method = st.selectbox("payment_method", ["Tarjeta","Débito","Efectivo","Billetera","Cheque"])
    internet_service = st.selectbox("internet_service", ["Fibra","Cable","DSL","Satélite"])
    promo_applied = st.selectbox("promo_applied", ["Sí","No"], index=1)
    region = st.selectbox("region", ["Norte","Centro","Sur","Oriente","Lima Metropolitana"], index=4)
    device_type = st.selectbox("device_type", ["Modem","Router","Combo","ONT","Otro"], index=1)

    nps_text = st.text_input("nps_text (comentario breve)", "Todo bien")
    threshold = st.slider("Umbral de decisión (churn si prob ≥ umbral)", 0.05, 0.95, 0.50, 0.01)

    submitted = st.form_submit_button("Predecir")

def build_features() -> pd.DataFrame:
    REFDATE = pd.Timestamp.today().normalize()
    s = pd.to_datetime(str(signup_date))
    li = pd.to_datetime(str(last_interaction_date))
    return pd.DataFrame([{
        "tenure_months": tenure_months,
        "monthly_charge": monthly_charge,
        "total_charges": total_charges,
        "support_tickets_30d": support_tickets_30d,
        "num_services": num_services,
        "avg_download_mbps": avg_download_mbps,
        "downtime_hrs_30d": downtime_hrs_30d,
        "late_payments_12m": late_payments_12m,
        "days_since_signup": (pd.Timestamp.today().normalize() - s).days,
        "days_since_last_interaction": (pd.Timestamp.today().normalize() - li).days,
        "contract_type": contract_type,
        "payment_method": payment_method,
        "internet_service": internet_service,
        "promo_applied": promo_applied,
        "region": region,
        "device_type": device_type,
        "nps_text": nps_text
    }])

if submitted:
    X_infer = build_features()
    proba = float(pipe.predict_proba(X_infer)[:,1][0])
    pred = int(proba >= threshold)

    st.markdown("### Resultado")
    st.write(f"**Probabilidad de churn:** {proba:.3f}")
    st.write(f"**Predicción (umbral {threshold:.2f}):** {'Churn' if pred==1 else 'No churn'}")

    st.markdown("### Recomendaciones")
    recs = []
    if pred == 1:
        recs += [
            "- Ofrecer contrato 1–2 años con beneficios.",
            "- Reducir downtime y TMR de soporte; acción preventiva.",
            "- Ajustar tarifa/paquete y promociones personalizadas."
        ]
        if late_payments_12m > 0: recs.append("- Recordatorios / fraccionamiento; pago automático.")
        if avg_download_mbps < 80: recs.append("- Proponer upgrade de plan/tecnología.")
    else:
        recs += [
            "- Cliente estable: cross-sell suave (upgrade de velocidad).",
            "- Mantener NPS con comunicaciones proactivas."
        ]
    for r in recs: st.write(r)

st.caption("© Telco Churn Demo – Streamlit")


Writing app.py


In [6]:
# 🔁 Relanzar Streamlit + ngrok con verificación de salud
!pip -q install streamlit pyngrok

import os, subprocess, time, socket, sys
from pyngrok import ngrok

# 1) Token ngrok
os.environ["NGROK_TOKEN"] = "31NHe4eJPI5qac9m7Nf7HjCJYD8_5ZtQFBUtPhq5vDu8GPaXd"
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

# 2) Cerrar túneles/procesos previos
try:
    for t in ngrok.get_tunnels():
        ngrok.disconnect(t.public_url)
except Exception:
    pass
subprocess.run(["pkill","-f","streamlit run app.py"], check=False)

# 3) Validaciones básicas
assert os.path.exists("app.py"), "No existe app.py en /content"
assert os.path.exists("model.joblib"), "No existe model.joblib en /content (vuelve a entrenar/guardar)"

# 4) Arrancar Streamlit en background y logear
log_path = "/tmp/streamlit.log"
log = open(log_path, "w")
proc = subprocess.Popen(
    ["streamlit","run","app.py","--server.port","8501","--server.headless","true"],
    stdout=log, stderr=subprocess.STDOUT, text=True
)

# 5) Esperar a que el puerto esté abierto (máx ~60s)
def wait_port(port=8501, timeout=60):
    t0 = time.time()
    while time.time() - t0 < timeout:
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return True
        except OSError:
            time.sleep(1)
    return False

ok = wait_port(8501, 60)

# 6) Si no abrió, mostrar log y abortar
if not ok:
    print("❌ Streamlit no abrió el puerto 8501 en 60s. Últimas líneas del log:\n")
    !tail -n 200 /tmp/streamlit.log
    raise SystemExit

# 7) Abrir túnel y mostrar URL
public_url = ngrok.connect(8501, "http").public_url
print("✅ Streamlit + ngrok listos")
print("🌍 URL pública:", public_url)
print("📜 Log: tail -n 100 /tmp/streamlit.log")


AssertionError: No existe model.joblib en /content (vuelve a entrenar/guardar)

In [7]:
from google.colab import drive
drive.mount('/content/drive')

# Ajusta la ruta si lo guardaste en otra carpeta
!cp "/content/drive/MyDrive/model.joblib" .
!ls -lha


Mounted at /content/drive
cp: cannot stat '/content/drive/MyDrive/model.joblib': No such file or directory
total 24K
drwxr-xr-x 1 root root 4.0K Aug 16 20:40 .
drwxr-xr-x 1 root root 4.0K Aug 16 20:02 ..
-rw-r--r-- 1 root root 4.0K Aug 16 20:38 app.py
drwxr-xr-x 4 root root 4.0K Aug 14 13:35 .config
drwx------ 5 root root 4.0K Aug 16 20:40 drive
drwxr-xr-x 1 root root 4.0K Aug 14 13:36 sample_data


In [8]:
# Relanzar Streamlit + ngrok con verificación
!pip -q install streamlit pyngrok

import os, subprocess, time, socket
from pyngrok import ngrok

os.environ["NGROK_TOKEN"] = "31NHe4eJPI5qac9m7Nf7HjCJYD8_5ZtQFBUtPhq5vDu8GPaXd"
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

# cerrar previos
try:
    for t in ngrok.get_tunnels():
        ngrok.disconnect(t.public_url)
except: pass
subprocess.run(["pkill","-f","streamlit run app.py"], check=False)

# sanity
assert os.path.exists("app.py"), "No existe app.py"
assert os.path.exists("model.joblib"), "No existe model.joblib"

# lanzar
log = open("/tmp/streamlit.log","w")
proc = subprocess.Popen(
    ["streamlit","run","app.py","--server.port","8501","--server.headless","true"],
    stdout=log, stderr=subprocess.STDOUT, text=True
)

# esperar puerto
def wait_port(port=8501, timeout=60):
    t0=time.time()
    import socket
    while time.time()-t0<timeout:
        try:
            with socket.create_connection(("127.0.0.1",port),timeout=1): return True
        except OSError: time.sleep(1)
    return False

if not wait_port():
    print("❌ No abrió el puerto 8501. Log:\n")
    !tail -n 200 /tmp/streamlit.log
else:
    public_url = ngrok.connect(8501,"http").public_url
    print("✅ Streamlit + ngrok listos")
    print("🌍 URL pública:", public_url)


AssertionError: No existe model.joblib

In [9]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
from datetime import date

# -------------------------------------------------
# 1) Funciones necesarias para cargar el modelo
#    (esta función existía cuando entrenaste el pipeline)
# -------------------------------------------------
def select_text_col(X: pd.DataFrame) -> pd.Series:
    # Mismo nombre/función que usaste al entrenar
    # Devuelve la columna de texto 'nps_text' con nulos como cadena vacía
    return X["nps_text"].fillna("")

# -------------------------------------------------
# 2) Cargar el pipeline entrenado
# -------------------------------------------------
@st.cache_resource
def load_model(path="model.joblib"):
    # Gracias a que definimos select_text_col arriba,
    # joblib puede deserializar el pipeline sin error.
    return joblib.load(path)

pipe = load_model()

# -------------------------------------------------
# 3) UI de Streamlit
#    Coincide con las FEATURES usadas al entrenar
#    (numéricas, categóricas y texto)
# -------------------------------------------------
st.set_page_config(page_title="Churn Telco", page_icon="📶", layout="centered")
st.title("📶 Predicción de Churn (Telco)")

st.caption("Modelo cargado desde **model.joblib**")

with st.form("form_inputs"):
    st.subheader("Datos del cliente")

    # Fechas para calcular 'days_since_*'
    signup_date = st.date_input("signup_date", value=date(2023, 1, 15))
    last_interaction_date = st.date_input("last_interaction_date", value=date.today())

    # Numéricas
    tenure_months = st.number_input("tenure_months", min_value=0, max_value=240, value=24, step=1)
    monthly_charge = st.number_input("monthly_charge", min_value=0.0, value=65.0, step=0.1)
    total_charges = st.number_input("total_charges", min_value=0.0, value=1560.0, step=0.1)
    support_tickets_30d = st.number_input("support_tickets_30d", min_value=0, max_value=30, value=1, step=1)
    num_services = st.number_input("num_services", min_value=1, max_value=5, value=3, step=1)
    avg_download_mbps = st.number_input("avg_download_mbps", min_value=1.0, value=180.0, step=1.0)
    downtime_hrs_30d = st.number_input("downtime_hrs_30d", min_value=0.0, value=1.5, step=0.1)
    late_payments_12m = st.number_input("late_payments_12m", min_value=0, max_value=24, value=0, step=1)

    # Categóricas (valores ejemplo; usa los que tengas en tu dataset)
    contract_type = st.selectbox("contract_type", ["Mes a mes", "1 año", "2 años"], index=0)
    payment_method = st.selectbox("payment_method", ["Tarjeta", "Débito", "Efectivo", "Billetera", "Cheque"], index=0)
    internet_service = st.selectbox("internet_service", ["Fibra", "Cable", "DSL", "Satélite"], index=0)
    promo_applied = st.selectbox("promo_applied", ["Sí", "No"], index=1)
    region = st.selectbox("region", ["Norte", "Centro", "Sur", "Oriente", "Lima Metropolitana"], index=4)
    device_type = st.selectbox("device_type", ["Modem", "Router", "Combo", "ONT", "Otro"], index=1)

    # Texto
    nps_text = st.text_input("nps_text (comentario breve)", "Todo bien")

    # Umbral
    threshold = st.slider("Umbral de decisión (churn si prob ≥ umbral)",
                          0.05, 0.95, 0.50, 0.01)

    submitted = st.form_submit_button("Predecir")

# -------------------------------------------------
# 4) Construcción de features como en el entrenamiento
#    (incluye derivadas days_since_signup / days_since_last_interaction)
# -------------------------------------------------
def build_features() -> pd.DataFrame:
    REFDATE = pd.Timestamp.today().normalize()
    s = pd.to_datetime(str(signup_date))
    li = pd.to_datetime(str(last_interaction_date))
    data = {
        # Numéricas originales
        "tenure_months": tenure_months,
        "monthly_charge": monthly_charge,
        "total_charges": total_charges,
        "support_tickets_30d": support_tickets_30d,
        "num_services": num_services,
        "avg_download_mbps": avg_download_mbps,
        "downtime_hrs_30d": downtime_hrs_30d,
        "late_payments_12m": late_payments_12m,
        # Derivadas de fecha
        "days_since_signup": (REFDATE - s).days if pd.notna(s) else None,
        "days_since_last_interaction": (REFDATE - li).days if pd.notna(li) else None,
        # Categóricas
        "contract_type": contract_type,
        "payment_method": payment_method,
        "internet_service": internet_service,
        "promo_applied": promo_applied,
        "region": region,
        "device_type": device_type,
        # Texto
        "nps_text": nps_text
    }
    return pd.DataFrame([data])

# -------------------------------------------------
# 5) Predicción
# -------------------------------------------------
if submitted:
    X_infer = build_features()
    proba = float(pipe.predict_proba(X_infer)[:, 1][0])
    pred = int(proba >= threshold)

    st.markdown("### Resultado")
    st.write(f"**Probabilidad de churn:** {proba:.3f}")
    st.write(f"**Predicción (umbral {threshold:.2f}):** {'Churn' if pred==1 else 'No churn'}")

    st.markdown("### Recomendaciones")
    recs = []
    if pred == 1:
        recs.append("- Ofrecer contrato de 1–2 años con beneficios (upgrade, descuento 3–6 meses).")
        recs.append("- Reducir downtime y TMR de soporte; ticket preventivo si hubo caídas.")
        recs.append("- Ajustar tarifa/paquete (bundle) y promociones personalizadas.")
        if late_payments_12m > 0:
            recs.append("- Recordatorios y fraccionamiento; incentivar pago automático.")
        if avg_download_mbps < 80:
            recs.append("- Proponer upgrade de plan/tecnología.")
    else:
        recs.append("- Cliente estable: habilitar cross-sell suave (upgrade de velocidad).")
        recs.append("- Mantener NPS con comunicaciones proactivas y estabilidad del servicio.")

    for r in recs:
        st.write(r)

st.caption("© Telco Churn Demo – Streamlit")


Overwriting app.py


In [10]:
# === Lanzar Streamlit con ngrok (URL directa) ===
!pip -q install streamlit pyngrok

import os, subprocess, time
from pyngrok import ngrok

# Ponemos tu token aquí para automatizar
os.environ["NGROK_TOKEN"] = "31NHe4eJPI5qac9m7Nf7HjCJYD8_5ZtQFBUtPhq5vDu8GPaXd"
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

# Cerrar procesos previos / túneles, si los hubiera
try:
  for t in ngrok.get_tunnels():
      ngrok.disconnect(t.public_url)
except:
  pass

try:
  subprocess.run(["pkill","-f","streamlit run app.py"], check=False)
except:
  pass

# Iniciar Streamlit en background
proc = subprocess.Popen(
    ["streamlit", "run", "app.py", "--server.port", "8501", "--server.headless", "true"],
    stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)

# Abrir túnel ngrok al puerto 8501
time.sleep(3)
public_url = ngrok.connect(8501, "http").public_url
print("\n✅ Tu app está lista:")
print("URL pública:", public_url)
print("Si no carga, vuelve a ejecutar esta celda para reiniciar el túnel.")



✅ Tu app está lista:
URL pública: https://b84c288fd869.ngrok-free.app
Si no carga, vuelve a ejecutar esta celda para reiniciar el túnel.


In [11]:
import joblib
joblib.dump(best_pipe, "model.joblib")


NameError: name 'best_pipe' is not defined

In [12]:
# === (Re)entrenar modelo y guardar model.joblib — listo para la app ===
import os, pandas as pd, numpy as np, joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 0) Cargar CSV (intenta detectar separador)
csv_path = "telecom_churn_dataset.csv"  # cambia si tu archivo se llama distinto
assert os.path.exists(csv_path), f"No encuentro {csv_path}. Sube el CSV y vuelve a ejecutar."

def read_any_sep(path):
    # primer intento: autodetección
    try:
        df = pd.read_csv(path, sep=None, engine="python")
        # si quedó en 1 columna (todo pegado), probar con ';'
        if df.shape[1] == 1:
            df = pd.read_csv(path, sep=";")
    except Exception:
        df = pd.read_csv(path, sep=";")
    return df

df = read_any_sep(csv_path)
print("Shape:", df.shape)
print("Columnas:", list(df.columns))

# 1) Conversión de fechas + derivadas
for col in ["signup_date","last_interaction_date"]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce", dayfirst=True)
REFDATE = pd.Timestamp.today().normalize()
df["days_since_signup"] = (REFDATE - df["signup_date"]).dt.days
df["days_since_last_interaction"] = (REFDATE - df["last_interaction_date"]).dt.days

# 2) Definir target y features
assert "churn" in df.columns, "Tu CSV debe tener la columna target 'churn'."
y = df["churn"].astype(int)

numeric_features = [
    "tenure_months","monthly_charge","total_charges","support_tickets_30d","num_services",
    "avg_download_mbps","downtime_hrs_30d","late_payments_12m",
    "days_since_signup","days_since_last_interaction"
]
categorical_features = [
    "contract_type","payment_method","internet_service","promo_applied","region","device_type"
]
text_feature = "nps_text"

# 3) Pipeline de preprocesamiento (sin lambdas anónimas)
def select_text_col(X: pd.DataFrame) -> pd.Series:
    # la app espera esta misma firma
    return X[text_feature].fillna("")

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
text_pipeline = Pipeline([
    ("selector", FunctionTransformer(select_text_col, validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num",  num_pipeline, numeric_features),
        ("cat",  cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# 4) Train/test y modelo
X = df[numeric_features + categorical_features + [text_feature]].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

clf = LogisticRegression(max_iter=1000)
pipe = Pipeline([("preprocess", preprocess), ("clf", clf)])
pipe.fit(X_train, y_train)

# 5) Métricas rápidas
proba = pipe.predict_proba(X_test)[:,1]
pred = (proba >= 0.5).astype(int)
print("\nClassification report (umbral=0.50):\n", classification_report(y_test, pred, digits=3))
print("AUC test:", roc_auc_score(y_test, proba))

# 6) Guardar artefacto
joblib.dump(pipe, "model.joblib")
print("\n✅ Guardado: model.joblib")


AssertionError: No encuentro telecom_churn_dataset.csv. Sube el CSV y vuelve a ejecutar.

In [13]:
from google.colab import files
uploaded = files.upload()


Saving telecom_churn_dataset.csv to telecom_churn_dataset.csv


In [14]:
# === (Re)entrenar modelo y guardar model.joblib — listo para la app ===
import os, pandas as pd, numpy as np, joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 0) Cargar CSV (intenta detectar separador)
csv_path = "telecom_churn_dataset.csv"  # cambia si tu archivo se llama distinto
assert os.path.exists(csv_path), f"No encuentro {csv_path}. Sube el CSV y vuelve a ejecutar."

def read_any_sep(path):
    # primer intento: autodetección
    try:
        df = pd.read_csv(path, sep=None, engine="python")
        # si quedó en 1 columna (todo pegado), probar con ';'
        if df.shape[1] == 1:
            df = pd.read_csv(path, sep=";")
    except Exception:
        df = pd.read_csv(path, sep=";")
    return df

df = read_any_sep(csv_path)
print("Shape:", df.shape)
print("Columnas:", list(df.columns))

# 1) Conversión de fechas + derivadas
for col in ["signup_date","last_interaction_date"]:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce", dayfirst=True)
REFDATE = pd.Timestamp.today().normalize()
df["days_since_signup"] = (REFDATE - df["signup_date"]).dt.days
df["days_since_last_interaction"] = (REFDATE - df["last_interaction_date"]).dt.days

# 2) Definir target y features
assert "churn" in df.columns, "Tu CSV debe tener la columna target 'churn'."
y = df["churn"].astype(int)

numeric_features = [
    "tenure_months","monthly_charge","total_charges","support_tickets_30d","num_services",
    "avg_download_mbps","downtime_hrs_30d","late_payments_12m",
    "days_since_signup","days_since_last_interaction"
]
categorical_features = [
    "contract_type","payment_method","internet_service","promo_applied","region","device_type"
]
text_feature = "nps_text"

# 3) Pipeline de preprocesamiento (sin lambdas anónimas)
def select_text_col(X: pd.DataFrame) -> pd.Series:
    # la app espera esta misma firma
    return X[text_feature].fillna("")

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
text_pipeline = Pipeline([
    ("selector", FunctionTransformer(select_text_col, validate=False)),
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), max_features=300))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num",  num_pipeline, numeric_features),
        ("cat",  cat_pipeline, categorical_features),
        ("text", text_pipeline, [text_feature]),
    ],
    remainder="drop"
)

# 4) Train/test y modelo
X = df[numeric_features + categorical_features + [text_feature]].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

clf = LogisticRegression(max_iter=1000)
pipe = Pipeline([("preprocess", preprocess), ("clf", clf)])
pipe.fit(X_train, y_train)

# 5) Métricas rápidas
proba = pipe.predict_proba(X_test)[:,1]
pred = (proba >= 0.5).astype(int)
print("\nClassification report (umbral=0.50):\n", classification_report(y_test, pred, digits=3))
print("AUC test:", roc_auc_score(y_test, proba))

# 6) Guardar artefacto
joblib.dump(pipe, "model.joblib")
print("\n✅ Guardado: model.joblib")



Shape: (1500, 19)
Columnas: ['customer_id', 'signup_date', 'last_interaction_date', 'tenure_months', 'monthly_charge', 'total_charges', 'contract_type', 'payment_method', 'internet_service', 'support_tickets_30d', 'num_services', 'promo_applied', 'region', 'device_type', 'avg_download_mbps', 'downtime_hrs_30d', 'late_payments_12m', 'nps_text', 'churn']

Classification report (umbral=0.50):
               precision    recall  f1-score   support

           0      0.605     0.635     0.620       200
           1      0.558     0.526     0.541       175

    accuracy                          0.584       375
   macro avg      0.581     0.580     0.580       375
weighted avg      0.583     0.584     0.583       375

AUC test: 0.6202571428571428

✅ Guardado: model.joblib


In [15]:
# === Lanzar Streamlit con ngrok (URL directa) ===
!pip -q install streamlit pyngrok

import os, subprocess, time, socket
from pyngrok import ngrok

os.environ["NGROK_TOKEN"] = "31NHe4eJPI5qac9m7Nf7HjCJYD8_5ZtQFBUtPhq5vDu8GPaXd"
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

# cerrar previos
try:
    for t in ngrok.get_tunnels():
        ngrok.disconnect(t.public_url)
except: pass
subprocess.run(["pkill","-f","streamlit run app.py"], check=False)

# sanity
assert os.path.exists("app.py"), "No existe app.py"
assert os.path.exists("model.joblib"), "No existe model.joblib"

# lanzar
log = open("/tmp/streamlit.log","w")
proc = subprocess.Popen(
    ["streamlit","run","app.py","--server.port","8501","--server.headless","true"],
    stdout=log, stderr=subprocess.STDOUT, text=True
)

# esperar puerto
def wait_port(port=8501, timeout=60):
    t0=time.time()
    while time.time()-t0<timeout:
        try:
            with socket.create_connection(("127.0.0.1",port),timeout=1): return True
        except OSError: time.sleep(1)
    return False

if not wait_port():
    print("❌ No abrió el puerto 8501. Log:\n")
    !tail -n 200 /tmp/streamlit.log
else:
    public_url = ngrok.connect(8501,"http").public_url
    print("✅ Streamlit + ngrok listos")
    print("🌍 URL pública:", public_url)




✅ Streamlit + ngrok listos
🌍 URL pública: https://252a874ec40c.ngrok-free.app
