In [1]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [2]:
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.7.4.5-py3-none-any.whl.metadata (16 kB)
Downloading kaggle-1.7.4.5-py3-none-any.whl (181 kB)
Installing collected packages: kaggle
Successfully installed kaggle-1.7.4.5


In [3]:
!kaggle datasets download -d blastchar/telco-customer-churn -p .

Dataset URL: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
License(s): copyright-authors
Downloading telco-customer-churn.zip to .
  0%|                                                | 0.00/172k [00:00<?, ?B/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 172k/172k [00:00<00:00, 644MB/s]


In [4]:
!unzip -o telco-customer-churn.zip

Archive:  telco-customer-churn.zip
  inflating: WA_Fn-UseC_-Telco-Customer-Churn.csv  


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CSV
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load CSV (adjust path if needed)
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# 1. Drop customerID (not useful for prediction)
df = df.drop(columns=["customerID"])

# 2. Fix TotalCharges that are empty strings
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# 3. Convert target column 'Churn' to numeric (0/1)
df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})

# 4. One-hot encode all categorical variables
df_encoded = pd.get_dummies(df, drop_first=True)

# 5. Separate features and label
y = df_encoded["Churn"]
X = df_encoded.drop(columns=["Churn"])

# 6. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7. ðŸ”¥ Critical fix: convert ALL boolean columns to integers
X_train = X_train.astype(int)
X_test = X_test.astype(int)

# 8. XGBoost in SageMaker expects "label column first"
train_df = pd.concat([y_train, X_train], axis=1)
test_df  = pd.concat([y_test,  X_test],  axis=1)

train_df.head()

Unnamed: 0,Churn,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
3738,0,0,35,49,1701,1,0,0,0,1,...,0,1,0,1,0,0,0,0,1,0
3151,0,0,15,75,1151,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1
4860,0,0,13,40,590,1,1,1,0,1,...,0,0,0,0,0,1,0,0,0,1
3867,0,0,26,73,1905,0,1,0,1,0,...,0,1,0,1,0,1,1,1,0,0
3810,0,0,1,44,44,1,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
train_df.to_csv("telco_train.csv", index=False, header=False)
test_df.to_csv("telco_test.csv", index=False, header=False)

In [11]:
import sagemaker
from sagemaker import Session

session = sagemaker.Session()
bucket = session.default_bucket()  # SageMaker auto-creates this bucket
prefix = "telco-churn"            # folder prefix in S3

print("S3 bucket:", bucket)


S3 bucket: sagemaker-us-east-1-473191218617


In [12]:
s3_train_path = session.upload_data(
    "telco_train.csv",
    bucket=bucket,
    key_prefix=f"{prefix}/train"
)

s3_test_path = session.upload_data(
    "telco_test.csv",
    bucket=bucket,
    key_prefix=f"{prefix}/test"
)

s3_train_path, s3_test_path

('s3://sagemaker-us-east-1-473191218617/telco-churn/train/telco_train.csv',
 's3://sagemaker-us-east-1-473191218617/telco-churn/test/telco_test.csv')

In [13]:
import boto3
import sagemaker
from sagemaker.image_uris import retrieve

session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Get XGBoost training container
xgboost_image = retrieve("xgboost", region, version="1.5-1")

from sagemaker.estimator import Estimator

xgb_estimator = Estimator(
    image_uri=xgboost_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size=5,
    max_run=3600,
    sagemaker_session=session,
)

# Hyperparameters for XGBoost
xgb_estimator.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="auc",
    num_round=200,
    max_depth=5,
    eta=0.2,
    subsample=0.8,
    colsample_bytree=0.8,
)

from sagemaker.inputs import TrainingInput

train_input = TrainingInput(
    s3_data=s3_train_path,
    content_type="text/csv"
)

validation_input = TrainingInput(
    s3_data=s3_test_path,
    content_type="text/csv"
)

# TRAIN THE MODEL
xgb_estimator.fit({"train": train_input, "validation": validation_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-20-22-47-53-742


2025-11-20 22:47:56 Starting - Starting the training job...
2025-11-20 22:48:10 Starting - Preparing the instances for training...
2025-11-20 22:48:32 Downloading - Downloading input data...
2025-11-20 22:49:17 Downloading - Downloading the training image......
2025-11-20 22:50:29 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-20 22:50:20.509 ip-10-0-73-18.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-20 22:50:20.536 ip-10-0-73-18.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-20:22:50:20:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-20:22:50:20:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-20:22:50:20:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value

In [15]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Deploy the trained XGBoost model to a live endpoint
predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large"   # you can switch to 'ml.t2.medium' to save cost
)

predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2025-11-20-23-15-24-956
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2025-11-20-23-15-24-956
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2025-11-20-23-15-24-956


------!

In [16]:
# Take one example row from X_test
sample = X_test.iloc[0:1]
print("Sample input features:\n", sample)

# Convert the row to CSV format (required by SageMaker XGBoost endpoint)
payload = ",".join([str(x) for x in sample.values.flatten().tolist()])

print("\nPayload sent to endpoint:\n", payload)

# Call the deployed model
prediction = predictor.predict(payload)
prediction

Sample input features:
      SeniorCitizen  tenure  MonthlyCharges  TotalCharges  gender_Male  \
437              0      72             114          8468            1   

     Partner_Yes  Dependents_Yes  PhoneService_Yes  \
437            1               1                 1   

     MultipleLines_No phone service  MultipleLines_Yes  ...  \
437                               0                  1  ...   

     StreamingTV_No internet service  StreamingTV_Yes  \
437                                0                1   

     StreamingMovies_No internet service  StreamingMovies_Yes  \
437                                    0                    1   

     Contract_One year  Contract_Two year  PaperlessBilling_Yes  \
437                  0                  1                     1   

     PaymentMethod_Credit card (automatic)  PaymentMethod_Electronic check  \
437                                      1                               0   

     PaymentMethod_Mailed check  
437                  

{'predictions': [{'score': 0.0025536122266203165}]}

In [17]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Collect predictions for all rows in X_test
preds = []
for i in range(len(X_test)):
    row = X_test.iloc[i:i+1]
    payload = ",".join([str(x) for x in row.values.flatten().tolist()])
    result = predictor.predict(payload)
    score = result['predictions'][0]['score']
    preds.append(score)

# Convert list to numpy
y_prob = np.array(preds)

# Convert probabilities to class labels (0/1)
y_pred = (y_prob > 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
cm = confusion_matrix(y_test, y_pred)

accuracy, precision, recall, f1, auc, cm

(0.7785663591199432,
 0.5981012658227848,
 0.5053475935828877,
 0.5478260869565217,
 0.8153375184065721,
 array([[908, 127],
        [185, 189]]))