In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
# Load the dataset
df = pd.read_csv("telco.csv")
df.head()

Unnamed: 0,Customer ID,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents,Country,State,...,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Satisfaction Score,Customer Status,Churn Label,Churn Score,CLTV,Churn Category,Churn Reason
0,8779-QRDMV,Male,78,No,Yes,No,No,0,United States,California,...,20,0.0,59.65,3,Churned,Yes,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,Female,74,No,Yes,Yes,Yes,1,United States,California,...,0,390.8,1024.1,3,Churned,Yes,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,Male,71,No,Yes,No,Yes,3,United States,California,...,0,203.94,1910.88,2,Churned,Yes,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,Female,78,No,Yes,Yes,Yes,1,United States,California,...,0,494.0,2995.07,2,Churned,Yes,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,Female,80,No,Yes,Yes,Yes,1,United States,California,...,0,234.21,3102.36,2,Churned,Yes,67,2793,Price,Extra data charges


In [3]:
# Selecting relevant features
features = ['Gender', 'Age', 'Tenure in Months', 'Contract', 'Monthly Charge', 'Total Revenue']
target = 'Churn Label'

df = df[features + [target]]

In [4]:
# Encode categorical variables
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df['Contract'] = le.fit_transform(df['Contract'])
df[target] = le.fit_transform(df[target])  # Yes -> 1, No -> 0

In [5]:
# Split dataset into training (70%), validation (15%), and test (15%)
X = df.drop(columns=[target])
y = df[target]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [6]:
# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [7]:
# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [8]:
# Evaluate on validation set
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

Validation Accuracy: 0.8153409090909091
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.88       776
           1       0.69      0.55      0.61       280

    accuracy                           0.82      1056
   macro avg       0.77      0.73      0.75      1056
weighted avg       0.81      0.82      0.81      1056



In [9]:
# Save the trained model and scaler
joblib.dump(model, "telco_model.joblib")
joblib.dump(scaler, "telco_scaler.joblib")
joblib.dump(le, "telco_encoder.joblib")


['telco_encoder.joblib']

In [10]:
# Saving Data to Database
from sqlalchemy import create_engine

host = r'127.0.0.1' # denotes that the db in a local installation
db = r'MSDS610' # db we just created
user = r'postgres' # using the postgres user for this demo
pw = r'your_new_password' # this is the password established during installation
port = r'5432' # default port estabalished during install
schema = r'telco' # schema we just created


In [18]:
import psycopg2

conn = psycopg2.connect(
    dbname="MSDS610",
    user="postgres",
    password="your_new_password",
    host="localhost",
    port="5432"
)

cur = conn.cursor()
cur.execute("SELECT * FROM raw_data LIMIT 5;")

for row in cur.fetchall():
    print(row)

cur.close()
conn.close()


(1, 'Hyde Park - Walk to UChicago, 10 min to McCormick', 'Hyde Park', Decimal('60'))
(2, '394 Great Reviews. 127 y/o House. 40 yds to train.', 'South Lawndale', Decimal('105'))
(3, 'Tiny Studio Apartment 94 Walk Score', 'West Town', Decimal('60'))
(4, "Barbara's Hideaway - Old Town", 'Lincoln Park', Decimal('65'))
(5, '3 Comforts of Cooperative Living', 'Hyde Park', Decimal('21'))
