In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import sqlite3
import matplotlib.pyplot as plt

churn_df = pd.read_csv("../Task4_DataPreparation/processed_data/clean_hf.csv")
telco_df = pd.read_csv("../Task4_DataPreparation/processed_data/clean_kaggle.csv")

In [2]:
print("Data loaded successfully!")
print("Churn Modelling shape:", churn_df.shape)
print("Telco Churn shape:", telco_df.shape)

print("Churn Modelling columns:")
print(churn_df.columns.tolist())
print("Telco Churn columns:")
print(telco_df.columns.tolist())

Data loaded successfully!
Churn Modelling shape: (1409, 52)
Telco Churn shape: (7043, 21)
Churn Modelling columns:
['Age', 'Avg Monthly GB Download', 'Avg Monthly Long Distance Charges', 'Churn Category', 'Churn Reason', 'Churn Score', 'City', 'CLTV', 'Contract', 'Country', 'Customer ID', 'Customer Status', 'Dependents', 'Device Protection Plan', 'Gender', 'Internet Service', 'Internet Type', 'Lat Long', 'Latitude', 'Longitude', 'Married', 'Monthly Charge', 'Multiple Lines', 'Number of Dependents', 'Number of Referrals', 'Offer', 'Online Backup', 'Online Security', 'Paperless Billing', 'Partner', 'Payment Method', 'Phone Service', 'Population', 'Premium Tech Support', 'Quarter', 'Referred a Friend', 'Satisfaction Score', 'Senior Citizen', 'State', 'Streaming Movies', 'Streaming Music', 'Streaming TV', 'Tenure in Months', 'Total Charges', 'Total Extra Data Charges', 'Total Long Distance Charges', 'Total Refunds', 'Total Revenue', 'Under 30', 'Unlimited Data', 'Zip Code', 'Churn']
Telco 

In [3]:

# ----- Feature Engineering for Churn Modelling -----
if {"MonthlyCharges", "tenure"}.issubset(churn_df.columns):
    churn_df["TotalSpend"] = churn_df["MonthlyCharges"] * churn_df["tenure"]

    churn_df["AvgMonthlySpend"] = np.where(
        churn_df["tenure"] > 0,
        churn_df["TotalSpend"] / churn_df["tenure"],
        churn_df["MonthlyCharges"]
    )

    churn_df["TenureGroup"] = pd.cut(
        churn_df["tenure"],
        bins=[0, 12, 24, 48, 72],
        labels=["0-1yr", "1-2yr", "2-4yr", "4-6yr"]
    )

    # Example: Activity frequency (if NumOfProducts exists)
    if "NumOfProducts" in churn_df.columns:
        churn_df["ActivityFrequency"] = churn_df["NumOfProducts"] / (churn_df["tenure"] + 1)

# ----- Feature Engineering for Telco Churn -----
if {"Monthly Charge", "Tenure in Months"}.issubset(telco_df.columns):
    telco_df["TotalSpend"] = telco_df["Monthly Charge"] * telco_df["Tenure in Months"]

    telco_df["AvgMonthlySpend"] = np.where(
        telco_df["Tenure in Months"] > 0,
        telco_df["TotalSpend"] / telco_df["Tenure in Months"],
        telco_df["Monthly Charge"]
    )

    telco_df["TenureGroup"] = pd.cut(
        telco_df["Tenure in Months"],
        bins=[0, 12, 24, 48, 72],
        labels=["0-1yr", "1-2yr", "2-4yr", "4-6yr"]
    )

    # Example: Activity frequency (if Number of Referrals exists)
    if "Number of Referrals" in telco_df.columns:
        telco_df["ActivityFrequency"] = telco_df["Number of Referrals"] / (telco_df["Tenure in Months"] + 1)

print("Feature engineering applied successfully!")
print("Churn Modelling new cols:", [col for col in churn_df.columns if col in ["TotalSpend","AvgMonthlySpend","TenureGroup","ActivityFrequency"]])
print("Telco Churn new cols:", [col for col in telco_df.columns if col in ["TotalSpend","AvgMonthlySpend","TenureGroup","ActivityFrequency"]])


Feature engineering applied successfully!
Churn Modelling new cols: []
Telco Churn new cols: []


In [4]:
scaler = StandardScaler()

numeric_cols_telco = telco_df.select_dtypes(include=[np.number]).columns
telco_df[numeric_cols_telco] = scaler.fit_transform(telco_df[numeric_cols_telco])

numeric_cols_churn = churn_df.select_dtypes(include=[np.number]).columns
churn_df[numeric_cols_churn] = scaler.fit_transform(churn_df[numeric_cols_churn])

print("Feature scaling done!")

Feature scaling done!


In [5]:
conn = sqlite3.connect("customer_churn.db")

# Save transformed datasets into SQL tables
churn_df.to_sql("ChurnModelling", conn, if_exists="replace", index=False)
telco_df.to_sql("TelcoChurn", conn, if_exists="replace", index=False)

print(" Data stored in SQLite DB!")

 Data stored in SQLite DB!


In [6]:
schema_churn = """
CREATE TABLE ChurnModelling (
    RowNumber INTEGER,
    CustomerId INTEGER,
    Surname TEXT,
    CreditScore REAL,
    Geography TEXT,
    Gender TEXT,
    Age REAL,
    Tenure REAL,
    Balance REAL,
    NumOfProducts REAL,
    HasCrCard REAL,
    IsActiveMember REAL,
    EstimatedSalary REAL,
    Exited REAL
);
"""


schema_telco = """
CREATE TABLE TelcoChurn (
    customerID TEXT PRIMARY KEY,
    gender TEXT,
    SeniorCitizen INTEGER,
    Partner TEXT,
    Dependents TEXT,
    tenure REAL,
    PhoneService TEXT,
    MultipleLines TEXT,
    InternetService TEXT,
    OnlineSecurity TEXT,
    OnlineBackup TEXT,
    DeviceProtection TEXT,
    TechSupport TEXT,
    StreamingTV TEXT,
    StreamingMovies TEXT,
    Contract TEXT,
    PaperlessBilling TEXT,
    PaymentMethod TEXT,
    MonthlyCharges REAL,
    TotalCharges REAL,
    Churn TEXT
);
"""


In [7]:
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS ChurnModelling;")
cursor.execute("DROP TABLE IF EXISTS TelcoChurn;")
cursor.execute(schema_churn)
cursor.execute(schema_telco)
conn.commit()

In [8]:
churn_df.to_sql("ChurnModelling", conn, if_exists="replace", index=False)
telco_df.to_sql("TelcoChurn", conn, if_exists="replace", index=False)


7043

In [9]:
# 5. SAMPLE SQL QUERIES
print("\n Running sample queries:")

# Query 1: Average spend per tenure group (Telco)
query1 = pd.read_sql("""
SELECT 
    CASE 
        WHEN tenure BETWEEN 0 AND 12 THEN '0-12'
        WHEN tenure BETWEEN 13 AND 24 THEN '13-24'
        WHEN tenure BETWEEN 25 AND 36 THEN '25-36'
        WHEN tenure BETWEEN 37 AND 48 THEN '37-48'
        WHEN tenure BETWEEN 49 AND 60 THEN '49-60'
        WHEN tenure BETWEEN 61 AND 72 THEN '61-72'
    END AS TenureGroup,
    AVG(MonthlyCharges) AS AvgSpend
FROM TelcoChurn
GROUP BY TenureGroup;
""", conn)
print(query1)

# Query 2: Churn rate by geography (ChurnModelling)
query2 = pd.read_sql("""
SELECT AVG(Churn) AS ChurnRate
FROM ChurnModelling;
""", conn)
print(query2)

# Close connection
conn.close()


 Running sample queries:
  TenureGroup  AvgSpend
0        None -0.193330
1        0-12  0.223323
      ChurnRate
0  3.088768e-17
