In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import sqlite3
import matplotlib.pyplot as plt

# Read both cleaned CSVs in one pass; the order of the paths matches the
# order of the names on the left-hand side.
churn_df, telco_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data_prepration/processed_data/clean_hf.csv",
        "../Data_prepration/processed_data/clean_kaggle.csv",
    )
)

In [None]:
# Quick sanity report on what was loaded: shapes first, then column names.
print("Data loaded successfully!")

shape_report = (
    ("Churn Modelling shape:", churn_df),
    ("Telco Churn shape:", telco_df),
)
for shape_label, loaded_frame in shape_report:
    print(shape_label, loaded_frame.shape)

column_report = (
    ("Churn Modelling columns:", churn_df),
    ("Telco Churn columns:", telco_df),
)
for column_heading, loaded_frame in column_report:
    print(column_heading)
    print(loaded_frame.columns.tolist())

In [None]:

# ----- Feature Engineering (same recipe for both datasets) -----
# Tenure bucket edges in months and their labels. Values above 72 still fall
# outside every bucket and get NaN.
TENURE_BINS = [0, 12, 24, 48, 72]
TENURE_LABELS = ["0-1yr", "1-2yr", "2-4yr", "4-6yr"]
ENGINEERED_COLS = ["TotalSpend", "AvgMonthlySpend", "TenureGroup", "ActivityFrequency"]


def _add_spend_features(df, charge_col, tenure_col, activity_col):
    """Add spend/tenure features to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; new columns are written directly onto it.
    charge_col : str
        Name of the monthly-charge column.
    tenure_col : str
        Name of the tenure column (the bins above treat it as months).
    activity_col : str
        Optional count column used for ``ActivityFrequency``; skipped when
        the frame does not have it.

    Returns
    -------
    bool
        True when the features were added, False when ``charge_col`` or
        ``tenure_col`` is missing and the frame was left untouched.
    """
    if not {charge_col, tenure_col}.issubset(df.columns):
        return False

    charge = df[charge_col]
    tenure = df[tenure_col]

    df["TotalSpend"] = charge * tenure

    # NOTE(review): TotalSpend is charge * tenure, so this ratio reproduces the
    # monthly charge (up to float rounding); the zero-tenure branch returns the
    # charge as well.
    df["AvgMonthlySpend"] = np.where(tenure > 0, df["TotalSpend"] / tenure, charge)

    # include_lowest=True keeps tenure == 0 in the first bucket. With the
    # default (False) the first interval is (0, 12], so brand-new customers
    # would silently get a NaN TenureGroup.
    df["TenureGroup"] = pd.cut(
        tenure,
        bins=TENURE_BINS,
        labels=TENURE_LABELS,
        include_lowest=True,
    )

    if activity_col in df.columns:
        # +1 avoids division by zero for zero-tenure rows.
        df["ActivityFrequency"] = df[activity_col] / (tenure + 1)

    return True


churn_applied = _add_spend_features(churn_df, "MonthlyCharges", "tenure", "NumOfProducts")
telco_applied = _add_spend_features(
    telco_df, "Monthly Charge", "Tenure in Months", "Number of Referrals"
)

# Only claim success when at least one dataset was actually extended, and say
# which one was skipped instead of failing silently.
if churn_applied or telco_applied:
    print("Feature engineering applied successfully!")
for dataset_label, was_applied in (("Churn Modelling", churn_applied), ("Telco Churn", telco_applied)):
    if not was_applied:
        print(f"WARNING: {dataset_label} is missing the charge/tenure columns; no features were added.")

print("Churn Modelling new cols:", [col for col in churn_df.columns if col in ENGINEERED_COLS])
print("Telco Churn new cols:", [col for col in telco_df.columns if col in ENGINEERED_COLS])


In [None]:
# Numeric columns that must keep their raw values: row/customer identifiers
# and churn labels. Standardizing them would break keys and turn 0/1 labels
# into z-scores.
# NOTE(review): names taken from the table schemas declared in this notebook;
# extend the set if the cleaned CSVs use different identifier/label names.
UNSCALED_COLS = {"RowNumber", "CustomerId", "Exited", "Churn"}


def _scale_numeric_features(df, skip_cols=UNSCALED_COLS):
    """Standardize the numeric feature columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are replaced by their standardized values.
    skip_cols : collection of str
        Column names to leave untouched even if they are numeric.

    Returns
    -------
    tuple
        ``(scaler, columns)``: the ``StandardScaler`` used for this frame and
        the Index of columns it was applied to. The scaler is left unfitted
        when there was nothing to scale.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    feature_cols = numeric_cols[~numeric_cols.isin(skip_cols)]

    frame_scaler = StandardScaler()
    # fit_transform raises on a zero-column selection, so skip that case.
    if len(feature_cols) > 0:
        df[feature_cols] = frame_scaler.fit_transform(df[feature_cols])
    return frame_scaler, feature_cols


# One scaler per dataset so each fit stays available (e.g. for
# inverse_transform) instead of being overwritten by the next fit.
telco_scaler, numeric_cols_telco = _scale_numeric_features(telco_df)
churn_scaler, numeric_cols_churn = _scale_numeric_features(churn_df)

# Kept for any later cell that still refers to `scaler`; as before, it holds
# the scaler that was fitted last (on churn_df).
scaler = churn_scaler

print("Feature scaling done!")

In [None]:
conn = sqlite3.connect("customer_churn.db")

# Write each transformed frame to its own table, overwriting any table that
# already exists under that name.
for table_name, frame in (("ChurnModelling", churn_df), ("TelcoChurn", telco_df)):
    frame.to_sql(table_name, conn, if_exists="replace", index=False)

print(" Data stored in SQLite DB!")

In [None]:
# Explicit DDL for the two SQLite tables. These strings are only defined
# here; the statements are executed in the following cell.
#
# NOTE(review): a later cell re-writes both tables with
# to_sql(..., if_exists="replace"), which drops the existing table before
# inserting. The column types and the PRIMARY KEY declared below would
# therefore not survive that write. Confirm whether if_exists="append" was
# intended.
# NOTE(review): the engineered columns (TotalSpend, AvgMonthlySpend,
# TenureGroup, ActivityFrequency) are not declared in either schema, and the
# feature-engineering cell looks up "Monthly Charge" / "Tenure in Months" on
# the telco frame, which differ from the names listed here. Verify both
# schemas against the actual DataFrame columns.

# Schema for the ChurnModelling table.
schema_churn = """
CREATE TABLE ChurnModelling (
    RowNumber INTEGER,
    CustomerId INTEGER,
    Surname TEXT,
    CreditScore REAL,
    Geography TEXT,
    Gender TEXT,
    Age REAL,
    Tenure REAL,
    Balance REAL,
    NumOfProducts REAL,
    HasCrCard REAL,
    IsActiveMember REAL,
    EstimatedSalary REAL,
    Exited REAL
);
"""


# Schema for the TelcoChurn table; customerID is declared as the primary key.
schema_telco = """
CREATE TABLE TelcoChurn (
    customerID TEXT PRIMARY KEY,
    gender TEXT,
    SeniorCitizen INTEGER,
    Partner TEXT,
    Dependents TEXT,
    tenure REAL,
    PhoneService TEXT,
    MultipleLines TEXT,
    InternetService TEXT,
    OnlineSecurity TEXT,
    OnlineBackup TEXT,
    DeviceProtection TEXT,
    TechSupport TEXT,
    StreamingTV TEXT,
    StreamingMovies TEXT,
    Contract TEXT,
    PaperlessBilling TEXT,
    PaymentMethod TEXT,
    MonthlyCharges REAL,
    TotalCharges REAL,
    Churn TEXT
);
"""


In [None]:
cursor = conn.cursor()

# Drop any existing tables, then recreate them from the explicit schemas.
# Order matters: both drops run before the matching CREATE statements.
ddl_statements = (
    "DROP TABLE IF EXISTS ChurnModelling;",
    "DROP TABLE IF EXISTS TelcoChurn;",
    schema_churn,
    schema_telco,
)
for ddl_statement in ddl_statements:
    cursor.execute(ddl_statement)

conn.commit()

In [None]:
# Write both frames into their tables again; "replace" drops whatever table
# currently exists under each name before inserting the rows.
churn_df.to_sql(name="ChurnModelling", con=conn, if_exists="replace", index=False)
telco_df.to_sql(name="TelcoChurn", con=conn, if_exists="replace", index=False)


In [None]:
# 5. SAMPLE SQL QUERIES
print("\n Running sample queries:")

# Query 1: Average spend per tenure group (Telco)
query1 = pd.read_sql("""
SELECT TenureGroup, AVG(AvgMonthlySpend) as AvgSpend
FROM TelcoChurn
GROUP BY TenureGroup;
""", conn)
print(query1)

# Query 2: Churn rate by geography (ChurnModelling)
# The previous query averaged `Partner`, which is a Telco column and is not
# declared for ChurnModelling, and it did not group by geography at all.
# Averaging the Exited label per Geography matches the stated intent.
# NOTE(review): assumes the table carries the Geography and Exited columns
# declared in schema_churn -- confirm against the loaded CSV. If Exited was
# standardized by the scaling cell, this is a mean of z-scores rather than a
# 0-1 churn rate.
query2 = pd.read_sql("""
SELECT Geography, AVG(Exited) AS ChurnRate
FROM ChurnModelling
GROUP BY Geography;
""", conn)
print(query2)

# Close connection
conn.close()