# Data Ingestion

In [64]:
import pandas as pd
import kagglehub
import shutil
import os
import warnings
warnings.filterwarnings('ignore')


def data_ingestion(link):
    path = kagglehub.dataset_download(link)

    custom_path = "../data"
    os.makedirs(custom_path, exist_ok=True)

    for filename in os.listdir(path):
        shutil.copy(os.path.join(path, filename), os.path.join(custom_path, filename))

    print("Files copied to:", custom_path)

    loaded_df = pd.read_csv("../data/WA_Fn-UseC_-Telco-Customer-Churn.csv")
    print("Data loaded in the data frame")

    return loaded_df

# Data Preprocessing

In [73]:
import pandas as pd

def preprocess_data(df):
    cols_to_keep = ['gender', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']
    preprocessed_df = df[cols_to_keep].copy()

    preprocessed_df['TotalCharges'] = pd.to_numeric(preprocessed_df['TotalCharges'], errors='coerce')
    preprocessed_df['MonthlyCharges'] = pd.to_numeric(preprocessed_df['MonthlyCharges'], errors='coerce')
    preprocessed_df.dropna(inplace=True)
    preprocessed_df = preprocessed_df.drop_duplicates()

    preprocessed_df['gender'] = preprocessed_df['gender'].map({'Male': 0, 'Female': 1})
    preprocessed_df['Churn'] = preprocessed_df['Churn'].map({'No': 0, 'Yes': 1})

    print("Data preprocessed")
    return preprocessed_df

In [76]:
from sklearn.preprocessing import StandardScaler
import joblib


def standardize_data(train_df, test_df):
    cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']

    for col in cols_to_scale:
        train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
        test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

    train_df.dropna(subset=cols_to_scale, inplace=True)
    test_df.dropna(subset=cols_to_scale, inplace=True)

    scaler = StandardScaler()

    train_scaled = scaler.fit_transform(train_df[cols_to_scale])
    test_scaled = scaler.transform(test_df[cols_to_scale])

    joblib.dump(scaler, "../models/standard_scaler.pkl")
    train_df[cols_to_scale] = pd.DataFrame(train_scaled, columns=cols_to_scale, index=train_df.index)
    test_df[cols_to_scale] = pd.DataFrame(test_scaled, columns=cols_to_scale, index=test_df.index)

    print("Data scaled")
    return train_df, test_df


## Main

In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split

def main():
    try:
        df = data_ingestion("blastchar/telco-customer-churn")
        preprocessed_df = preprocess_data(df)
        train_data, test_data = train_test_split(preprocessed_df, test_size=0.2, random_state=42)
        train_scaled_df, test_scaled_df = standardize_data(train_data, test_data)
        save_data(train_scaled_df, "preprocessed_train.csv")
        save_data(test_scaled_df, "preprocessed_test.csv")
    except Exception as e:
        print(f"Error: {e}")
        print("Failed to complete the data ingestion process.")

if __name__ == '__main__':
    main()

Files copied to: ../data
Data loaded in the data frame
Data preprocessed
Data scaled
preprocessed_train.csv saved
preprocessed_test.csv saved


# Feature Engineering

In [84]:
import pandas as pd

train_df = pd.read_csv("../data/preprocessed_train.csv")
test_df = pd.read_csv("../data/preprocessed_test.csv")

train_df["total_charges_per_tenure"] = train_df["TotalCharges"] / train_df["tenure"]
test_df["total_charges_per_tenure"] = test_df["TotalCharges"] / test_df["tenure"]

save_data(train_df, "final_train.csv")
save_data(test_df, "final_test.csv")

final_train.csv saved
final_test.csv saved


In [88]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestRegressor

train_df = pd.read_csv('../data/final_train.csv')

y_train = train_df['Churn']
X_train = train_df.drop('Churn', axis=1)

rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

joblib.dump(rf_model, "../models/rf_model.pkl")

['../models/rf_model.pkl']

In [3]:
import joblib
from sklearn.metrics import mean_absolute_error, r2_score
import json
import pandas as pd


test_df = pd.read_csv('../data/featurized/final_test.csv')

y_test = test_df['Churn']
X_test = test_df.drop('Churn', axis=1)

rf_model = joblib.load("../models/rf_model.pkl")

y_pred = rf_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Absolute Error: 0.2904
R² Score: 0.1239
