# Preprocessing

### Importing Lib files

In [13]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump
import matplotlib.pyplot as plt
import seaborn as sns

### Load Data

In [2]:
# Load train and test data
train_df = pd.read_csv("../data/customer_churn_dataset-testing-master.csv")
test_df = pd.read_csv("../data/customer_churn_dataset-training-master.csv")

train_df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1,22,Female,25,14,4,27,Basic,Monthly,598,9,1
1,2,41,Female,28,28,7,13,Standard,Monthly,584,20,0
2,3,47,Male,27,10,2,29,Premium,Annual,757,21,0
3,4,35,Male,9,12,5,17,Premium,Quarterly,232,18,0
4,5,53,Female,58,24,9,2,Standard,Annual,533,18,0


### Basic EDA

In [4]:
print("Train shape: ", train_df.head())
print("Test shape: ", test_df.head())

train_df.head()
train_df.describe()

Train shape:     CustomerID  Age  Gender  Tenure  Usage Frequency  Support Calls  \
0           1   22  Female      25               14              4   
1           2   41  Female      28               28              7   
2           3   47    Male      27               10              2   
3           4   35    Male       9               12              5   
4           5   53  Female      58               24              9   

   Payment Delay Subscription Type Contract Length  Total Spend  \
0             27             Basic         Monthly          598   
1             13          Standard         Monthly          584   
2             29           Premium          Annual          757   
3             17           Premium       Quarterly          232   
4              2          Standard          Annual          533   

   Last Interaction  Churn  
0                 9      1  
1                20      0  
2                21      0  
3                18      0  
4                

Unnamed: 0,CustomerID,Age,Tenure,Usage Frequency,Support Calls,Payment Delay,Total Spend,Last Interaction,Churn
count,64374.0,64374.0,64374.0,64374.0,64374.0,64374.0,64374.0,64374.0,64374.0
mean,32187.5,41.970982,31.994827,15.080234,5.40069,17.133952,541.023379,15.49885,0.473685
std,18583.317451,13.924911,17.098234,8.81647,3.114005,8.852211,260.874809,8.638436,0.499311
min,1.0,18.0,1.0,1.0,0.0,0.0,100.0,1.0,0.0
25%,16094.25,30.0,18.0,7.0,3.0,10.0,313.0,8.0,0.0
50%,32187.5,42.0,33.0,15.0,6.0,19.0,534.0,15.0,0.0
75%,48280.75,54.0,47.0,23.0,8.0,25.0,768.0,23.0,1.0
max,64374.0,65.0,60.0,30.0,10.0,30.0,1000.0,30.0,1.0


### Basic Cleaning

In [5]:
# Customer ID not necessary for the prediccation
train_df.drop('CustomerID', axis=1, inplace=True)
test_ids = test_df['CustomerID']  # keep for output
test_df.drop('CustomerID', axis=1, inplace=True)

In [6]:
# Store target
y = train_df['Churn']
X = train_df.drop('Churn', axis=1)

### Feature Types

In [7]:
categorical_cols = ['Gender', 'Subscription Type', 'Contract Length']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

### Preprocessing pipeline

In [8]:
# Impute numerical -> scale
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [9]:
# One-hot encode categoricals
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine both
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols)
])

# Full pipeline
full_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

### Fit on train and transform both

In [10]:
X_train = full_pipeline.fit_transform(X)
X_test = full_pipeline.transform(test_df)

print("X_train shape after preprocessing:", X_train.shape)
print("X_test shape after preprocessing :", X_test.shape)

X_train shape after preprocessing: (64374, 15)
X_test shape after preprocessing : (440833, 15)


### Save outputs

In [16]:
# Save the preprocessed data
np.save('../output/X_train.npy', X_train)
np.save('../output/y_train.npy', y)
np.save('../output/X_test.npy', X_test)
test_ids.to_csv('../output/test_ids.csv', index=False)

# Save the pipeline
joblib.dump(full_pipeline, '../output/preprocessing_pipeline.joblib')
print("Preprocessing pipeline saved.")

Preprocessing pipeline saved.
