In [1]:
# ensure cwd is project root
import os

# The project location is machine-specific. It can be overridden through the
# CHURN_PROJECT_ROOT environment variable; when the variable is unset, the
# original hard-coded location is used, so existing behavior is unchanged.
root = os.environ.get(
    "CHURN_PROJECT_ROOT",
    r"C:\Users\Dell\Documents\Customer-Churn-Prediction",
)
# Fail with an actionable message instead of the bare error os.chdir would raise
# (same exception type: FileNotFoundError).
if not os.path.isdir(root):
    raise FileNotFoundError(
        f"Project root {root!r} does not exist. "
        "Set the CHURN_PROJECT_ROOT environment variable to the project folder."
    )
# Later cells use paths relative to the project root, so switch into it if needed.
if os.getcwd() != root:
    os.chdir(root)
print("cwd:", os.getcwd())


cwd: C:\Users\Dell\Documents\Customer-Churn-Prediction


In [2]:
# dataset check: report whether data/raw exists and, if so, what it contains
from pathlib import Path

p = Path("data", "raw")
print("expected dataset path:", p.resolve())
print("exists?", p.exists())
if not p.exists():
    # nothing to list; tell the user where the CSV is expected
    print("data/raw not found. Create data/raw and place churn_modelling.csv inside.")
else:
    files = [entry for entry in p.iterdir()]
    print("files in data/raw:")
    for f in files:
        # one line per entry: quoted name followed by its size on disk
        print(f"- {f.name!r} {f.stat().st_size} bytes")

expected dataset path: C:\Users\Dell\Documents\Customer-Churn-Prediction\Data\raw
exists? True
files in data/raw:
- 'Churn_Modelling.csv' 684858 bytes


In [3]:
# quick load first rows & shape
import pandas as pd
from pathlib import Path


def _match_ignoring_case(parent, name):
    """Return ``parent / name``, tolerating a difference in letter case.

    An exact match is always preferred. If it does not exist and ``parent`` is
    a directory, the first entry (in sorted order) whose name equals ``name``
    case-insensitively is returned. When nothing matches, the exact (missing)
    path is returned so the caller can report it.
    """
    exact = parent / name
    if exact.exists() or not parent.is_dir():
        return exact
    wanted = name.lower()
    for entry in sorted(parent.iterdir()):
        if entry.name.lower() == wanted:
            return entry
    return exact


# The names used here differ in case from what the directory listing reports
# ('Data', 'Churn_Modelling.csv'); that only resolves on case-insensitive
# filesystems. Resolve each component leniently so the load also works on
# case-sensitive ones.
data_path = Path()
for part in ("data", "raw", "churn_modelling.csv"):
    data_path = _match_ignoring_case(data_path, part)
if not data_path.exists():
    raise FileNotFoundError(f"{data_path} not found. Put dataset at this path.")
df = pd.read_csv(data_path)
print("rows, cols:", df.shape)
display(df.head(5))

rows, cols: (10000, 14)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
#  separate X, y and inspect numeric/categorical
# Stop early with a clear message if the target column is absent.
if "Exited" not in df.columns:
    raise ValueError("CSV must contain 'Exited' target column.")

# Target as plain integers.
y = df["Exited"].astype(int)
# Feature matrix: everything except the target and the three listed columns;
# errors="ignore" tolerates any of them being absent from the CSV.
X = df.drop(columns=["RowNumber","CustomerId","Surname","Exited"], errors="ignore")
# Select by dtype family rather than exact dtype names, so columns stored as
# int32/float32/nullable ints still count as numeric, and every remaining
# column (object, category, string, ...) is treated as categorical instead of
# being left out of both lists.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(exclude="number").columns.tolist()

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)
display(X.describe().T)

Numeric cols: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
Categorical cols: ['Geography', 'Gender']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
Balance,10000.0,76485.889288,62397.405202,0.0,0.0,97198.54,127644.24,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000.0,100090.239881,57510.492818,11.58,51002.11,100193.915,149388.2475,199992.48


In [6]:
# show which scikit-learn version this kernel has installed
import sklearn

print(sklearn.__version__)

1.7.2


In [8]:
# construct transformers
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Numeric branch: median-impute missing values, then standardize.
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array; categories not seen during fit
# are ignored rather than raising.
cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route each column list (num_cols / cat_cols, built in an earlier cell)
# to its branch.
preproc = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ]
)
print("Preprocessing pipeline ready.")

Preprocessing pipeline ready.
