In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Preprocess the Telco churn data set: clean 'TotalCharges', standardize the
# numeric columns, one-hot encode the categorical columns, and write the result
# (plus the encoded target and the customer id) to a CSV file.

# Load the data again to ensure consistency
data_preprocessing = pd.read_csv('../data/raw_data/TelcoCustomerChurn.csv')

# Entries of 'TotalCharges' that cannot be parsed as numbers become NaN
# (errors='coerce'); the rows holding them are then dropped.
data_preprocessing['TotalCharges'] = pd.to_numeric(data_preprocessing['TotalCharges'], errors='coerce')
data_preprocessing = data_preprocessing.dropna(subset=['TotalCharges'])

# Define features and target variable
X = data_preprocessing.drop(columns=['Churn', 'customerID'])  # 'customerID' is an identifier, not a feature
# Encode the target: 'Yes' -> 1, every other value -> 0 (vectorized, no per-row lambda)
y = (data_preprocessing['Churn'] == 'Yes').astype('int64')

# Define numerical and categorical features by dtype
# NOTE(review): 'SeniorCitizen' shows up standardized in the output, so it is
# being picked up here as a numeric column — confirm that scaling it is intended.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Define transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler()),  # Standardize numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore')),  # One-hot encode categorical features
])

# Combine transformers.
# sparse_threshold=0 forces a dense array: the one-hot encoder produces a sparse
# matrix, and with the default threshold (0.3) the combined output would turn
# sparse whenever its density is low, which breaks the DataFrame construction below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Apply transformations
X_preprocessed = preprocessor.fit_transform(X)

# Output column order follows the transformer order: numeric columns first,
# then the one-hot columns generated from the categorical features.
onehot_encoder = preprocessor.named_transformers_['cat']['onehot']
feature_names = numerical_features + list(
    onehot_encoder.get_feature_names_out(input_features=categorical_features)
)

# Create a DataFrame with the preprocessed data
data_preprocessed = pd.DataFrame(X_preprocessed, columns=feature_names)

# The new frame has a fresh 0..n-1 index while the source frame kept its
# original labels after dropna, so assign by position (.values), not by label.
data_preprocessed['Churn'] = y.values
data_preprocessed['CustomerID'] = data_preprocessing['customerID'].values

# Save the preprocessed data to a CSV file
data_preprocessed.to_csv('../data/preprocessed_data/TelcoCustomerChurn_Preprocessed.csv', index=False)

# Display sample data
data_preprocessed.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn,CustomerID
0,-0.440327,-1.280248,-1.161694,-0.994194,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,7590-VHVEG
1,-0.440327,0.064303,-0.260878,-0.17374,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,5575-GNVDE
2,-0.440327,-1.239504,-0.363923,-0.959649,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,3668-QPYBK
3,-0.440327,0.512486,-0.74785,-0.195248,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,7795-CFOCW
4,-0.440327,-1.239504,0.196178,-0.940457,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1,9237-HQITU
