In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('customer_churn_data.csv')

In [20]:
df.shape

(5880, 33)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5880 entries, 0 to 5879
Data columns (total 33 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   customerID                            5880 non-null   object 
 1   SeniorCitizen                         5880 non-null   bool   
 2   tenure                                5880 non-null   float64
 3   MonthlyCharges                        5880 non-null   float64
 4   TotalCharges                          5880 non-null   float64
 5   Churn                                 5880 non-null   int64  
 6   gender_Male                           5880 non-null   bool   
 7   Partner_Yes                           5880 non-null   bool   
 8   Dependents_Yes                        5880 non-null   bool   
 9   PhoneService_Yes                      5880 non-null   bool   
 10  MultipleLines_No phone service        5880 non-null   bool   
 11  MultipleLines_Yes

In [21]:
df.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check,MultipleServices
0,CUST0000,False,-0.64804,-0.705078,-0.74365,0,True,False,True,False,...,False,True,False,False,False,True,False,False,False,6
1,CUST0001,False,0.308537,1.060414,0.923257,1,False,True,False,False,...,True,False,False,False,False,False,False,False,True,0
2,CUST0002,True,0.691167,0.943409,1.255078,1,True,False,False,True,...,False,False,False,True,False,False,False,True,False,0
3,CUST0003,True,1.695572,1.084023,2.477961,0,True,False,False,True,...,False,False,False,False,False,False,True,False,False,0
4,CUST0004,True,-0.552382,-0.622793,-0.660423,1,True,False,False,True,...,False,False,True,False,False,False,False,True,False,0


# Handle missing values

In [None]:
# Checking for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
customerID                              0
SeniorCitizen                           0
tenure                                  0
MonthlyCharges                          0
TotalCharges                            0
Churn                                   0
gender_Male                             0
Partner_Yes                             0
Dependents_Yes                          0
PhoneService_Yes                        0
MultipleLines_No phone service          0
MultipleLines_Yes                       0
InternetService_Fiber optic             0
InternetService_No                      0
OnlineSecurity_No internet service      0
OnlineSecurity_Yes                      0
OnlineBackup_No internet service        0
OnlineBackup_Yes                        0
DeviceProtection_No internet service    0
DeviceProtection_Yes                    0
TechSupport_No internet service         0
TechSupport_Yes                         0
StreamingTV_No internet service         0
Str

In [None]:
# Fill missing values in 'TotalCharges' with the median
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

# Transform data types

In [None]:
# Convert 'SeniorCitizen' to boolean
df['SeniorCitizen'] = df['SeniorCitizen'].astype(bool)

# Normalize/standardize numerical features

In [None]:
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Encode categorical features

In [None]:
categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                        'PaperlessBilling', 'PaymentMethod']

In [None]:
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Encode the target variable 'Churn'

In [None]:
label_encoder = LabelEncoder()
df['Churn'] = label_encoder.fit_transform(df['Churn'])

In [None]:
# Print the column names to ensure correct references
print("Column names after encoding:")
print(df.columns)

Column names after encoding:
Index(['customerID', 'SeniorCitizen', 'tenure', 'MonthlyCharges',
       'TotalCharges', 'Churn', 'gender_Male', 'Partner_Yes', 'Dependents_Yes',
       'PhoneService_Yes', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No internet service',
       'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card', 'PaymentMethod_Electronic check',
       'PaymentMethod_Mailed check'],
      dtype='object')


# Feature engineering

In [None]:
# Creating a feature indicating whether the customer has multiple services
service_columns = [col for col in df.columns if any(service in col for service in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']) and 'No' in col]
df['MultipleServices'] = df[service_columns].sum(axis=1)

In [None]:
# Splitting the data into training and testing sets
X = df.drop(columns=['customerID', 'Churn'])
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Save the processed data
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)