In [25]:
# Required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

In [26]:
# Read the raw CSV into a DataFrame (the path is relative to the current working directory)
DATA_PATH = 'Dataset (ATS)-1 (1).csv'
dataset = pd.read_csv(DATA_PATH)

In [27]:
# Count the null entries in every column and print the per-column totals
missing_per_column = dataset.isna().sum()
print(missing_per_column)


gender             0
SeniorCitizen      0
Dependents         0
tenure             0
PhoneService       0
MultipleLines      0
InternetService    0
Contract           0
MonthlyCharges     0
Churn              0
dtype: int64


In [28]:
# Print a structural summary of the frame: index range, per-column non-null counts, dtypes and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gender           7043 non-null   object 
 1   SeniorCitizen    7043 non-null   int64  
 2   Dependents       7043 non-null   object 
 3   tenure           7043 non-null   int64  
 4   PhoneService     7043 non-null   object 
 5   MultipleLines    7043 non-null   object 
 6   InternetService  7043 non-null   object 
 7   Contract         7043 non-null   object 
 8   MonthlyCharges   7043 non-null   float64
 9   Churn            7043 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 550.4+ KB


In [29]:
# Integer-encode the categorical columns listed below, overwriting each column in `dataset`.
# LabelEncoder numbers the distinct values of a column in sorted order, so a Yes/No
# column becomes 0 = No, 1 = Yes and a Female/Male column becomes 0 = Female, 1 = Male.
# NOTE(review): the distinct values of these columns are not visible here; a column with
# more than two levels (e.g. a third level in 'MultipleLines') would receive codes
# 0..k-1 rather than 0/1 — confirm the value sets.
# The same encoder object is refit for every column, so once the loop finishes it only
# holds the mapping of the last column ('Churn').
label_encoder = LabelEncoder()
binary_columns = ['gender', 'Dependents', 'PhoneService', 'MultipleLines', 'Churn']
for column_name in binary_columns:
    dataset[column_name] = label_encoder.fit_transform(dataset[column_name])


In [30]:
# One-hot encode the multi-category columns ('InternetService', 'Contract').
# drop_first=True drops the first level of each column so the remaining indicators
# are not redundant with each other.
# dtype=int pins the indicator columns to 0/1 integers: the default dtype of
# get_dummies differs between pandas versions (uint8 before 2.0, bool from 2.0),
# which would otherwise put True/False values next to the integer-encoded columns
# and into any CSV written from this frame.
dataset = pd.get_dummies(
    dataset,
    columns=['InternetService', 'Contract'],
    drop_first=True,
    dtype=int,
)

In [31]:
# Separate the label from the predictors: y is the 'Churn' column, X is every other column
target_column = 'Churn'
y = dataset[target_column]
X = dataset.drop(columns=[target_column])

In [32]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [33]:
# Standardise the numerical columns ('tenure', 'MonthlyCharges') with StandardScaler.
# The scaler is fitted on the training rows only; the same fitted scaler is then
# applied to the test rows so no test-set statistics enter the fit.
scaler = StandardScaler()
numeric_columns = ['tenure', 'MonthlyCharges']
train_scaled = scaler.fit_transform(X_train[numeric_columns])
test_scaled = scaler.transform(X_test[numeric_columns])
X_train[numeric_columns] = train_scaled
X_test[numeric_columns] = test_scaled

In [34]:
# Write each preprocessed split to its own CSV file, without the index column
csv_outputs = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for csv_name, split_data in csv_outputs.items():
    split_data.to_csv(csv_name, index=False)