In [1]:
import io

from google.colab import files
import pandas as pd

# Prompt for the dataset file. `uploaded` maps each chosen filename to the raw
# bytes of that file.
uploaded = files.upload()

# Fail early with a clear message if the upload dialog was dismissed.
if not uploaded:
    raise ValueError('No file was uploaded; re-run this cell and choose the dataset CSV.')

# Parse the bytes that were just uploaded instead of a hard-coded path on disk.
# A fixed 'Dataset.csv' path breaks when the file has another name, and can
# silently pick up a stale copy if Colab stores a re-upload under a different
# name (e.g. 'Dataset (1).csv').
dataset_name = next(iter(uploaded))  # first (normally the only) uploaded file
data = pd.read_csv(io.BytesIO(uploaded[dataset_name]))

# Display the first few rows of the dataset
data.head()


Saving Dataset.csv to Dataset.csv


Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,Contract,MonthlyCharges,Churn
0,Female,0,No,1,No,No,DSL,Month-to-month,29.85,No
1,Male,0,No,34,Yes,No,DSL,One year,56.95,No
2,Male,0,No,2,Yes,No,DSL,Month-to-month,53.85,Yes
3,Male,0,No,45,No,No,DSL,One year,42.3,No
4,Female,0,No,2,Yes,No,Fiber optic,Month-to-month,70.7,Yes


In [2]:
# Count the null entries in every column of the loaded frame
missing_values = data.isna().sum()

# Keep only the columns that actually contain at least one null
missing_values.loc[missing_values.gt(0)]


Unnamed: 0,0


In [3]:
# Impute gaps in the numeric columns with each column's own mean.
# The means are computed over the whole frame as it stands in this cell.
numeric_fill_values = {
    column: data[column].mean()
    for column in ('MonthlyCharges', 'tenure')
}

# The fill is applied in place, so `data` itself is modified.
data.fillna(value=numeric_fill_values, inplace=True)

# Preview the frame after imputation
data.head()


Unnamed: 0,gender,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,Contract,MonthlyCharges,Churn
0,Female,0,No,1,No,No,DSL,Month-to-month,29.85,No
1,Male,0,No,34,Yes,No,DSL,One year,56.95,No
2,Male,0,No,2,Yes,No,DSL,Month-to-month,53.85,Yes
3,Male,0,No,45,No,No,DSL,One year,42.3,No
4,Female,0,No,2,Yes,No,Fiber optic,Month-to-month,70.7,Yes


In [4]:
# One-hot encode the categorical columns. `drop_first=True` omits the first
# level of each category, so a k-level column yields k-1 indicator columns.
encoded_data = pd.get_dummies(data=data, drop_first=True)

# Preview the encoded frame
encoded_data.head(n=5)


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,gender_Male,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_Fiber optic,Contract_One year,Contract_Two year,Churn_Yes
0,0,1,29.85,False,False,False,False,False,False,False,False
1,0,34,56.95,True,False,True,False,False,True,False,False
2,0,2,53.85,True,False,True,False,False,False,False,True
3,0,45,42.3,True,False,False,False,False,True,False,False
4,0,2,70.7,False,False,True,False,True,False,False,True


In [5]:
from sklearn.model_selection import train_test_split

# Name of the encoded target column (replace with your own target if it differs)
target_column = 'Churn_Yes'

# Columns that must not be used as predictors: the target itself, plus any
# 'Unnamed: N' column. Such a column appears to be a row counter left over from
# exporting the CSV with its index (TODO confirm against the raw file); keeping
# it would feed an arbitrary row number to the model as if it were a feature.
non_feature_columns = [target_column] + [
    column for column in encoded_data.columns
    if str(column).startswith('Unnamed:')
]

# Define features (X) and target (y)
X = encoded_data.drop(columns=non_feature_columns)
y = encoded_data[target_column]

# Split the dataset into training and testing sets (80% train, 20% test).
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((5634, 10), (1409, 10), (5634,), (1409,))

In [6]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit on the training split only, then apply the same fitted transformation to
# the test split so that no test-set statistics influence the scaling.
train_scaled_values = scaler.fit_transform(X_train)
test_scaled_values = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames. Reusing the original index (not
# just the column names) keeps every scaled row label-aligned with the matching
# entry of y_train / y_test; without it the frames get a fresh 0..n-1 index
# that no longer corresponds to the targets' labels.
X_train_scaled = pd.DataFrame(
    train_scaled_values, columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    test_scaled_values, columns=X_test.columns, index=X_test.index
)

# Display the first few rows of the scaled training data
X_train_scaled.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,gender_Male,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_Fiber optic,Contract_One year,Contract_Two year
0,-0.437749,-0.465683,-0.000474,-1.025166,1.532186,0.329573,-0.860523,-0.887057,1.910109,-0.558353
1,-0.437749,0.885537,1.074754,-1.025166,-0.652662,0.329573,1.162083,1.127324,-0.52353,1.790982
2,-0.437749,-1.284605,-1.376499,0.975452,-0.652662,-3.034225,-0.860523,-0.887057,-0.52353,-0.558353
3,-0.437749,-1.161766,0.177346,0.975452,-0.652662,0.329573,-0.860523,1.127324,-0.52353,-0.558353
4,-0.437749,-1.325551,-0.098524,0.975452,1.532186,0.329573,1.162083,-0.887057,-0.52353,1.790982


In [7]:
# Output filename -> object to export, in the order they are written and downloaded
exports = {
    'X_train_scaled.csv': X_train_scaled,
    'X_test_scaled.csv': X_test_scaled,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}

# Write every split to CSV first (row labels are not written)
for csv_name, split in exports.items():
    split.to_csv(csv_name, index=False)

# Then trigger a browser download for each written file
for csv_name in exports:
    files.download(csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>