In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

1. Import the "Telecom_Customer_Churn.csv" dataset.

In [2]:
# Location of the churn CSV, resolved relative to the notebook's working directory.
dataset_path = "tele_com.csv"

# Read the raw table once; the cells below all operate on `df`.
df = pd.read_csv(filepath_or_buffer=dataset_path)

2. Explore the dataset to understand its structure and content.

In [3]:
 # DataFrame.info() writes its summary to stdout and returns None, so it is
 # called directly; wrapping it in print() emits an extra "None" line.
 df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        14 non-null     object 
 1   gender            14 non-null     object 
 2   SeniorCitizen     14 non-null     int64  
 3   Partner           14 non-null     object 
 4   Dependents        14 non-null     object 
 5   tenure            14 non-null     int64  
 6   PhoneService      14 non-null     object 
 7   MultipleLines     11 non-null     object 
 8   InternetService   14 non-null     object 
 9   OnlineSecurity    14 non-null     object 
 10  OnlineBackup      14 non-null     object 
 11  DeviceProtection  14 non-null     object 
 12  TechSupport       14 non-null     object 
 13  StreamingTV       14 non-null     object 
 14  StreamingMovies   14 non-null     object 
 15  Contract          14 non-null     object 
 16  PaperlessBilling  14 non-null     object 
 17 

3. Handle missing values in the dataset, deciding on an appropriate strategy.

In [4]:
# Print a header followed by the number of null entries in each column.
print("\nMissing Values:", df.isna().sum(), sep="\n")


Missing Values:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       3
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [5]:
# Replace missing MultipleLines entries with the placeholder 'Not known',
# then lower-case every value in the column (pre-existing values included).
df['MultipleLines'] = (
    df['MultipleLines']
    .fillna(value='Not known')
    .str.lower()
)

In [6]:
# Print the whole frame to inspect it after the MultipleLines fill.
print(df)

    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService   
0   7590-VHVEG  Female              0     Yes         No       1           No  \
1   5575-GNVDE    Male              0      No         No      34          Yes   
2   3668-QPYBK    Male              0      No         No       2          Yes   
3   7795-CFOCW    Male              0      No         No      45           No   
4   9237-HQITU  Female              0      No         No       2          Yes   
5   9305-CDSKC  Female              0      No         No       8          Yes   
6   1452-KIOVK    Male              0      No        Yes      22          Yes   
7   6713-OKOMC  Female              0      No         No      10           No   
8   7892-POOKP  Female              0     Yes         No      28          Yes   
9   6388-TABGU    Male              0      No        Yes      62          Yes   
10  9763-GRSKD    Male              0     Yes        Yes      13          Yes   
11  7469-LKBCI    Male      

5. Check for inconsistent data, such as inconsistent formatting or spelling variations,
and standardize it.

In [7]:
# Rewrite the misspelled label 'Fiber opticalal' as 'Fiber Optic'; every other
# InternetService value is left untouched.
df['InternetService'] = df['InternetService'].replace(
    {'Fiber opticalal': 'Fiber Optic'}
)

In [8]:
# Print the whole frame again to inspect it after the InternetService fix.
print(df)

    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService   
0   7590-VHVEG  Female              0     Yes         No       1           No  \
1   5575-GNVDE    Male              0      No         No      34          Yes   
2   3668-QPYBK    Male              0      No         No       2          Yes   
3   7795-CFOCW    Male              0      No         No      45           No   
4   9237-HQITU  Female              0      No         No       2          Yes   
5   9305-CDSKC  Female              0      No         No       8          Yes   
6   1452-KIOVK    Male              0      No        Yes      22          Yes   
7   6713-OKOMC  Female              0      No         No      10           No   
8   7892-POOKP  Female              0     Yes         No      28          Yes   
9   6388-TABGU    Male              0      No        Yes      62          Yes   
10  9763-GRSKD    Male              0     Yes        Yes      13          Yes   
11  7469-LKBCI    Male      

6. Convert columns to the correct data types as needed.

In [9]:
# Parse the charge columns as numbers instead of casting them to int:
# astype(int) silently drops any fractional part of an amount, which loses
# information for monetary values. pd.to_numeric keeps decimals (float64) when
# present, yields int64 when every value is integral, and raises on values it
# cannot parse rather than hiding them.
df['MonthlyCharges'] = pd.to_numeric(df['MonthlyCharges'])
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

In [10]:
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() emits an extra "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customerID        14 non-null     object
 1   gender            14 non-null     object
 2   SeniorCitizen     14 non-null     int64 
 3   Partner           14 non-null     object
 4   Dependents        14 non-null     object
 5   tenure            14 non-null     int64 
 6   PhoneService      14 non-null     object
 7   MultipleLines     14 non-null     object
 8   InternetService   14 non-null     object
 9   OnlineSecurity    14 non-null     object
 10  OnlineBackup      14 non-null     object
 11  DeviceProtection  14 non-null     object
 12  TechSupport       14 non-null     object
 13  StreamingTV       14 non-null     object
 14  StreamingMovies   14 non-null     object
 15  Contract          14 non-null     object
 16  PaperlessBilling  14 non-null     object
 17  PaymentMethod     

7. Identify and handle outliers in the data.

In [11]:
# Outlier screen: flag values whose absolute z-score exceeds a fixed cutoff.
# numpy (np) is imported in the notebook's import cell at the top.

# Select numerical columns for Z-Score calculation
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Absolute z-score of every value: distance from the column mean, measured in
# column standard deviations (pandas' .std() uses the sample estimate, ddof=1).
z_scores = np.abs((df[numerical_cols] - df[numerical_cols].mean()) / df[numerical_cols].std())

# Cutoff for calling a value an outlier.
# NOTE(review): with a sample standard deviation, |z| cannot exceed
# (n - 1) / sqrt(n); for the 14 rows reported by df.info() that bound is about
# 3.47, so a cutoff of 3 can flag almost nothing on a sample this small.
threshold = 3

# Boolean frame, True where a value is beyond the cutoff
outliers = z_scores > threshold
print("Outliers in each column:")
print(outliers)

# Count outliers in each column
outlier_counts = outliers.sum()
print("\nNumber of outliers in each column:")
print(outlier_counts)

Outliers in each column:
    tenure  MonthlyCharges  TotalCharges
0    False           False         False
1    False           False         False
2    False           False         False
3    False           False         False
4    False           False         False
5    False           False         False
6    False           False         False
7    False           False         False
8    False           False         False
9    False           False         False
10   False           False         False
11   False           False         False
12   False           False         False
13   False           False         False

Number of outliers in each column:
tenure            0
MonthlyCharges    0
TotalCharges      0
dtype: int64


9. Normalize or scale the data if necessary.

In [12]:
# Columns to standardise (StandardScaler rescales each to zero mean, unit variance).
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Learn the per-column statistics, then overwrite the columns with the
# standardised values.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split performed later in this notebook, so test rows influence the scaling
# statistics -- confirm whether that is acceptable for this exercise.
scaler = StandardScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

# Preview the first rows of the rescaled frame.
print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents    tenure   
0  7590-VHVEG  Female              0     Yes         No -1.168500  \
1  5575-GNVDE    Male              0      No         No  0.438187   
2  3668-QPYBK    Male              0      No         No -1.119812   
3  7795-CFOCW    Male              0      No         No  0.973750   
4  9237-HQITU  Female              0      No         No -1.119812   

  PhoneService MultipleLines InternetService OnlineSecurity  ...   
0           No     not known             DSL             No  ...  \
1          Yes            no             DSL            Yes  ...   
2          Yes            no             DSL            Yes  ...   
3           No     not known             DSL            Yes  ...   
4          Yes            no     Fiber Optic             No  ...   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract   
0               No          No          No              No  Month-to-month  \
1              Yes  

8. Perform feature engineering, creating new features that may be relevant to
predicting customer churn.

In [13]:
# New feature: 'Yes' when Contract is 'One year' or 'Two year', 'No' for any
# other Contract value.
df['Contract_Renewal'] = (
    df['Contract']
    .isin(['One year', 'Two year'])
    .map({True: 'Yes', False: 'No'})
)

In [14]:
# Print the engineered Contract_Renewal column for every row.
print(df['Contract_Renewal'])

0      No
1     Yes
2      No
3     Yes
4      No
5      No
6      No
7      No
8      No
9     Yes
10     No
11    Yes
12    Yes
13     No
Name: Contract_Renewal, dtype: object


10. Split the dataset into training and testing sets for further analysis.

In [15]:
# Separate the target column (Churn) from the remaining columns.
y = df['Churn']
X = df.drop('Churn', axis=1)

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the shapes of both partitions as a sanity check on the split.
print("Training set shape - X:", X_train.shape, "y:", y_train.shape)
print("Testing set shape - X:", X_test.shape, "y:", y_test.shape)

Training set shape - X: (9, 21) y: (9,)
Testing set shape - X: (5, 21) y: (5,)
