<a href="https://colab.research.google.com/github/stoyanovgeorge/customer_churn_prediction/blob/main/churn_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Telecom Churn Prediction

Reference: [Kaggle Dataset](https://www.kaggle.com/datasets/shilongzhuang/telecom-customer-churn-by-maven-analytics)

In [1]:
import pandas as pd


## Setting Up KaggleHub: Installation & Import
[How to Use Kaggle](https://www.kaggle.com/docs/api)

In [18]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
from kagglehub import dataset_download

## Loading the Maven Analytics Dataset
[Dataset Source](https://www.kaggle.com/datasets/shilongzhuang/telecom-customer-churn-by-maven-analytics)

In [72]:
from kagglehub import KaggleDatasetAdapter
import kagglehub

# Fetch the customer-level churn table from the Maven Analytics Kaggle dataset
# and load it directly into a pandas DataFrame.
churn_csv = "telecom_customer_churn.csv"
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "shilongzhuang/telecom-customer-churn-by-maven-analytics",
    churn_csv,
)

# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [73]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
# The output already reveals missing values, e.g. `Offer` is populated for
# only 3166 of the 7043 rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        7043 non-null   object 
 1   Gender                             7043 non-null   object 
 2   Age                                7043 non-null   int64  
 3   Married                            7043 non-null   object 
 4   Number of Dependents               7043 non-null   int64  
 5   City                               7043 non-null   object 
 6   Zip Code                           7043 non-null   int64  
 7   Latitude                           7043 non-null   float64
 8   Longitude                          7043 non-null   float64
 9   Number of Referrals                7043 non-null   int64  
 10  Tenure in Months                   7043 non-null   int64  
 11  Offer                              3166 non-null   objec

In [74]:
# Summary statistics of the numeric columns, transposed so each feature is a row.
# NOTE(review): `Monthly Charge` has a minimum of -10.0 in the output below;
# negative charges look like data errors — confirm how they should be handled.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,7043.0,46.509726,16.750352,19.0,32.0,46.0,60.0,80.0
Number of Dependents,7043.0,0.468692,0.962802,0.0,0.0,0.0,0.0,9.0
Zip Code,7043.0,93486.070567,1856.767505,90001.0,92101.0,93518.0,95329.0,96150.0
Latitude,7043.0,36.197455,2.468929,32.555828,33.990646,36.205465,38.161321,41.962127
Longitude,7043.0,-119.756684,2.154425,-124.301372,-121.78809,-119.595293,-117.969795,-114.192901
Number of Referrals,7043.0,1.951867,3.001199,0.0,0.0,0.0,3.0,11.0
Tenure in Months,7043.0,32.386767,24.542061,1.0,9.0,29.0,55.0,72.0
Avg Monthly Long Distance Charges,6361.0,25.420517,14.200374,1.01,13.05,25.69,37.68,49.99
Avg Monthly GB Download,5517.0,26.189958,19.586585,2.0,13.0,21.0,30.0,85.0
Monthly Charge,7043.0,63.596131,31.204743,-10.0,30.4,70.05,89.75,118.75


## Load the Population DF
The file `telecom_zipcode_population.csv` contains population data corresponding to individual ZIP codes.

In [75]:
# Load the ZIP code -> population lookup table that ships with the same
# Kaggle dataset as the churn data.
population_csv = "telecom_zipcode_population.csv"
population_df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "shilongzhuang/telecom-customer-churn-by-maven-analytics",
    population_csv,
)

# Preview the first rows to sanity-check the load.
population_df.head()

Unnamed: 0,Zip Code,Population
0,90001,54492
1,90002,44586
2,90003,58198
3,90004,67852
4,90005,43019


## Renaming the Columns
Converting all column names to lowercase and replacing spaces with underscores.

In [76]:
def _normalize_column_name(column_name):
    """Lower-case a column label, trim surrounding whitespace and turn spaces into underscores."""
    return column_name.lower().strip().replace(' ', '_')


# Apply the same normalization to both frames so that the shared key
# ends up with an identical name (`zip_code`) in each of them.
df = df.rename(columns=_normalize_column_name)
population_df = population_df.rename(columns=_normalize_column_name)

df.head()

Unnamed: 0,customer_id,gender,age,married,number_of_dependents,city,zip_code,latitude,longitude,number_of_referrals,...,payment_method,monthly_charge,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,customer_status,churn_category,churn_reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [77]:
# Confirm that the population table's column names were normalized as well.
population_df.head()

Unnamed: 0,zip_code,population
0,90001,54492
1,90002,44586
2,90003,58198
3,90004,67852
4,90005,43019


## Replacing `zip_code` Column With `population` Data
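Since population is a more informative feature for the model, I will replace the `zip_code` column with the `population` value associated with each ZIP code. The merge below adds the `population` column; the now-redundant `zip_code` column itself is removed in the cleanup step that follows.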

In [78]:
# Attach each customer's ZIP-code population with a left join on `zip_code`,
# so every customer row is kept.
# `validate='many_to_one'` makes pandas raise a MergeError if `population_df`
# ever contains a duplicated `zip_code`; without it such duplicates would
# silently multiply customer rows.
df = df.merge(
    population_df,
    on='zip_code',
    how='left',
    validate='many_to_one',
)

df.head()

Unnamed: 0,customer_id,gender,age,married,number_of_dependents,city,zip_code,latitude,longitude,number_of_referrals,...,monthly_charge,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,customer_status,churn_category,churn_reason,population
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,65.6,593.3,0.0,0,381.51,974.81,Stayed,,,4498
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,,31297
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices,62069
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction,46677
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability,42853


## Cleaning up the DataFrame
As part of the preprocessing step, we need to remove columns that are not useful or potentially harmful for the churn prediction task. Specifically, the following columns are dropped:

  1. `customer_id` – A unique identifier that provides no predictive value; it doesn't contain any pattern related to churn.
  2. `latitude` & `longitude` – While they represent geographic information, raw coordinates typically offer limited predictive value. In this case, they are excluded to reduce noise, and because they are redundant with the location information already carried by the `city` column and the ZIP-code-derived `population` column.
  3. `churn_category` - This is a post-churn diagnostic label describing the churn category.  
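  4. `churn_reason` - This is a post-churn diagnostic label describing why the customer has decided to leave the Telecom.
  5. `zip_code` - Already replaced by the `population` column merged in above, so the raw ZIP code is no longer needed.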

By removing these features, we ensure that the model learns from generalizable, predictive features without relying on identifiers or outcome-dependent information.

In [79]:
# Columns removed before modelling, with the reason for each.
columns_to_drop = [
    "customer_id",     # unique identifier, no predictive value
    "latitude",        # raw coordinates, excluded to reduce noise
    "longitude",
    "churn_category",  # post-churn diagnostic label
    "churn_reason",    # post-churn diagnostic label
    "zip_code",        # superseded by the merged `population` column
]
df = df.drop(columns=columns_to_drop)

# Number of columns left after the cleanup
df.shape[1]

33

## Encoding the Categorical Columns using Label Encoder

In [80]:
# Importing the Label Encoder
from sklearn.preprocessing import LabelEncoder

In [81]:
# Categorical feature columns that are converted to integer codes.
columns_to_encode = [
    "gender",
    "married",
    "phone_service",
    "multiple_lines",
    "internet_service",
    "online_security",
    "online_backup",
    "device_protection_plan",
    "premium_tech_support",
    "streaming_tv",
    "streaming_movies",
    "streaming_music",
    "unlimited_data",
    "paperless_billing",
    "payment_method"
]

# Fit one LabelEncoder per column and keep the fitted encoders, keyed by column
# name. Without them the category <-> integer mapping is lost, which makes it
# impossible to decode predictions (`inverse_transform`) or to encode new data
# consistently (`transform`).
#
# NOTE(review): `df.describe()` reports fewer non-null rows for the long-distance
# and download columns, so some of the service columns presumably contain NaN.
# Depending on the scikit-learn version LabelEncoder either raises on NaN or
# encodes it as its own class — confirm the intended handling.
# NOTE(review): integer codes impose an arbitrary order on nominal features such
# as `payment_method`; consider one-hot encoding for order-sensitive models.
label_encoders = {}
for col in columns_to_encode:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

df.head()

Unnamed: 0,gender,age,married,number_of_dependents,city,number_of_referrals,tenure_in_months,offer,phone_service,avg_monthly_long_distance_charges,...,paperless_billing,payment_method,monthly_charge,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,customer_status,population
0,0,37,1,0,Frazier Park,2,9,,1,42.39,...,1,1,65.6,593.3,0.0,0,381.51,974.81,Stayed,4498
1,1,46,0,0,Glendale,0,9,,1,10.69,...,0,1,-4.0,542.4,38.33,10,96.21,610.28,Stayed,31297
2,1,50,0,0,Costa Mesa,0,4,Offer E,1,33.65,...,1,0,73.9,280.85,0.0,0,134.6,415.45,Churned,62069
3,1,78,1,0,Martinez,1,13,Offer D,1,27.82,...,1,0,98.0,1237.85,0.0,0,361.66,1599.51,Churned,46677
4,0,75,1,0,Camarillo,3,3,,1,7.38,...,1,1,83.9,267.4,0.0,0,22.14,289.54,Churned,42853
