### Imports

In [93]:
import pandas as pd
import numpy as np

import os, sys

# Add my own file with custom utility functions as a module.
# Notebooks do not define __file__ (the original passed the *string* '__file__'
# to dirname, which always yields ''), so the utils folder is resolved
# explicitly against the current working directory: <cwd>/../utils.
utils_path = os.path.normpath(os.path.join(os.getcwd(), os.path.pardir, 'utils'))
# Guard keeps sys.path free of duplicates when the cell is re-run.
if utils_path not in sys.path:
    sys.path.append(utils_path)

import aku_utils as ak

# Display / behaviour settings for pandas used throughout the notebook.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)
# pd.set_option('display.max_info_rows', 1690785)
pd.set_option('display.max_info_columns', 200)
# Floats are rendered with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)
pd.set_option('display.date_dayfirst', True)
# None switches off the chained-assignment (SettingWithCopy) check entirely.
pd.set_option('mode.chained_assignment', None)

In [94]:
# Development helper, intentionally left disabled: uncomment both lines to
# re-import the aku_utils module (bound to `ak`) after editing it, without
# restarting the kernel.
# import importlib
# importlib.reload(ak)

# Overview

In [95]:
# Load the raw dataset from ../data/telco.csv, resolved against the current
# working directory. (The original wrapped the path in
# os.path.dirname('__file__'), which is dirname of a string literal and always
# returns '' -- notebooks do not define __file__ -- so this is the same path.)
df = pd.read_csv(os.path.join(os.path.pardir, 'data', 'telco.csv'))
# head(-5) returns every row except the last five; the notebook display
# truncates it to the first and last few rows.
df.head(-5)

Unnamed: 0,Customer ID,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents,Country,State,City,Zip Code,Latitude,Longitude,Population,Quarter,Referred a Friend,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,Internet Service,Internet Type,Avg Monthly GB Download,Online Security,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Satisfaction Score,Customer Status,Churn Label,Churn Score,CLTV,Churn Category,Churn Reason
0,8779-QRDMV,Male,78,No,Yes,No,No,0,United States,California,Los Angeles,90022,34.02,-118.16,68701,Q3,No,0,1,,No,0.00,No,Yes,DSL,8,No,No,Yes,No,No,Yes,No,No,Month-to-Month,Yes,Bank Withdrawal,39.65,39.65,0.00,20,0.00,59.65,3,Churned,Yes,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,Female,74,No,Yes,Yes,Yes,1,United States,California,Los Angeles,90063,34.04,-118.19,55668,Q3,Yes,1,8,Offer E,Yes,48.85,Yes,Yes,Fiber Optic,17,No,Yes,No,No,No,No,No,Yes,Month-to-Month,Yes,Credit Card,80.65,633.30,0.00,0,390.80,1024.10,3,Churned,Yes,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,Male,71,No,Yes,No,Yes,3,United States,California,Los Angeles,90065,34.11,-118.23,47534,Q3,No,0,18,Offer D,Yes,11.33,Yes,Yes,Fiber Optic,52,No,No,No,No,Yes,Yes,Yes,Yes,Month-to-Month,Yes,Bank Withdrawal,95.45,1752.55,45.61,0,203.94,1910.88,2,Churned,Yes,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,Female,78,No,Yes,Yes,Yes,1,United States,California,Inglewood,90303,33.94,-118.33,27778,Q3,Yes,1,25,Offer C,Yes,19.76,No,Yes,Fiber Optic,12,No,Yes,Yes,No,Yes,Yes,No,Yes,Month-to-Month,Yes,Bank Withdrawal,98.50,2514.50,13.43,0,494.00,2995.07,2,Churned,Yes,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,Female,80,No,Yes,Yes,Yes,1,United States,California,Whittier,90602,33.97,-118.02,26265,Q3,Yes,1,37,Offer C,Yes,6.33,Yes,Yes,Fiber Optic,14,No,No,No,No,No,No,No,Yes,Month-to-Month,Yes,Bank Withdrawal,76.50,2868.15,0.00,0,234.21,3102.36,2,Churned,Yes,67,2793,Price,Extra data charges
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7033,9281-CEDRU,Female,23,Yes,No,Yes,No,0,United States,California,Salton City,92275,33.28,-115.96,799,Q3,Yes,2,68,,Yes,8.62,No,Yes,DSL,53,No,Yes,No,Yes,Yes,No,No,Yes,Two Year,No,Bank Withdrawal,64.10,4326.25,19.12,0,586.16,4893.29,5,Stayed,No,48,5553,,
7034,0871-OPBXW,Female,57,No,No,No,No,0,United States,California,Escondido,92027,33.14,-116.97,48690,Q3,No,0,2,Offer E,Yes,6.85,No,No,,0,No,No,No,No,No,No,No,No,Month-to-Month,Yes,Mailed Check,20.05,39.25,0.00,0,13.70,52.95,3,Joined,No,80,5191,,
7035,9767-FFLEM,Male,63,No,No,No,No,0,United States,California,Westmorland,92281,33.04,-115.61,2388,Q3,No,0,38,,Yes,35.04,No,Yes,Fiber Optic,2,No,No,No,No,No,No,No,Yes,Month-to-Month,Yes,Credit Card,69.50,2625.25,20.19,0,1331.52,3936.58,3,Stayed,No,35,4591,,
7036,8456-QDAVC,Male,57,No,No,No,No,0,United States,California,Winterhaven,92283,32.85,-114.85,3663,Q3,No,0,19,,Yes,29.55,No,Yes,Fiber Optic,13,No,No,No,No,Yes,No,No,Yes,Month-to-Month,Yes,Bank Withdrawal,78.70,1495.10,26.84,0,561.45,2029.71,3,Stayed,No,20,2464,,


### Data description

Also available at 'data\desc.txt'.

CustomerID: A unique ID that identifies each customer.

Gender: The customer’s gender: Male, Female

Age: The customer’s current age, in years, at the time the fiscal quarter ended.

Senior Citizen: Indicates if the customer is 65 or older: Yes, No

Married: Indicates if the customer is married: Yes, No

Dependents: Indicates if the customer lives with any dependents: Yes, No. Dependents could be children, parents, grandparents, etc.

Number of Dependents: Indicates the number of dependents that live with the customer.

CustomerID: A unique ID that identifies each customer.

Count: A value used in reporting/dashboarding to sum up the number of customers in a filtered set.

Country: The country of the customer’s primary residence.

State: The state of the customer’s primary residence.

City: The city of the customer’s primary residence.

Zip Code: The zip code of the customer’s primary residence.

Latitude: The latitude of the customer’s primary residence.

Longitude: The longitude of the customer’s primary residence.

Zip Code: The zip code of the customer’s primary residence.

Population: A current population estimate for the entire Zip Code area.

CustomerID: A unique ID that identifies each customer.

Count: A value used in reporting/dashboarding to sum up the number of customers in a filtered set.

Quarter: The fiscal quarter that the data has been derived from (e.g. Q3).

Referred a Friend: Indicates if the customer has ever referred a friend or family member to this company: Yes, No

Number of Referrals: Indicates the number of referrals to date that the customer has made.

Tenure in Months: Indicates the total amount of months that the customer has been with the company by the end of the quarter specified above.

Offer: Identifies the last marketing offer that the customer accepted, if applicable. Values include None, Offer A, Offer B, Offer C, Offer D, and Offer E.

Phone Service: Indicates if the customer subscribes to home phone service with the company: Yes, No

Avg Monthly Long Distance Charges: Indicates the customer’s average long distance charges, calculated to the end of the quarter specified above.

Multiple Lines: Indicates if the customer subscribes to multiple telephone lines with the company: Yes, No

Internet Service: Indicates if the customer subscribes to Internet service with the company: No, DSL, Fiber Optic, Cable.

Avg Monthly GB Download: Indicates the customer’s average download volume in gigabytes, calculated to the end of the quarter specified above.

Online Security: Indicates if the customer subscribes to an additional online security service provided by the company: Yes, No

Online Backup: Indicates if the customer subscribes to an additional online backup service provided by the company: Yes, No

Device Protection Plan: Indicates if the customer subscribes to an additional device protection plan for their Internet equipment provided by the company: Yes, No

Premium Tech Support: Indicates if the customer subscribes to an additional technical support plan from the company with reduced wait times: Yes, No

Streaming TV: Indicates if the customer uses their Internet service to stream television programing from a third party provider: Yes, No. The company does not charge an additional fee for this service.

Streaming Movies: Indicates if the customer uses their Internet service to stream movies from a third party provider: Yes, No. The company does not charge an additional fee for this service.

Streaming Music: Indicates if the customer uses their Internet service to stream music from a third party provider: Yes, No. The company does not charge an additional fee for this service.

Unlimited Data: Indicates if the customer has paid an additional monthly fee to have unlimited data downloads/uploads: Yes, No

Contract: Indicates the customer’s current contract type: Month-to-Month, One Year, Two Year.

Paperless Billing: Indicates if the customer has chosen paperless billing: Yes, No

Payment Method: Indicates how the customer pays their bill: Bank Withdrawal, Credit Card, Mailed Check

Monthly Charge: Indicates the customer’s current total monthly charge for all their services from the company.

Total Charges: Indicates the customer’s total charges, calculated to the end of the quarter specified above.

Total Refunds: Indicates the customer’s total refunds, calculated to the end of the quarter specified above.

Total Extra Data Charges: Indicates the customer’s total charges for extra data downloads above those specified in their plan, by the end of the quarter specified above.

Total Long Distance Charges: Indicates the customer’s total charges for long distance above those specified in their plan, by the end of the quarter specified above.

CustomerID: A unique ID that identifies each customer.

Count: A value used in reporting/dashboarding to sum up the number of customers in a filtered set.

Quarter: The fiscal quarter that the data has been derived from (e.g. Q3).

Satisfaction Score: A customer’s overall satisfaction rating of the company from 1 (Very Unsatisfied) to 5 (Very Satisfied).

Satisfaction Score Label: Indicates the text version of the score (1-5) as a text string.

Customer Status: Indicates the status of the customer at the end of the quarter: Churned, Stayed, or Joined

Churn Label: Yes = the customer left the company this quarter. No = the customer remained with the company. Directly related to Churn Value.

Churn Value: 1 = the customer left the company this quarter. 0 = the customer remained with the company. Directly related to Churn Label.

Churn Score: A value from 0-100 that is calculated using the predictive tool IBM SPSS Modeler. The model incorporates multiple factors known to cause churn. The higher the score, the more likely the customer will churn.

Churn Score Category: A calculation that assigns a Churn Score to one of the following categories: 0-10, 11-20, 21-30, 31-40, 41-50, 51-60, 61-70, 71-80, 81-90, and 91-100

CLTV: Customer Lifetime Value. A predicted CLTV is calculated using corporate formulas and existing data. The higher the value, the more valuable the customer. High value customers should be monitored for churn.

CLTV Category: A calculation that assigns a CLTV value to one of the following categories: 2000-2500, 2501-3000, 3001-3500, 3501-4000, 4001-4500, 4501-5000, 5001-5500, 5501-6000, 6001-6500, and 6501-7000.

Churn Category: A high-level category for the customer’s reason for churning: Attitude, Competitor, Dissatisfaction, Other, Price. When they leave the company, all customers are asked about their reasons for leaving. Directly related to Churn Reason.

Churn Reason: A customer’s specific reason for leaving the company. Directly related to Churn Category.

### Cont.

In [96]:
# Missing-value overview via the project helper ak.na (defined in aku_utils,
# not shown here). Judging by its output it reports the share of NaNs for the
# columns that contain any -- TODO confirm against the helper's source.
ak.na(df)

% of NaNs in df:


Offer            0.55
Internet Type    0.22
Churn Category   0.73
Churn Reason     0.73
dtype: float64

Despite many missing values, there is nothing to be deleted:
* NaN in Offer means that the client has not accepted any marketing offers
* NaN in Internet Type means client does not have internet service with the company
* NaN in Churn Category or Churn Reason means client hasn't churned

In [97]:
# Column count per dtype via the project helper ak.type_breakdown (defined in
# aku_utils, not shown here).
ak.type_breakdown(df)

type
object     31
int64      11
float64     8
dtype: int64

Most of our columns are of object type, but as the `head` output shows, these are mostly boolean columns holding Yes/No values.

# Preprocessing

First of all, I prefer lowercase column names joined by underscores instead of whitespace, so we change that.

In [98]:
# Normalise column names to snake_case: lowercase, split on any whitespace run,
# and re-join the pieces with underscores ("Customer ID" -> "customer_id").
df.columns = df.columns.str.lower().str.split().str.join('_')
df.columns

Index(['customer_id', 'gender', 'age', 'under_30', 'senior_citizen', 'married',
       'dependents', 'number_of_dependents', 'country', 'state', 'city',
       'zip_code', 'latitude', 'longitude', 'population', 'quarter',
       'referred_a_friend', 'number_of_referrals', 'tenure_in_months', 'offer',
       'phone_service', 'avg_monthly_long_distance_charges', 'multiple_lines',
       'internet_service', 'internet_type', 'avg_monthly_gb_download',
       'online_security', 'online_backup', 'device_protection_plan',
       'premium_tech_support', 'streaming_tv', 'streaming_movies',
       'streaming_music', 'unlimited_data', 'contract', 'paperless_billing',
       'payment_method', 'monthly_charge', 'total_charges', 'total_refunds',
       'total_extra_data_charges', 'total_long_distance_charges',
       'total_revenue', 'satisfaction_score', 'customer_status', 'churn_label',
       'churn_score', 'cltv', 'churn_category', 'churn_reason'],
      dtype='object')

customer_id is useless because it is unique for every row, and zip_code is useless because we already have latitude and longitude, so we delete these columns.

In [99]:
# Remove the row identifier and the zip code; the result is rebound to `df`.
df = df.drop(columns=['customer_id', 'zip_code'])
df

Unnamed: 0,gender,age,under_30,senior_citizen,married,dependents,number_of_dependents,country,state,city,latitude,longitude,population,quarter,referred_a_friend,number_of_referrals,tenure_in_months,offer,phone_service,avg_monthly_long_distance_charges,multiple_lines,internet_service,internet_type,avg_monthly_gb_download,online_security,online_backup,device_protection_plan,premium_tech_support,streaming_tv,streaming_movies,streaming_music,unlimited_data,contract,paperless_billing,payment_method,monthly_charge,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,satisfaction_score,customer_status,churn_label,churn_score,cltv,churn_category,churn_reason
0,Male,78,No,Yes,No,No,0,United States,California,Los Angeles,34.02,-118.16,68701,Q3,No,0,1,,No,0.00,No,Yes,DSL,8,No,No,Yes,No,No,Yes,No,No,Month-to-Month,Yes,Bank Withdrawal,39.65,39.65,0.00,20,0.00,59.65,3,Churned,Yes,91,5433,Competitor,Competitor offered more data
1,Female,74,No,Yes,Yes,Yes,1,United States,California,Los Angeles,34.04,-118.19,55668,Q3,Yes,1,8,Offer E,Yes,48.85,Yes,Yes,Fiber Optic,17,No,Yes,No,No,No,No,No,Yes,Month-to-Month,Yes,Credit Card,80.65,633.30,0.00,0,390.80,1024.10,3,Churned,Yes,69,5302,Competitor,Competitor made better offer
2,Male,71,No,Yes,No,Yes,3,United States,California,Los Angeles,34.11,-118.23,47534,Q3,No,0,18,Offer D,Yes,11.33,Yes,Yes,Fiber Optic,52,No,No,No,No,Yes,Yes,Yes,Yes,Month-to-Month,Yes,Bank Withdrawal,95.45,1752.55,45.61,0,203.94,1910.88,2,Churned,Yes,81,3179,Competitor,Competitor made better offer
3,Female,78,No,Yes,Yes,Yes,1,United States,California,Inglewood,33.94,-118.33,27778,Q3,Yes,1,25,Offer C,Yes,19.76,No,Yes,Fiber Optic,12,No,Yes,Yes,No,Yes,Yes,No,Yes,Month-to-Month,Yes,Bank Withdrawal,98.50,2514.50,13.43,0,494.00,2995.07,2,Churned,Yes,88,5337,Dissatisfaction,Limited range of services
4,Female,80,No,Yes,Yes,Yes,1,United States,California,Whittier,33.97,-118.02,26265,Q3,Yes,1,37,Offer C,Yes,6.33,Yes,Yes,Fiber Optic,14,No,No,No,No,No,No,No,Yes,Month-to-Month,Yes,Bank Withdrawal,76.50,2868.15,0.00,0,234.21,3102.36,2,Churned,Yes,67,2793,Price,Extra data charges
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,30,No,No,No,No,0,United States,California,Landers,34.34,-116.54,2182,Q3,No,0,72,,Yes,22.77,No,No,,0,No,No,No,No,No,No,No,No,Two Year,Yes,Bank Withdrawal,21.15,1419.40,19.31,0,1639.44,3039.53,5,Stayed,No,45,5306,,
7039,Male,38,No,No,Yes,Yes,2,United States,California,Adelanto,34.67,-117.54,18980,Q3,Yes,1,24,Offer C,Yes,36.05,Yes,Yes,Cable,24,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,One Year,Yes,Mailed Check,84.80,1990.50,48.23,0,865.20,2807.47,3,Stayed,No,59,2140,,
7040,Female,30,No,No,Yes,Yes,2,United States,California,Amboy,34.56,-115.64,42,Q3,Yes,4,72,,Yes,29.66,Yes,Yes,Fiber Optic,59,No,Yes,Yes,No,Yes,Yes,Yes,Yes,One Year,Yes,Credit Card,103.20,7362.90,45.38,0,2135.52,9453.04,4,Stayed,No,71,5560,,
7041,Female,32,No,No,Yes,Yes,2,United States,California,Angelus Oaks,34.17,-116.86,301,Q3,Yes,1,11,,No,0.00,No,Yes,DSL,17,Yes,No,No,No,No,No,No,Yes,Month-to-Month,Yes,Bank Withdrawal,29.60,346.45,27.24,0,0.00,319.21,4,Stayed,No,59,2793,,


Many columns only have Yes and No as values. Let's check which columns this is true for, so that we can binary-encode them.

In [100]:
# Collect the columns whose distinct values are exactly {'Yes', 'No'}.
# The set comparison alone is sufficient: a column with any other value
# (including NaN, which unique() keeps) cannot equal the two-element set, so the
# former `nunique() < 3` pre-check only made pandas compute the uniques twice.
binary_cols = [
    col for col in df.columns
    if set(df[col].unique().tolist()) == {'Yes', 'No'}
]

binary_cols

['under_30',
 'senior_citizen',
 'married',
 'dependents',
 'referred_a_friend',
 'phone_service',
 'multiple_lines',
 'internet_service',
 'online_security',
 'online_backup',
 'device_protection_plan',
 'premium_tech_support',
 'streaming_tv',
 'streaming_movies',
 'streaming_music',
 'unlimited_data',
 'paperless_billing',
 'churn_label']

In [101]:
# Binary encoding: 'Yes' -> 1, anything else -> 0.
# Note: the comparison is against the string 'Yes', so running this cell a
# second time on already-encoded columns would turn every value into 0.
for column_name in binary_cols:
    is_yes = df[column_name].eq('Yes')
    df[column_name] = is_yes.astype('int')

Let's check the number of unique values.

In [102]:
# Distinct (non-NaN) value count per column, ascending; show only the columns
# with fewer than four distinct values.
srs = df.nunique().sort_values()
srs.loc[srs.lt(4)]

country                   1
state                     1
quarter                   1
gender                    2
streaming_movies          2
streaming_tv              2
premium_tech_support      2
device_protection_plan    2
online_backup             2
online_security           2
internet_service          2
multiple_lines            2
phone_service             2
paperless_billing         2
streaming_music           2
unlimited_data            2
dependents                2
married                   2
senior_citizen            2
under_30                  2
churn_label               2
referred_a_friend         2
payment_method            3
contract                  3
customer_status           3
internet_type             3
dtype: int64

country, state and quarter have only one value. We cannot do anything with such columns, so we will drop them.

In [103]:
# These three columns hold a single value each (see the unique counts above in
# the notebook output), so they carry no information.
constant_columns = ['country', 'state', 'quarter']
df = df.drop(columns=constant_columns)
df

Unnamed: 0,gender,age,under_30,senior_citizen,married,dependents,number_of_dependents,city,latitude,longitude,population,referred_a_friend,number_of_referrals,tenure_in_months,offer,phone_service,avg_monthly_long_distance_charges,multiple_lines,internet_service,internet_type,avg_monthly_gb_download,online_security,online_backup,device_protection_plan,premium_tech_support,streaming_tv,streaming_movies,streaming_music,unlimited_data,contract,paperless_billing,payment_method,monthly_charge,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,satisfaction_score,customer_status,churn_label,churn_score,cltv,churn_category,churn_reason
0,Male,78,0,1,0,0,0,Los Angeles,34.02,-118.16,68701,0,0,1,,0,0.00,0,1,DSL,8,0,0,1,0,0,1,0,0,Month-to-Month,1,Bank Withdrawal,39.65,39.65,0.00,20,0.00,59.65,3,Churned,1,91,5433,Competitor,Competitor offered more data
1,Female,74,0,1,1,1,1,Los Angeles,34.04,-118.19,55668,1,1,8,Offer E,1,48.85,1,1,Fiber Optic,17,0,1,0,0,0,0,0,1,Month-to-Month,1,Credit Card,80.65,633.30,0.00,0,390.80,1024.10,3,Churned,1,69,5302,Competitor,Competitor made better offer
2,Male,71,0,1,0,1,3,Los Angeles,34.11,-118.23,47534,0,0,18,Offer D,1,11.33,1,1,Fiber Optic,52,0,0,0,0,1,1,1,1,Month-to-Month,1,Bank Withdrawal,95.45,1752.55,45.61,0,203.94,1910.88,2,Churned,1,81,3179,Competitor,Competitor made better offer
3,Female,78,0,1,1,1,1,Inglewood,33.94,-118.33,27778,1,1,25,Offer C,1,19.76,0,1,Fiber Optic,12,0,1,1,0,1,1,0,1,Month-to-Month,1,Bank Withdrawal,98.50,2514.50,13.43,0,494.00,2995.07,2,Churned,1,88,5337,Dissatisfaction,Limited range of services
4,Female,80,0,1,1,1,1,Whittier,33.97,-118.02,26265,1,1,37,Offer C,1,6.33,1,1,Fiber Optic,14,0,0,0,0,0,0,0,1,Month-to-Month,1,Bank Withdrawal,76.50,2868.15,0.00,0,234.21,3102.36,2,Churned,1,67,2793,Price,Extra data charges
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Female,30,0,0,0,0,0,Landers,34.34,-116.54,2182,0,0,72,,1,22.77,0,0,,0,0,0,0,0,0,0,0,0,Two Year,1,Bank Withdrawal,21.15,1419.40,19.31,0,1639.44,3039.53,5,Stayed,0,45,5306,,
7039,Male,38,0,0,1,1,2,Adelanto,34.67,-117.54,18980,1,1,24,Offer C,1,36.05,1,1,Cable,24,1,0,1,1,1,1,1,1,One Year,1,Mailed Check,84.80,1990.50,48.23,0,865.20,2807.47,3,Stayed,0,59,2140,,
7040,Female,30,0,0,1,1,2,Amboy,34.56,-115.64,42,1,4,72,,1,29.66,1,1,Fiber Optic,59,0,1,1,0,1,1,1,1,One Year,1,Credit Card,103.20,7362.90,45.38,0,2135.52,9453.04,4,Stayed,0,71,5560,,
7041,Female,32,0,0,1,1,2,Angelus Oaks,34.17,-116.86,301,1,1,11,,0,0.00,0,1,DSL,17,1,0,0,0,0,0,0,1,Month-to-Month,1,Bank Withdrawal,29.60,346.45,27.24,0,0.00,319.21,4,Stayed,0,59,2793,,


In [92]:
# List the distinct values of the gender column before encoding it.
pd.unique(df['gender'])

array(['Male', 'Female'], dtype=object)

gender is also binary, we will encode it so 'Male' means 1

In [104]:
# Encode gender as an integer flag: 'Male' -> 1, every other value -> 0.
is_male = df['gender'].eq('Male')
df['gender'] = is_male.astype('int')
df['gender']

0       1
1       0
2       1
3       0
4       0
       ..
7038    0
7039    1
7040    0
7041    0
7042    1
Name: gender, Length: 7043, dtype: int32

Additional age variables like under_30 and senior_citizen are derived from age and are therefore not useful to us, so we will drop them.

In [113]:
# Drop the two age-bucket flags; the numeric `age` column stays in the frame.
df = df.drop(columns=['under_30', 'senior_citizen'])

In [108]:
# Number of rows per customer_status value (Churned / Joined / Stayed).
df.groupby('customer_status').size()

customer_status
Churned    1869
Joined      454
Stayed     4720
dtype: int64

Information on churn is already there in variable churn_label, so having customer_status does not really make sense. We will make another binary column new_customer and drop customer_status.

In [109]:
# Flag customers whose status is 'Joined' (1) versus everything else (0),
# then show how many rows fall into each group.
joined_mask = df['customer_status'].eq('Joined')
df['new_customer'] = joined_mask.astype(int)
df['new_customer'].value_counts()

new_customer
0    6589
1     454
Name: count, dtype: int64

In [110]:
# customer_status is no longer needed once new_customer has been derived from it.
df = df.drop(columns='customer_status')

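We have churn_score, a churn-likelihood score from 0 to 100 that the dataset creator calculated with IBM SPSS Modeler (the higher the score, the more likely the customer is to churn). We will rename the column and use it as one of the benchmarks for my own model. Note that, despite the new name, the values are on a 0-100 scale rather than a 0-1 probability.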

In [115]:
# Rename the pre-computed churn score so its origin is explicit.
df = df.rename(columns={'churn_score': 'ibm_spss_churn_prob'})

In [116]:
# Display the frame as it stands after the preprocessing steps above.
df

Unnamed: 0,gender,age,married,dependents,number_of_dependents,city,latitude,longitude,population,referred_a_friend,number_of_referrals,tenure_in_months,offer,phone_service,avg_monthly_long_distance_charges,multiple_lines,internet_service,internet_type,avg_monthly_gb_download,online_security,online_backup,device_protection_plan,premium_tech_support,streaming_tv,streaming_movies,streaming_music,unlimited_data,contract,paperless_billing,payment_method,monthly_charge,total_charges,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue,satisfaction_score,churn_label,ibm_spss_churn_prob,cltv,churn_category,churn_reason,new_customer
0,1,78,0,0,0,Los Angeles,34.02,-118.16,68701,0,0,1,,0,0.00,0,1,DSL,8,0,0,1,0,0,1,0,0,Month-to-Month,1,Bank Withdrawal,39.65,39.65,0.00,20,0.00,59.65,3,1,91,5433,Competitor,Competitor offered more data,0
1,0,74,1,1,1,Los Angeles,34.04,-118.19,55668,1,1,8,Offer E,1,48.85,1,1,Fiber Optic,17,0,1,0,0,0,0,0,1,Month-to-Month,1,Credit Card,80.65,633.30,0.00,0,390.80,1024.10,3,1,69,5302,Competitor,Competitor made better offer,0
2,1,71,0,1,3,Los Angeles,34.11,-118.23,47534,0,0,18,Offer D,1,11.33,1,1,Fiber Optic,52,0,0,0,0,1,1,1,1,Month-to-Month,1,Bank Withdrawal,95.45,1752.55,45.61,0,203.94,1910.88,2,1,81,3179,Competitor,Competitor made better offer,0
3,0,78,1,1,1,Inglewood,33.94,-118.33,27778,1,1,25,Offer C,1,19.76,0,1,Fiber Optic,12,0,1,1,0,1,1,0,1,Month-to-Month,1,Bank Withdrawal,98.50,2514.50,13.43,0,494.00,2995.07,2,1,88,5337,Dissatisfaction,Limited range of services,0
4,0,80,1,1,1,Whittier,33.97,-118.02,26265,1,1,37,Offer C,1,6.33,1,1,Fiber Optic,14,0,0,0,0,0,0,0,1,Month-to-Month,1,Bank Withdrawal,76.50,2868.15,0.00,0,234.21,3102.36,2,1,67,2793,Price,Extra data charges,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,30,0,0,0,Landers,34.34,-116.54,2182,0,0,72,,1,22.77,0,0,,0,0,0,0,0,0,0,0,0,Two Year,1,Bank Withdrawal,21.15,1419.40,19.31,0,1639.44,3039.53,5,0,45,5306,,,0
7039,1,38,1,1,2,Adelanto,34.67,-117.54,18980,1,1,24,Offer C,1,36.05,1,1,Cable,24,1,0,1,1,1,1,1,1,One Year,1,Mailed Check,84.80,1990.50,48.23,0,865.20,2807.47,3,0,59,2140,,,0
7040,0,30,1,1,2,Amboy,34.56,-115.64,42,1,4,72,,1,29.66,1,1,Fiber Optic,59,0,1,1,0,1,1,1,1,One Year,1,Credit Card,103.20,7362.90,45.38,0,2135.52,9453.04,4,0,71,5560,,,0
7041,0,32,1,1,2,Angelus Oaks,34.17,-116.86,301,1,1,11,,0,0.00,0,1,DSL,17,1,0,0,0,0,0,0,1,Month-to-Month,1,Bank Withdrawal,29.60,346.45,27.24,0,0.00,319.21,4,0,59,2793,,,0


No categorical encoding will be done here in case I train a model that has native category support, like CatBoost.

In [118]:
# Persist the preprocessed frame to ../data/telco_processed.csv, resolved
# against the current working directory. (The original prefixed the path with
# os.path.dirname('__file__'), which is dirname of a string literal and always
# '' -- the resulting path is unchanged.) index=False keeps the row index out
# of the file.
df.to_csv(os.path.join(os.path.pardir, 'data', 'telco_processed.csv'), index=False)

# Insights

* Dataset contains personal information on customers, not only the information that reflects their relationship with the company, which will allow us to build a portrait of a customer
* The information on types of services clients use is very deep, which will allow us to better understand what it is that our customers are unsatisfied with
* The dataset is very clean and did not need any significant processing
* We have a benchmark built right into the dataset, which is very good for model evaluation purposes
