<img src="https://github.com/SimonSaban/image/blob/9a8eadf72b7bd40c779bc825c4faf825bafa7b53/soar-logo.png?raw=true" align="left" style="display: inline-block; margin-left: 10px; height: 180px;">
<h1 style="display: inline-block; color: #163E67; margin-left: 80px;">Swan Teleco - Tensor Flow Model for Customer Churn</h1>
<h2 align= "left">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;By: Aniko, Oaiss, Rachel & Simon</h2>

<h2 align= "left" style="color: #163E67">Prepare the data for modelling</h2>

In [1]:
# Install tensorflow_decision_forests
!pip install tensorflow_decision_forests

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow~=2.16.1 (from tensorflow_decision_forests)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting wurlitzer (from tensorflow_decision_forests)
  Downloading wurlitzer-3.1.0-py3-none-any.whl (8.4 kB)
Collecting tf-keras~=2.16 (from tensorflow_decision_forests)
  Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ydf (from tensorflow_decision_forests)
  Downloading ydf-0.4.3-cp310-

In [2]:
# Import required modules
import tensorflow_decision_forests as tfdf
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

In [18]:
# Read in the data
df = pd.read_excel("1 - Project Data.xlsx")

In [4]:
# View the data
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,Competitor had better devices


In [5]:
# Check the data types
df.dtypes

CustomerID            object
Count                  int64
Country               object
State                 object
City                  object
Zip Code               int64
Lat Long              object
Latitude             float64
Longitude            float64
Gender                object
Senior Citizen        object
Partner               object
Dependents            object
Tenure Months          int64
Phone Service         object
Multiple Lines        object
Internet Service      object
Online Security       object
Online Backup         object
Device Protection     object
Tech Support          object
Streaming TV          object
Streaming Movies      object
Contract              object
Paperless Billing     object
Payment Method        object
Monthly Charges      float64
Total Charges         object
Churn Label           object
Churn Value            int64
Churn Reason          object
dtype: object

### In case we want to keep the total charges column (although we found that it's best to remove it).

In [None]:
# Replace the whitespaces with nulls
#df['Total Charges'].replace(r'^\s*$', np.nan, regex=True, inplace = True)

In [None]:
# Check that there are no more entries with just a whitespace
#df['Total Charges'][df['Total Charges'] == " "]

Series([], Name: Total Charges, dtype: float64)

In [None]:
# Change the strings to floats
#df['Total Charges'] = df['Total Charges'].astype(float)

In [None]:
# Replace the nulls with the median of the column
#df['Total Charges'] = df['Total Charges'].fillna(df['Total Charges'].median())

<h2 align= "left" style="color: #163E67">Selecting features and target, train-test split</h2>

In [19]:
# We want to keep customer ID but don't want to use it in the model
# Set it as the index of the dataframe
df = df.set_index('CustomerID')

In [20]:
# Drop some columns
feature = list(df.columns)
feature.remove('Churn Label')  # same information as churn value
feature.remove('Churn Value')  # this is the target
feature.remove('Country')      # same in all the rows
feature.remove('State')        # same in all the rows
feature.remove('Count')        # same in all the rows
feature.remove('Churn Reason') # a lot of nulls (for existing customers)
feature.remove('Lat Long')     # unnecessary
feature.remove('Latitude')     # unnecessary
feature.remove('Longitude')    # unnecessary
feature.remove('Total Charges')# unnecessary, total charges = monthly charges * tenure length
feature.remove('City')         # unnecessary

In [21]:
# Select the features and the target
y = df['Churn Value']
X = df[feature]

In [9]:
# Check what we've done
feature

['Zip Code',
 'Gender',
 'Senior Citizen',
 'Partner',
 'Dependents',
 'Tenure Months',
 'Phone Service',
 'Multiple Lines',
 'Internet Service',
 'Online Security',
 'Online Backup',
 'Device Protection',
 'Tech Support',
 'Streaming TV',
 'Streaming Movies',
 'Contract',
 'Paperless Billing',
 'Payment Method',
 'Monthly Charges']

In [22]:
## We do the split, then bring back the targets because tensorflow needs them
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1204, stratify=y)
X_train['Churn_Value'] = y_train
X_test['Churn_Value'] = y_test

In [23]:
# Check the columns in X_test
X_test.columns

Index(['Zip Code', 'Gender', 'Senior Citizen', 'Partner', 'Dependents',
       'Tenure Months', 'Phone Service', 'Multiple Lines', 'Internet Service',
       'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
       'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charges', 'Churn_Value'],
      dtype='object')

<h2 align= "left" style="color: #163E67">Modelling</h2>

In [24]:
# Work with datasets instead of dataframes to optimise runtime
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_train, label='Churn_Value')





In [25]:
# Choose a model
model = tfdf.keras.RandomForestModel()
# Fit the model to the training set
model.fit(train_ds)

Use /tmp/tmpdft2wl3y as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.364142. Found 5634 examples.
Training model...
Model trained in 0:00:03.896457
Compiling model...
Model compiled.


<tf_keras.src.callbacks.History at 0x7fa5cc995570>

In [26]:
# Turn the test data into a dataset
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(X_test, label="Churn_Value")





In [32]:
# We compile the model and focus on 'accuracy' as our primary metric
model.compile(metrics=["accuracy"])
# Evaluate the model performance on the test set
print(model.evaluate(test_ds))

[0.0, 0.8069552779197693]


In [33]:
# Check the model performance on the train set
print(model.evaluate(train_ds))

[0.0, 0.9348597526550293]


In [34]:
# Display the decision tree
tfdf.model_plotter.plot_model_in_colab(model, tree_idx=0)

In [35]:
# A comprehensive summary of the model
model.summary()

Model: "random_forest_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (19):
	Contract
	Dependents
	Device_Protection
	Gender
	Internet_Service
	Monthly_Charges
	Multiple_Lines
	Online_Backup
	Online_Security
	Paperless_Billing
	Partner
	Payment_Method
	Phone_Service
	Senior_Citizen
	Streaming_Movies
	Streaming_TV
	Tech_Support
	Tenure_Months
	Zip_Code

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.     "Tenure_Months"  0.232549 ################
    2.          "Contract"  0.231207 ###############
    3.   "Monthly_Charges"  0.152519 #######
    4.        "Dependents"  0.152491 #######
    5.   "Online_Security"  0.146040 ######
    6.  "Internet_Se

<h2 align= "left" style="color: #163E67">Fine tuning the model</h2>

In [36]:
# Fine tune the model with tensorflow's built-in fine tuner
tuner = tfdf.tuner.RandomSearch(num_trials=20)

In [37]:
# Fit the improved model the to train set
model_improv = tfdf.keras.RandomForestModel(tuner=tuner)
model_improv.fit(train_ds)

Use /tmp/tmporf6oevk as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.370149. Found 5634 examples.
Training model...
Model trained in 0:00:04.319175
Compiling model...
Model compiled.


<tf_keras.src.callbacks.History at 0x7fa5c86daec0>

In [38]:
# Evaluate the improved model on the test set
model_improv.compile(metrics=["accuracy"])
print(model_improv.evaluate(test_ds))

[0.0, 0.8069552779197693]


<h2 align= "left" style="color: #163E67">Producing the .csv files</h2>

In [39]:
# Calculate the churn probabilities for the training set
X_train['Probability of Churning'] = model_improv.predict(train_ds)



In [40]:
# Calculate the churn probabilities for the test set
X_test['Probability of Churning'] = model_improv.predict(test_ds)





In [41]:
# Put the train and test sets together
final_df = pd.concat([X_train, X_test])

In [42]:
# Check that it worked
final_df.head()

Unnamed: 0_level_0,Zip Code,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Churn_Value,Probability of Churning
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3910-MRQOY,95555,Female,No,Yes,No,72,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),19.4,0,0.0
8224-UAXBZ,95827,Female,No,Yes,No,58,No,No phone service,DSL,Yes,...,No,No,Yes,No,One year,Yes,Electronic check,45.35,0,0.043333
5816-QVHRX,94579,Female,No,No,No,37,Yes,Yes,Fiber optic,No,...,No,Yes,Yes,Yes,Month-to-month,No,Credit card (automatic),100.3,0,0.286666
6569-KTMDU,95826,Female,No,No,No,1,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,19.1,0,0.16
0310-VQXAM,92404,Male,No,No,No,9,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,19.85,0,0.05


In [43]:
# Check the dimensions of the dataframe
final_df.shape

(7043, 21)

In [44]:
# Reset the index so customer ID turns from index to a column in the dataframe
final_df = final_df.reset_index()


In [45]:
# Select only the customer ID and the probability of churning columns
final_df = final_df[['CustomerID', 'Probability of Churning']]

In [46]:
# Sort by probability, from highest to lowest
final_df = final_df.sort_values(by = 'Probability of Churning', ascending = False)

In [47]:
# Check the first five to make sure the sorting worked
final_df.head()

Unnamed: 0,CustomerID,Probability of Churning
2999,9282-IZGQK,0.999999
3803,1069-XAIEM,0.996666
5506,8149-RSOUN,0.996666
4839,8740-CRYFY,0.996666
4574,7294-TMAOP,0.993333


In [48]:
# Save it into a .csv file
final_df.to_csv('churn_rates.csv', index = False)

In [49]:
# Select the 500 customers with the highest probability of churning
top_500 = final_df.head(500)

In [50]:
# Check that the result has 500 rows and 2 columns
top_500.shape

(500, 2)

In [51]:
# Check the last 5 entries
top_500.tail()

Unnamed: 0,CustomerID,Probability of Churning
4980,7903-CMPEY,0.813333
1053,6210-KBBPI,0.813333
154,3045-XETSH,0.813333
1740,1450-GALXR,0.813333
1117,7028-DVOIQ,0.813333


In [52]:
# Save it into a .csv file
top_500.to_csv('top_500.csv', index = False)