In [1]:
#!pip install imbalanced-learn

In [2]:
import sklearn
print(sklearn.__version__)

1.5.0


In [3]:
import numpy as np
import tensorflow as tf
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import pickle


In [4]:
# np.random.seed(47)
# tf.random.set_seed(6)

In [5]:
df = pd.read_csv("telco_customer_churn.csv")

In [6]:
dff = df.copy()

In [7]:
dff = dff.drop(columns=["customerID"])

In [8]:
categorical_columns = dff.select_dtypes(include="object").columns
categorical_columns

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn'],
      dtype='object')

In [9]:
# Converting All Value That Contain No .... Service Into No
dff.replace(["No internet service", "No phone service"], "No", inplace=True)

In [10]:
for i in categorical_columns:
    print(f"{i}:{dff[i].unique()}")

gender:['Female' 'Male']
Partner:['Yes' 'No']
Dependents:['No' 'Yes']
PhoneService:['No' 'Yes']
MultipleLines:['No' 'Yes']
InternetService:['DSL' 'Fiber optic' 'No']
OnlineSecurity:['No' 'Yes']
OnlineBackup:['Yes' 'No']
DeviceProtection:['No' 'Yes']
TechSupport:['No' 'Yes']
StreamingTV:['No' 'Yes']
StreamingMovies:['No' 'Yes']
Contract:['Month-to-month' 'One year' 'Two year']
PaperlessBilling:['Yes' 'No']
PaymentMethod:['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
TotalCharges:['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
Churn:['No' 'Yes']


In [11]:
dff['TotalCharges'] = pd.to_numeric(dff['TotalCharges'], errors='coerce')

In [12]:
# Mengisi nilai yang hilang dengan median
dff['TotalCharges'].fillna(dff['TotalCharges'].median(), inplace=True)

In [13]:
dff.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [14]:
# Encoding Values in Columns That Contains [Yes, No] With [1, 0]
yes_no_mapping = {
    "No": 0,
    "Yes": 1
}

try:
    for i in categorical_columns:
        flag = (dff[i].str.contains("No", regex=True).sum() > 0) and (dff[i].str.contains("Yes", regex=True).sum() > 0)
        if flag:
            dff[i] = dff[i].map(yes_no_mapping)
            
except Exception as e:
    print("All Columns Already Encoding")

All Columns Already Encoding


In [15]:
missing_values = dff.isna().sum()
print(missing_values)

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [16]:
X = dff.drop(columns=["Churn"])
y = dff["Churn"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=80) #80

In [17]:
categorical_features = X_train.select_dtypes("object").columns
categorical_preprocessing = Pipeline(steps=[
    ("Encdong", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [18]:
numerical_features = X_train.select_dtypes("number").columns
numerical_preprocessing = Pipeline(steps=[
    ("Scaling", MinMaxScaler())
])

In [19]:
transformer = ColumnTransformer(
    transformers=[
        ('Categorical', categorical_preprocessing, categorical_features),
        ('Numerical', numerical_preprocessing, numerical_features)
], remainder='passthrough')

transformer

In [20]:
# # Save Transformer
# pd.to_pickle(transformer, "transformer.pkl")

In [21]:
dff.dtypes

gender               object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [22]:
# Fit Transformer
transformer.fit(X_train)
# Save Transformer
with open('transformer.pkl', 'wb') as f:
    pickle.dump(transformer, f)

Build Model

In [24]:
# Transform the data
X_train_transformed = transformer.transform(X_train)
X_test_transformed = transformer.transform(X_test)

In [25]:
# Define input dimension
input_dim = X_train_transformed.shape[1]
# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(18, activation="relu", input_dim=input_dim),
    tf.keras.layers.Dense(18, activation="relu"),
    tf.keras.layers.Dense(4, activation="relu"),
    tf.keras.layers.Dense(2, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
#compile model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_transformed, y_test)
print(f"Test Accuracy: {accuracy}")

In [None]:
model.summary()

In [None]:
from tensorflow.keras.models import load_model

model = model.save('model.h5')