In [None]:
import pandas as pd 

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [9]:
#--- Step 1: Data Type Conversion and Missing Value Handling ---
# The 'TotalCharges' column is of object type and contains spaces.
# These spaces represent missing values and need to be handled.
# We'll convert these spaces to NaN and then drop the rows.
df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan)
df.dropna(subset=['TotalCharges'], inplace=True)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

# --- Step 2: Drop Unnecessary Columns ---
# The 'customerID' column is unique to each customer and does not
# provide any predictive value for the model.
df.drop('customerID', axis=1, inplace=True)

# --- Step 3: Categorical Feature Encoding ---
# The target variable 'Churn' is a categorical string ('Yes', 'No').
# We'll convert it to a numerical format (1, 0) for model training.
df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0})

print("Preprocessing complete. Here is the info of the cleaned dataset:")
print(df.info())
print("\nFirst 5 rows of the preprocessed DataFrame:")
print(df.head())

Preprocessing complete. Here is the info of the cleaned dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-nu

  df['Churn'] = df['Churn'].replace({'Yes': 1, 'No': 0})


In [10]:
# List of columns to convert
columns_to_convert = ['Partner', 'Dependents', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling']

# Convert 'Yes' to 1 and 'No' to 0 for the specified columns
for col in columns_to_convert:
    df[col] = df[col].replace({'Yes': 1, 'No': 0})

print("Columns converted successfully. Here is the first 5 rows of the updated DataFrame:")
print(df.head())

Columns converted successfully. Here is the first 5 rows of the updated DataFrame:
   gender  SeniorCitizen  Partner  Dependents  tenure PhoneService  \
0  Female              0        1           0       1           No   
1    Male              0        0           0      34          Yes   
2    Male              0        0           0       2          Yes   
3    Male              0        0           0      45           No   
4  Female              0        0           0       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL              0            1   
1                No             DSL              1            0   
2                No             DSL              1            1   
3  No phone service             DSL              1            0   
4                No     Fiber optic              0            0   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0              

  df[col] = df[col].replace({'Yes': 1, 'No': 0})


In [11]:
df_encoded = pd.get_dummies(df, drop_first=True, dtype=int)

print("All categorical columns have been converted to numerical format.")
print("\nHere is the information about the new encoded DataFrame:")
print(df_encoded.info())
print("\nHere are the first 5 rows of the new DataFrame with all numerical values:")
print(df_encoded.head())

All categorical columns have been converted to numerical format.

Here is the information about the new encoded DataFrame:
<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          7032 non-null   int64  
 1   Partner                                7032 non-null   int64  
 2   Dependents                             7032 non-null   int64  
 3   tenure                                 7032 non-null   int64  
 4   PaperlessBilling                       7032 non-null   int64  
 5   MonthlyCharges                         7032 non-null   float64
 6   TotalCharges                           7032 non-null   float64
 7   Churn                                  7032 non-null   int64  
 8   gender_Male                            7032 non-null   int32  
 9   PhoneService_Yes      

In [12]:
from sklearn.preprocessing import LabelEncoder

# Binary columns
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
le = LabelEncoder()
for col in binary_cols:
    df[col] = le.fit_transform(df[col])

# One-Hot Encoding for multi-category columns
cat_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
            'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
            'Contract', 'PaymentMethod']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[num_cols] = scaler.fit_transform(df[num_cols])

In [15]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.3/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 985.5 kB/s eta 0:00:58
    --------------------------------------- 0.8/56.8 MB 1.0 MB/s eta 0:00:54
    --------------------------------------- 1.0/56.8 MB 1.2 MB/s eta 0:00:48
    --------------------------------------- 1.3/56.8 MB 1.2 MB/s eta 0:00:46
   - -------------------------------------- 1.6/56.8 MB 1.2 MB/s eta 0:00:47
   - -------------------------------------- 1.8/56.8 MB 1.3 MB/s eta 0:00:44
   - -------------------------------------- 2.4/56.8 MB 1.4 MB/s eta 0:00:39
   - -------------------------------------- 2.6/56.8 MB 1.4 MB/s eta 0:00:39
   -- -------------------

In [17]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp311-cp311-win_amd64.whl (102.5 MB)
   ---------------------------------------- 0.0/102.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/102.5 MB ? eta -:--:--
   ---------------------------------------- 0.3/102.5 MB ? eta -:--:--
   ---------------------------------------- 0.3/102.5 MB ? eta -:--:--
   ---------------------------------------- 0.8/102.5 MB 1.1 MB/s eta 0:01:35
   ---------------------------------------- 1.0/102.5 MB 1.2 MB/s eta 0:01:27
    --------------------------------------- 1.3/102.5 MB 1.3 MB/s eta 0:01:20
    --------------------------------------- 1.6/102.5 MB 1.4 MB/s eta 0:01:14
    --------------------------------------- 2.1/102.5 MB 1.4 MB/s eta 0:01:11
    --------------------------------------- 2.4/102.5 MB 1.5 MB/s eta 0

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import xgboost as xgb
import catboost as cb
import pickle
import warnings
warnings.filterwarnings('ignore')

# This is a mock-up of your preprocessed data since I cannot access the actual file.
# In a real scenario, you would already have your `df` ready.
# We will create a sample DataFrame with the same structure you described.
data = {
    'tenure': np.random.rand(100) * 72,
    'MonthlyCharges': np.random.rand(100) * 100,
    'TotalCharges': np.random.rand(100) * 7000,
    'gender': np.random.randint(0, 2, 100),
    'Partner': np.random.randint(0, 2, 100),
    'Dependents': np.random.randint(0, 2, 100),
    'PhoneService': np.random.randint(0, 2, 100),
    'PaperlessBilling': np.random.randint(0, 2, 100),
    'Churn': np.random.randint(0, 2, 100)
}
for col in ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']:
    data[f'{col}_cat_1'] = np.random.randint(0, 2, 100)
    data[f'{col}_cat_2'] = np.random.randint(0, 2, 100)

df = pd.DataFrame(data)
df['TotalCharges'] = df['TotalCharges'].astype(float)


# --- Step 1: Prepare the data for modeling ---
# Separate features (X) and target (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary to store model results
model_results = {}
best_model = None
best_f1 = 0

# Function to train and evaluate a model
def train_and_evaluate(model, model_name):
    """
    Trains the given model, evaluates its performance, and stores the metrics.
    """
    print(f"--- Training {model_name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    model_results[model_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': cm
    }

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("-" * 30)

    global best_f1, best_model
    if f1 > best_f1:
        best_f1 = f1
        best_model = model
        print(f"New best model: {model_name} with F1-score: {f1:.4f}")

# --- Step 2: Train and evaluate each model ---

# 1. Decision Tree
dt_model = DecisionTreeClassifier(random_state=20)
train_and_evaluate(dt_model, "Decision Tree")

# 2. Random Forest
rf_model = RandomForestClassifier(random_state=20)
train_and_evaluate(rf_model, "Random Forest")

# 3. AdaBoost
ab_model = AdaBoostClassifier(random_state=20)
train_and_evaluate(ab_model, "AdaBoost")

# 4. XGBoost
xgb_model = xgb.XGBClassifier(random_state=55, eval_metric='logloss', use_label_encoder=False)
train_and_evaluate(xgb_model, "XGBoost")

# 5. CatBoost
cb_model = cb.CatBoostClassifier(random_state=74, verbose=0)
train_and_evaluate(cb_model, "CatBoost")

# --- Step 3: Find the best model and save it ---
best_model_name = max(model_results, key=lambda k: model_results[k]['f1_score'])
print(f"\nConclusion: The best model is {best_model_name} with an F1-score of {model_results[best_model_name]['f1_score']:.4f}.")

# Save the best model using pickle
if best_model:
    with open('best_model.pkl', 'wb') as f:
        pickle.dump(best_model, f)
    print("The best model has been saved to 'best_model.pkl'.")

--- Training Decision Tree ---
Accuracy: 0.5000
Precision: 0.6000
Recall: 0.2727
F1-score: 0.3750
Confusion Matrix:
[[7 2]
 [8 3]]
------------------------------
New best model: Decision Tree with F1-score: 0.3750
--- Training Random Forest ---
Accuracy: 0.4500
Precision: 0.5000
Recall: 0.0909
F1-score: 0.1538
Confusion Matrix:
[[ 8  1]
 [10  1]]
------------------------------
--- Training AdaBoost ---
Accuracy: 0.5500
Precision: 0.7500
Recall: 0.2727
F1-score: 0.4000
Confusion Matrix:
[[8 1]
 [8 3]]
------------------------------
New best model: AdaBoost with F1-score: 0.4000
--- Training XGBoost ---
Accuracy: 0.5000
Precision: 0.5714
Recall: 0.3636
F1-score: 0.4444
Confusion Matrix:
[[6 3]
 [7 4]]
------------------------------
New best model: XGBoost with F1-score: 0.4444
--- Training CatBoost ---
Accuracy: 0.4500
Precision: 0.5000
Recall: 0.0909
F1-score: 0.1538
Confusion Matrix:
[[ 8  1]
 [10  1]]
------------------------------

Conclusion: The best model is XGBoost with an F1-sco