In [3]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd



base_df = pd.read_csv("/home/samir/Desktop/rudraAnalytics/sub_projects/churn/data/data.csv")
pd.set_option("display.max_columns", None)

df = base_df.copy()
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['SeniorCitizen'] = df['SeniorCitizen'].astype(str)


def one_hot_encode(df, encode_set=[], dont_encode=[]):
    encoded_df = df.copy()
    for column in encode_set:
        if column in dont_encode:
            continue
        # print(len(df[column].unique()))
        if df[column].dtype == 'object':
            if len(df[column].unique()) == 2:
                # Binary encoding (0 and 1)
                encoded_df[column] = pd.get_dummies(df[column], drop_first=True)
            else:
                # One-hot encoding and using 0 and 1 instead of True and False
                one_hot_encoded = pd.get_dummies(df[column], prefix=column, drop_first=False)
                one_hot_encoded.columns = [f"{column}{i+1}" for i in range(one_hot_encoded.shape[1])]
                encoded_df = pd.concat([encoded_df, one_hot_encoded], axis=1)
                encoded_df.drop(column, axis=1, inplace=True)
                # print(encoded_df)

    return encoded_df

features = [
    'customerID',
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'MonthlyCharges',
    'TotalCharges',
    'Churn'
]
dont_label = ['customerID', 'tenure', 'MonthlyCharges', 'TotalCharges']

encoded_df = one_hot_encode(df.drop('customerID', axis=1), features, dont_label)
encoded_df['TotalCharges'].fillna(encoded_df['TotalCharges'].mean(), inplace=True)


X = encoded_df.drop('Churn', axis=1)
y = encoded_df['Churn']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CatBoost model
catboost_model = CatBoostClassifier(random_state=42)

# Train the model
catboost_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_catboost = catboost_model.predict(X_test)

# Evaluate the model
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"CatBoost Test Accuracy: {accuracy_catboost * 100:.2f}%")



Learning rate set to 0.021554
0:	learn: 0.6776654	total: 2.22ms	remaining: 2.22s
1:	learn: 0.6637381	total: 4.34ms	remaining: 2.17s
2:	learn: 0.6505523	total: 8.12ms	remaining: 2.7s
3:	learn: 0.6381802	total: 10.3ms	remaining: 2.57s
4:	learn: 0.6267011	total: 13.1ms	remaining: 2.61s
5:	learn: 0.6144146	total: 15.8ms	remaining: 2.61s
6:	learn: 0.6044831	total: 17.9ms	remaining: 2.53s
7:	learn: 0.5940211	total: 19.9ms	remaining: 2.46s
8:	learn: 0.5847047	total: 22ms	remaining: 2.42s
9:	learn: 0.5756859	total: 24.4ms	remaining: 2.42s
10:	learn: 0.5673480	total: 26.9ms	remaining: 2.42s
11:	learn: 0.5598353	total: 29ms	remaining: 2.39s
12:	learn: 0.5528500	total: 31.1ms	remaining: 2.36s
13:	learn: 0.5467907	total: 33.3ms	remaining: 2.35s
14:	learn: 0.5409311	total: 38.3ms	remaining: 2.51s
15:	learn: 0.5346617	total: 40.5ms	remaining: 2.49s
16:	learn: 0.5289346	total: 43.8ms	remaining: 2.53s
17:	learn: 0.5235270	total: 46ms	remaining: 2.51s
18:	learn: 0.5176898	total: 48.2ms	remaining: 2.49s

TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=[False  True] and y_pred=['False' 'True']. Make sure that the predictions provided by the classifier coincides with the true labels.

LightGBM Test Accuracy: 79.49%