In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import scipy.stats as stats

2024-07-03 15:26:37.700934: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load dataset
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")[:1000]

# Preprocessing
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Define categorical columns and encode them using OneHotEncoder
categorical_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 
                       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
                       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 
                       'Contract', 'PaperlessBilling', 'PaymentMethod']

df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

In [3]:
# Apply one-hot encoding
one_hot_enc = OneHotEncoder()
transformer = ColumnTransformer([('one_hot_enc', one_hot_enc, categorical_columns)], remainder='passthrough')
df_transformed = transformer.fit_transform(df.drop(columns=['customerID', 'Churn']))

# Split data into features and target
X = pd.DataFrame(df_transformed)
y = df['Churn']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
# Define models and their hyperparameters
models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'C': stats.loguniform(0.001, 1000),
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
            'max_iter': [10000],
            'class_weight': [None, 'balanced']
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [50, 100, 200, 300],
            'max_depth': [5, 10, 20, None],
            'class_weight': [None, 'balanced']
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': stats.loguniform(0.1, 100),
            'kernel': ['linear', 'rbf'],
            'class_weight': [None, 'balanced']
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'class_weight': [None, 'balanced']
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9, 11, 13],
            'weights': ['uniform', 'distance']
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': stats.loguniform(1e-9, 1e-5)
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(),
        'params': {
            'learning_rate': stats.loguniform(0.01, 0.2),
            'max_depth': [3, 5, 7, 10],
            'n_estimators': [100, 200, 300]
        }
    }
}

In [5]:
# Define a function to perform RandomizedSearchCV
def perform_random_search(model, param_dist, X_train, y_train, n_iter=100):
    clf = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
    clf.fit(X_train, y_train)
    return clf.best_estimator_, clf.best_params_

In [None]:
# Perform hyperparameter tuning
best_models = {}
for model_name, model_details in models.items():
    model = model_details['model']
    param_dist = model_details['params']
    best_model, best_params = perform_random_search(model, param_dist, X_train, y_train)
    best_models[model_name] = best_model
    print(f"Best parameters for {model_name}: {best_params}\n")

# Evaluate models
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

# Neural Network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

Best parameters for LogisticRegression: {'C': 4.418441521199722, 'class_weight': None, 'max_iter': 10000, 'penalty': 'l1', 'solver': 'liblinear'}

Best parameters for RandomForestClassifier: {'n_estimators': 300, 'max_depth': 10, 'class_weight': None}



In [None]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=2, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")
print("Neural Network Classification Report:")
print(classification_report(y_test, y_pred_nn))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nn))

In [None]:
# Visualization
# Plotting the confusion matrix for the best model
best_model_name = max(best_models, key=lambda name: accuracy_score(y_test, best_models[name].predict(X_test)))
best_model = best_models[best_model_name]
conf_matrix = confusion_matrix(y_test, best_model.predict(X_test))
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title(f'{best_model_name} Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Plotting feature importance for the best model
if hasattr(best_model, 'feature_importances_'):
    feature_importances = best_model.feature_importances_
    plt.figure(figsize=(12, 8))
    sns.barplot(x=feature_importances, y=range(len(feature_importances)))
    plt.title(f'{best_model_name} Feature Importances')
    plt.xlabel('Importance Score')
    plt.ylabel('Feature Index')
    plt.show()

# Plotting the training history of the neural network
plt.figure(figsize=(12, 8))
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Neural Network Training History')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Additional visualizations
# Distribution of churn
plt.figure(figsize=(10, 7))
sns.countplot(x='Churn', data=df)
plt.title('Distribution of Churn')
plt.show()

# Distribution of contract types
plt.figure(figsize=(10, 7))
sns.countplot(x='Contract', data=df)
plt.title('Distribution of Contract Types')
plt.show()

# Correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()