In [None]:
%pip install tensorflow

In [1]:
#WA_Fn-UseC_-Telco-Customer-Churn.csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")#[:10000]

In [2]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

In [3]:
label_encoders = {}
categorical_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
rf_classifier = RandomForestClassifier(n_estimators=100)#, random_state=42)

In [4]:
X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)#, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
#print(classification_report(y_test, y_pred_rf))
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],  # 'liblinear' and 'saga' solvers support both L1 and L2 penalties
            'max_iter': [100, 200, 500, 1000],  # Increase max_iter for convergence
            'class_weight': [None, 'balanced']  # Handle class imbalance
        }
    },'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 20, None],
            'class_weight': [None, 'balanced']
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf'],
            'class_weight': [None, 'balanced']
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'class_weight': [None, 'balanced']
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance']
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': [1e-09, 1e-08, 1e-07, 1e-06, 1e-05]
        }
    }
}

In [5]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

# Define a function to perform RandomizedSearchCV
def perform_random_search(model, param_dist, X_train, y_train, n_iter=100):
    clf = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
    clf.fit(X_train, y_train)
    return clf.best_estimator_, clf.best_params_

# Define parameter distributions for RandomizedSearchCV
param_dists = {
    'LogisticRegression': {
        'C': stats.loguniform(0.001, 1000),
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200, 500, 1000],
        'class_weight': [None, 'balanced']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [5, 10, 20, None],
        'class_weight': [None, 'balanced']
    },
    'SVC': {
        'C': stats.loguniform(0.1, 100),
        'kernel': ['linear', 'rbf'],
        'class_weight': [None, 'balanced']
    },
    'DecisionTreeClassifier': {
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10],
        'class_weight': [None, 'balanced']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9, 11, 13],
        'weights': ['uniform', 'distance']
    },
    'GaussianNB': {
        'var_smoothing': stats.loguniform(1e-9, 1e-5)
    }
}

best_models = {}
best_params = {}

for model_name, param_dist in param_dists.items():
    model = models[model_name]['model']
    best_model, best_param = perform_random_search(model, param_dist, X_train, y_train)
    best_models[model_name] = best_model
    best_params[model_name] = best_param
    print(f"Best parameters for {model_name}: {best_param}\n")

# Evaluate models
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Best parameters for LogisticRegression: {'C': 0.370770258634798, 'class_weight': None, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}





Best parameters for RandomForestClassifier: {'n_estimators': 100, 'max_depth': 10, 'class_weight': None}

Best parameters for SVC: {'C': 0.5163124910488018, 'class_weight': None, 'kernel': 'rbf'}





Best parameters for DecisionTreeClassifier: {'min_samples_split': 2, 'max_depth': 5, 'class_weight': None}





Best parameters for KNeighborsClassifier: {'weights': 'uniform', 'n_neighbors': 11}

Best parameters for GaussianNB: {'var_smoothing': 3.148911647956859e-08}


LogisticRegression Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1036
           1       0.64      0.56      0.60       373

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.73      1409
weighted avg       0.79      0.80      0.80      1409

Confusion Matrix:
[[919 117]
 [163 210]]

RandomForestClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1036
           1       0.68      0.55      0.61       373

    accuracy                           0.81      1409
   macro avg       0.76      0.73      0.74      1409
weighted avg       0.80      0.81      0.81      1409

Confusion Matrix:
[[940  96]
 [168 205]]

SVC Classifica

In [9]:
best_models = {}
for model_name, model_details in models.items():
    clf = GridSearchCV(model_details['model'], model_details['params'], cv=3, scoring='accuracy', n_jobs=-1)
    clf.fit(X_train, y_train)
    best_models[model_name] = clf.best_estimator_
    print(f"Best parameters for {model_name}: {clf.best_params_}\n")
    print(clf)
    print()

for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

Best parameters for LogisticRegression: {'C': 1, 'class_weight': None, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}

GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'class_weight': [None, 'balanced'],
                         'max_iter': [100, 200, 500, 1000],
                         'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']},
             scoring='accuracy')

Best parameters for RandomForestClassifier: {'class_weight': None, 'max_depth': 10, 'n_estimators': 50}

GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'class_weight': [None, 'balanced'],
                         'max_depth': [5, 10, 20, None],
                         'n_estimators': [50, 100, 200]},
             scoring='accuracy')

Best parameters for SVC: {'C': 0.1, 'class_weight': None, 'kernel': 'linear'}

GridSearchCV(cv=3, est

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=200)#, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate the model
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1036
           1       0.67      0.53      0.59       373

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.73      1409
weighted avg       0.80      0.81      0.80      1409

Confusion Matrix:
[[938  98]
 [175 198]]


In [12]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Define base models
estimators = [
    ('rf', best_models['RandomForestClassifier']),
    ('svc', best_models['SVC']),
    ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))
]

# Define the Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression()
)

# Train the Stacking Classifier
stacking_clf.fit(X_train, y_train)
y_pred_stacking = stacking_clf.predict(X_test)

# Evaluate the Stacking Classifier
print("Stacking Classifier Classification Report:")
print(classification_report(y_test, y_pred_stacking))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_stacking))


Stacking Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.87      1036
           1       0.67      0.54      0.60       373

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Confusion Matrix:
[[938  98]
 [172 201]]


In [13]:
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)#, random_state=42)
gb_classifier.fit(X_train, y_train)
y_pred_gb = gb_classifier.predict(X_test)
print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))


from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Define base models
log_clf = LogisticRegression(C=0.1767016940294795, max_iter=500, penalty='l1', solver='saga')
svc_clf = SVC(C=0.10388823104027935, kernel='linear', probability=True)
dt_clf = DecisionTreeClassifier(max_depth=5, min_samples_split=10)

# Combine them into a voting classifier
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('svc', svc_clf), ('dt', dt_clf)],
    voting='hard'  # 'hard' for majority voting, 'soft' for weighted average probabilities
)

# Train the VotingClassifier
voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)

# Evaluate the model
print("Voting Classifier Classification Report:")
print(classification_report(y_test, y_pred_voting))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_voting))


Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1036
           1       0.68      0.55      0.61       373

    accuracy                           0.81      1409
   macro avg       0.76      0.73      0.74      1409
weighted avg       0.80      0.81      0.81      1409

Confusion Matrix:
[[938  98]
 [166 207]]
Voting Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1036
           1       0.65      0.55      0.59       373

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.79      0.80      0.79      1409

Confusion Matrix:
[[924 112]
 [169 204]]


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define a neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")
print("Neural Network Classification Report:")
print(classification_report(y_test, y_pred_nn))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nn))


In [14]:
from sklearn.metrics import accuracy_score

# Define a function to evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    return accuracy

# Evaluate the best models found by GridSearchCV
for model_name, model in best_models.items():
    print(f"\n{model_name} Model Evaluation:")
    accuracy = evaluate_model(model, X_test, y_test)
    print(f"Final Accuracy for {model_name}: {accuracy:.4f}")

# Evaluate the Stacking Classifier
print("Stacking Classifier Model Evaluation:")
stacking_accuracy = evaluate_model(stacking_clf, X_test, y_test)
print(f"Final Accuracy for Stacking Classifier: {stacking_accuracy:.4f}")

# Evaluate the Neural Network model
print("Neural Network Model Evaluation:")
nn_accuracy = evaluate_model(model, X_test, y_test)
print(f"Final Accuracy for Neural Network: {nn_accuracy:.4f}")




LogisticRegression Model Evaluation:
Accuracy: 0.8013
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1036
           1       0.64      0.56      0.60       373

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.73      1409
weighted avg       0.79      0.80      0.80      1409

Confusion Matrix:
[[919 117]
 [163 210]]
Final Accuracy for LogisticRegression: 0.8013

RandomForestClassifier Model Evaluation:
Accuracy: 0.8126
              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1036
           1       0.69      0.53      0.60       373

    accuracy                           0.81      1409
   macro avg       0.77      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Confusion Matrix:
[[946  90]
 [174 199]]
Final Accuracy for RandomForestClassifier: 0.8126

SVC Model Evaluation:
Accuracy: 0.7928
              

In [21]:
from sklearn.metrics import accuracy_score

# Define a function to evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    return accuracy

# Evaluate the best models found by GridSearchCV
for model_name, model in best_models.items():
    print(f"\n{model_name} Model Evaluation:")
    accuracy = evaluate_model(model, X_test, y_test)
    print(f"Final Accuracy for {model_name}: {accuracy:.4f}")



LogisticRegression Model Evaluation:
Accuracy: 0.8013
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1036
           1       0.64      0.56      0.60       373

    accuracy                           0.80      1409
   macro avg       0.75      0.73      0.73      1409
weighted avg       0.79      0.80      0.80      1409

Confusion Matrix:
[[919 117]
 [163 210]]
Final Accuracy for LogisticRegression: 0.8013

RandomForestClassifier Model Evaluation:
Accuracy: 0.8126
              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1036
           1       0.69      0.53      0.60       373

    accuracy                           0.81      1409
   macro avg       0.77      0.72      0.74      1409
weighted avg       0.80      0.81      0.80      1409

Confusion Matrix:
[[946  90]
 [174 199]]
Final Accuracy for RandomForestClassifier: 0.8126

SVC Model Evaluation:
Accuracy: 0.7928
              