In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd

file_path = '/content/drive/MyDrive/Customer Churn Prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
df.isnull().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

label_encoder = LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    if column != 'customerID':
        df[column] = label_encoder.fit_transform(df[column])

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['tenure', 'MonthlyCharges', 'TotalCharges']])
df[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaled_features

df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,-1.277445,0,1,0,0,...,0,0,0,0,0,1,2,-1.160323,-0.398608,0
1,5575-GNVDE,1,0,0,0,0.066327,1,0,0,2,...,2,0,0,0,1,0,3,-0.259629,-0.948762,0
2,3668-QPYBK,1,0,0,0,-1.236724,1,0,0,2,...,0,0,0,0,0,1,3,-0.36266,-1.641883,1
3,7795-CFOCW,1,0,0,0,0.514251,0,1,0,2,...,2,2,0,0,1,0,0,-0.746535,-0.98371,0
4,9237-HQITU,0,0,0,0,-1.236724,1,0,1,0,...,0,0,0,0,0,1,2,0.197365,-1.235224,1


In [None]:
# X-features, y-label(Churn)
X = df.drop(['Churn', 'customerID'], axis=1)
y = df['Churn']


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

#Training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

#Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 79.99%


In [None]:
import pickle

model_path = '/content/drive/MyDrive/Customer Churn Prediction/churn_model.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

print("Model saved successfully to Google Drive.")


Model saved successfully to Google Drive.


In [None]:
from sklearn.model_selection import GridSearchCV

#parameters grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}


rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best Parameters found: ", grid_search.best_params_)
print("Best Cross-validation Score: ", grid_search.best_score_)

best_rf_model = grid_search.best_estimator_

# Evaluate the new model
y_pred_best = best_rf_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)

print(f"Improved Model Accuracy after Hyperparameter Tuning: {accuracy_best * 100:.2f}%")


Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters found:  {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-validation Score:  0.8045793397231096
Improved Model Accuracy after Hyperparameter Tuning: 81.26%


In [None]:
import pickle

model_path = '/content/drive/MyDrive/Customer Churn Prediction/random_forest_churn_model_v2.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

print("Improved Random Forest model saved successfully to Google Drive.")


Improved Random Forest model saved successfully to Google Drive.


In [12]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, plot_confusion_matrix
import matplotlib.pyplot as plt

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred_best)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_pred_best)
print("\nClassification Report:")
print(class_report)

# ROC-AUC Score
roc_auc = roc_auc_score(y_test, y_pred_best)
print("\nROC-AUC Score:", roc_auc)

# Plot Confusion Matrix
plot_confusion_matrix(best_rf_model, X_test, y_test)
plt.title('Confusion Matrix')
plt.savefig('/content/drive/MyDrive/Customer Churn Prediction/confusion_matrix.png')  # Save plot
plt.show()


ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (/usr/local/lib/python3.11/dist-packages/sklearn/metrics/__init__.py)