In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Telco-Customer-Churn.csv')

# Preview the first few rows
print(df.head())
print(df.info())

In [None]:
print(df.columns)

In [None]:
# Convert 'TotalCharges' to numeric (some are missing or blank)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill missing values
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


In [None]:
print(df.head())
print(df.info())

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd # Import pandas just in case it's needed for value_counts

# Reload the dataset to ensure 'Churn' column is present
df = pd.read_csv('Telco-Customer-Churn.csv')

# Check value counts of Churn at the start of the cell
print("Value counts of df['Churn'] at the start of the cell:")
print(df['Churn'].value_counts())

# Separate target variable
y = df['Churn']
X = df.drop('Churn', axis=1)

# Print value counts of y after separation
print("Value counts of y after separation:")
print(y.value_counts())

# Encode binary columns in X
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_cols:
    X[col] = X[col].map({'Yes': 1, 'No': 0})

# One-hot encode remaining categorical variables in X
X = pd.get_dummies(X, drop_first=True)

# Encode the target variable y
y = y.map({'Yes': 1, 'No': 0})

# Print value counts of y after mapping
print("\nValue counts of y after mapping:")
print(y.value_counts())

# Keep X and y separate for model training

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import json
import numpy as np

# Train-test split on the prepared X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

joblib.dump(model, "model.joblib")

# Evaluation
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

report_dict = classification_report(y_test, y_pred, output_dict=True)

# Save the classification report as a JSON file
with open("classification_report.json", "w") as f:
    json.dump(report_dict, f)

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm)
cm_df.to_csv('confusion_matrix.csv', index=False)

[[1432  107]
 [ 329  245]]
              precision    recall  f1-score   support

           0       0.81      0.93      0.87      1539
           1       0.70      0.43      0.53       574

    accuracy                           0.79      2113
   macro avg       0.75      0.68      0.70      2113
weighted avg       0.78      0.79      0.78      2113



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Feature importance
importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(10)

# Plot
plt.figure(figsize=(10,6))
sns.barplot(x=top_features.values, y=top_features.index)
plt.title("Top 10 Features Influencing Churn")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()
