**1. Importing the dependencies**

**2. Data Loading and Understanding**

In [None]:
# load the CSV data into a pandas DataFrame
# NOTE(review): the absolute "/content/..." path looks environment-specific — confirm it exists where this runs
df = pd.read_csv("/content/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [None]:
df.shape

In [None]:
df.head()

In [None]:
pd.set_option("display.max_columns", None)

In [None]:
df.head(2)

In [None]:
df.info()

In [None]:
# dropping customerID column as this is not required for modelling
df = df.drop(columns=["customerID"])

In [None]:
df.head(2)

In [None]:
df.columns

In [None]:
print(df["gender"].unique())

In [None]:
print(df["SeniorCitizen"].unique())

In [None]:
# Print the distinct values of every column except the three listed in
# numerical_features_list, with a dashed separator after each column.
numerical_features_list = ["tenure", "MonthlyCharges", "TotalCharges"]
separator = "-" * 50

for col in df.columns:
  if col in numerical_features_list:
    continue
  print(col, df[col].unique())
  print(separator)

In [None]:
print(df.isnull().sum())

In [None]:
#df["TotalCharges"] = df["TotalCharges"].astype(float)

In [None]:
df[df["TotalCharges"]==" "]

In [None]:
len(df[df["TotalCharges"]==" "])

In [None]:
# a single-space string cannot be parsed as a float, so swap it for the string "0.0"
df["TotalCharges"] = df["TotalCharges"].replace(" ", "0.0")

In [None]:
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [None]:
df.info()

In [None]:
# checking the class distribution of target column
print(df["Churn"].value_counts())

**Insights:**
1. Customer ID removed as it is not required for modelling
2. No null (NaN) values were reported by `isnull()`
3. Blank (single-space) entries in the TotalCharges column were replaced with 0
4. Class imbalance identified in the target

**3. Exploratory Data Analysis (EDA)**

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.head(2)

In [None]:
df.describe()

**Numerical Features - Analysis**

Understand the distribution of the numerical features

In [None]:
def plot_histogram(df, column_name):
  """Plot a histogram with a KDE curve for one column, marking its mean and median.

  Args:
    df: DataFrame that contains the column.
    column_name: Name of the column to plot.
  """
  values = df[column_name]

  plt.figure(figsize=(5, 3))
  sns.histplot(values, kde=True)
  plt.title(f"Distribution of {column_name}")

  # one vertical reference line per statistic: dashed red mean, solid green median
  reference_lines = (
      (values.mean(), "red", "--", "Mean"),
      (values.median(), "green", "-", "Median"),
  )
  for position, colour, style, label in reference_lines:
    plt.axvline(position, color=colour, linestyle=style, label=label)

  plt.legend()
  plt.show()

In [None]:
plot_histogram(df, "tenure")

In [None]:
plot_histogram(df, "MonthlyCharges")

In [None]:
plot_histogram(df, "TotalCharges")

**Box plot for numerical features**

In [None]:
def plot_boxplot(df, column_name):
  """Draw a vertical box plot for one column.

  Args:
    df: DataFrame that contains the column.
    column_name: Name of the column to plot; also used as the y-axis label.
  """
  plt.figure(figsize=(5, 3))
  sns.boxplot(y=df[column_name])
  plt.title(f"Box Plot of {column_name}")
  plt.ylabel(column_name)
  # show() must be called; a bare `plt.show` only references the function
  plt.show()

In [None]:
plot_boxplot(df, "tenure")

In [None]:
plot_boxplot(df, "MonthlyCharges")

In [None]:
plot_boxplot(df, "TotalCharges")

**Correlation Heatmap for numerical columns**

In [None]:
# Annotated heatmap of the pairwise correlations between the three numerical columns
numeric_corr = df[["tenure", "MonthlyCharges", "TotalCharges"]].corr()

fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

Categorical features - Analysis

In [None]:
df.columns

In [None]:
df.info()

Countplot for categorical columns

In [None]:
# SeniorCitizen is listed by hand in front of the object-dtype columns
object_cols = ["SeniorCitizen", *df.select_dtypes(include="object").columns.to_list()]

# one small count plot per categorical column
for col in object_cols:
  fig, ax = plt.subplots(figsize=(5, 3))
  sns.countplot(x=df[col], ax=ax)
  ax.set_title(f"Count Plot of {col}")
  plt.show()

**4. Data Preprocessing**

In [None]:
df.head(3)

Label encoding of target column

In [None]:
# Encode the target as integers (Yes -> 1, No -> 0).
# The explicit int cast keeps the column numeric without relying on replace()
# implicitly downcasting an object column (deprecated since pandas 2.2), and the
# cell stays safe to re-run because already-encoded 0/1 values pass through unchanged.
df["Churn"] = df["Churn"].replace({"Yes": 1, "No": 0}).astype(int)

In [None]:
df.head(3)

In [None]:
print(df["Churn"].value_counts())

Label encoding of categorical features

In [None]:
# identifying columns with object data type
object_columns = df.select_dtypes(include="object").columns

In [None]:
print(object_columns)

In [None]:
# Label-encode each column in object_columns in place and keep every fitted
# encoder in a dictionary keyed by column name.
encoders = {}

for column in object_columns:
  label_encoder = LabelEncoder().fit(df[column])
  df[column] = label_encoder.transform(df[column])
  encoders[column] = label_encoder

# write the dictionary of fitted encoders to disk
with open("encoders.pkl", "wb") as f:
  pickle.dump(encoders, f)


In [None]:
encoders

In [None]:
df.head()

**Training and test data split**

In [None]:
# splitting the features and target
X = df.drop(columns=["Churn"])
y = df["Churn"]

In [None]:
# split training and test data (80/20)
# stratify on the target so both splits keep the same churn / no-churn ratio,
# which matters because the classes are imbalanced
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
print(y_train.shape)

In [None]:
print(y_train.value_counts())

Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
smote = SMOTE(random_state=42)

In [None]:
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
print(y_train_smote.shape)

In [None]:
print(y_train_smote.value_counts())

**5. Model Training**

Training with default hyperparameters

In [None]:
# dictionary of models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

In [None]:
# Per-model 5-fold cross-validation accuracies, keyed by model name.
# NOTE(review): the folds are drawn from the SMOTE-resampled training set, so
# validation folds can contain synthetic samples and the scores may be
# optimistic — consider resampling inside each fold instead.
cv_scores = {}

for model_name, model in models.items():
  print(f"Training {model_name} with default parameters")
  scores = cross_val_score(
      model, X_train_smote, y_train_smote, cv=5, scoring="accuracy"
  )
  cv_scores[model_name] = scores
  mean_accuracy = scores.mean()
  print(f"{model_name} cross-validation accuracy: {mean_accuracy:.2f}")
  print("-"*70)

In [None]:
cv_scores

Random Forest gives the highest accuracy compared to other models with default parameters

In [None]:
rfc = RandomForestClassifier(random_state=42)

In [None]:
rfc.fit(X_train_smote, y_train_smote)

In [None]:
print(y_test.value_counts())

**6. Model Evaluation**

In [None]:
# evaluate the fitted random forest on the held-out test data
y_test_pred = rfc.predict(X_test)

print("Accuracy Score:\n", accuracy_score(y_test, y_test_pred))
# label spelling corrected ("Confsuion" -> "Confusion")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))

In [None]:
# Bundle the trained model with the ordered feature column names and pickle both together
model_data = dict(model=rfc, features_names=X.columns.tolist())

with open("customer_churn_model.pkl", "wb") as f:
  pickle.dump(model_data, f)

**7. Load the saved model and build a Predictive System**

In [None]:
# load the saved model and the feature names
# (pickle can execute arbitrary code on load — only open files you trust)
with open("customer_churn_model.pkl", "rb") as f:
  model_data = pickle.load(f)

loaded_model, feature_names = model_data["model"], model_data["features_names"]

In [None]:
print(loaded_model)

In [None]:
print(feature_names)

In [None]:
encoders

**To do:**
1. Implement Hyperparameter Tuning
2. Try Model Selection
3. Try downsampling
4. Try to address the overfitting
5. Try Stratified k-fold CV

In [37]:

# import pandas as pd

# try:
#     # Load the dataset
#     df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
#     print("Dataset loaded successfully!")
# except FileNotFoundError:
#     print("Dataset not found. Please check the file path.")
#     raise

# # Check the unique values in the 'Churn' column to ensure they are 'Yes' or 'No'
# print("Unique values in the 'Churn' column:", df['Churn'].unique())

# # Clean the 'Churn' column
# df['Churn'] = df['Churn'].str.strip()

# # Convert 'TotalCharges' to numeric
# df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# # Fill missing values in 'TotalCharges'
# df['TotalCharges'] = df['TotalCharges'].fillna(0)

# # Filter for churned customers
# churned_customers = df[df['Churn'] == 'Yes']
# print("\nFirst 10 rows of churned customers:")
# print(churned_customers.head(10))

# # Splitting the data
# from sklearn.model_selection import train_test_split

# # Select features and target
# X = df.drop(columns=['customerID', 'Churn'])  # Drop irrelevant and target columns
# y = df['Churn'].map({'No': 0, 'Yes': 1})      # Encode 'Churn' as binary values

# # One-hot encode categorical variables
# X = pd.get_dummies(X, drop_first=True)
# print("\nFeatures after one-hot encoding:")
# print(X.head())

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# print("\nData split into training and testing sets.")

# # Model Selection and Hyperparameter Tuning
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV

# # Define parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20, 30],
#     'min_samples_split': [2, 5, 10]
# }

# # Grid search
# grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)
# print("\nStarting Grid Search...")
# grid_search.fit(X_train, y_train)
# print("Grid Search Completed!")

# # Best parameters
# print("Best hyperparameters:", grid_search.best_params_)

# # Evaluate the model
# from sklearn.metrics import classification_report

# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)

# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))


















# data for all customer

# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import classification_report, confusion_matrix
# from imblearn.over_sampling import RandomOverSampler
# import matplotlib.pyplot as plt

# # Load dataset
# print("Loading dataset...")
# df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')  # Replace with your dataset path
# print("Dataset loaded successfully!")

# # Check the first few rows of the dataset
# print("\nFirst few rows of the dataset:")
# print(df.head())

# # Check for missing values
# print("\nMissing values in each column:")
# print(df.isnull().sum())

# # Preprocess 'TotalCharges' and handle missing values
# df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# df['TotalCharges'] = df['TotalCharges'].fillna(0)  # Fill missing values with 0

# # Check if there are any missing values now
# print("\nMissing values after preprocessing 'TotalCharges':")
# print(df.isnull().sum())

# # Encode categorical features using one-hot encoding
# df_encoded = pd.get_dummies(df, drop_first=True)
# print("\nFeatures after one-hot encoding:")
# print(df_encoded.head())

# # Create new feature: average monthly spend
# df_encoded['avg_monthly_spend'] = df_encoded['TotalCharges'] / df_encoded['tenure']
# print("\nAdded 'avg_monthly_spend' feature:")
# print(df_encoded[['TotalCharges', 'tenure', 'avg_monthly_spend']].head())

# # Split data into features and target
# print("Splitting data into features and target...")
# X = df_encoded.drop(columns=['Churn_Yes'])
# y = df_encoded['Churn_Yes']

# # Feature scaling for numeric columns
# scaler = StandardScaler()
# X[['tenure', 'MonthlyCharges', 'TotalCharges', 'avg_monthly_spend']] = scaler.fit_transform(X[['tenure', 'MonthlyCharges', 'TotalCharges', 'avg_monthly_spend']])

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# print("\nData split into training and testing sets.")

# # Oversampling the minority class to balance the dataset (RandomOverSampler adds minority rows; it does not downsample the majority)
# print("\nDownsampling majority class...")
# ros = RandomOverSampler(sampling_strategy=0.5, random_state=42)
# X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# # Initialize classifiers for model selection
# models = {
#     'RandomForest': RandomForestClassifier(class_weight='balanced', random_state=42),
#     'GradientBoosting': GradientBoostingClassifier(random_state=42)
# }

# # Define hyperparameters grid for RandomizedSearchCV
# param_grid_rf = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }

# param_grid_gb = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 5, 7],
# }

# # Start Hyperparameter Tuning with Stratified K-Fold Cross Validation and RandomizedSearchCV
# best_models = {}
# for model_name, model in models.items():
#     print(f"\nStarting {model_name} model hyperparameter tuning...")

#     # Setup RandomizedSearchCV for each model
#     if model_name == 'RandomForest':
#         random_search = RandomizedSearchCV(
#             estimator=model,
#             param_distributions=param_grid_rf,
#             n_iter=5,
#             cv=3,
#             verbose=1,
#             random_state=42,
#             n_jobs=-1
#         )
#     else:
#         random_search = RandomizedSearchCV(
#             estimator=model,
#             param_distributions=param_grid_gb,
#             n_iter=5,
#             cv=3,
#             verbose=1,
#             random_state=42,
#             n_jobs=-1
#         )

#     # Fit the RandomizedSearchCV model
#     random_search.fit(X_resampled, y_resampled)

#     # Store the best model from RandomizedSearchCV
#     best_models[model_name] = random_search.best_estimator_
#     print(f"Best hyperparameters for {model_name}: {random_search.best_params_}")

# # Evaluate the best model from each type
# for model_name, model in best_models.items():
#     print(f"\nEvaluating {model_name} on the test set...")

#     # Predict on the test set
#     y_pred = model.predict(X_test)

#     # Evaluate the model with a classification report
#     print(f"\n{model_name} Classification Report:")
#     print(classification_report(y_test, y_pred))

#     # Confusion matrix
#     cm = confusion_matrix(y_test, y_pred)
#     print(f"\nConfusion Matrix for {model_name}:")
#     print(cm)

#     # Plotting the confusion matrix
#     plt.figure(figsize=(5, 5))
#     plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
#     plt.title(f'Confusion Matrix - {model_name}')
#     plt.colorbar()
#     tick_marks = np.arange(2)
#     plt.xticks(tick_marks, ['Not Churn', 'Churn'], rotation=45)
#     plt.yticks(tick_marks, ['Not Churn', 'Churn'])
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')
#     plt.tight_layout()
#     plt.show()

# # Implementing Stratified K-Fold Cross Validation (for final model evaluation)
# print("\nStarting Stratified K-Fold Cross Validation...")
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for model_name, model in best_models.items():
#     print(f"\nEvaluating {model_name} using Stratified K-Fold Cross Validation...")
    
#     # Cross-validation
#     cv_scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='accuracy')
#     print(f"\nCross-validated scores for {model_name}: {cv_scores}")
#     print(f"Mean accuracy: {cv_scores.mean()}")





# Churned customers only (Churn == 'Yes')
import pandas as pd

# Re-read the CSV from the working directory.
# NOTE(review): this rebinds `df` to the raw data, replacing the encoded frame
# built by the earlier cells — confirm that is intended.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Boolean mask that is True for rows whose Churn value is 'Yes'
is_churned = df['Churn'].eq('Yes')
churned_customers = df.loc[is_churned]

# Print the filtered rows
print(churned_customers)

      customerID  gender  SeniorCitizen Partner Dependents  tenure  \
2     3668-QPYBK    Male              0      No         No       2   
4     9237-HQITU  Female              0      No         No       2   
5     9305-CDSKC  Female              0      No         No       8   
8     7892-POOKP  Female              0     Yes         No      28   
13    0280-XJGEX    Male              0      No         No      49   
...          ...     ...            ...     ...        ...     ...   
7021  1699-HPSBG    Male              0      No         No      12   
7026  8775-CEBBJ  Female              0      No         No       9   
7032  6894-LFHLY    Male              1      No         No       1   
7034  0639-TSIQW  Female              0      No         No      67   
7041  8361-LTMKD    Male              1     Yes         No       4   

     PhoneService MultipleLines InternetService OnlineSecurity  ...  \
2             Yes            No             DSL            Yes  ...   
4             Yes