### Import the Required Libraries - Pandas, Numpy, Matplotlib, Sklearn etc

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Loading the Churn Dataset

In [0]:
import warnings
# Install a global "ignore" filter so no warning is shown in any later cell.
# NOTE(review): this also hides warnings that may point at real problems
# (deprecations, convergence issues) — confirm that is intended.
warnings.filterwarnings('ignore')

In [0]:
# Load the churn training data into `df`; later cells read from this frame.
try:
    df = pd.read_csv('DMML_TRAINING_DATA_churn_data.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: FILE not found. Please ensure the file is in the same directory.")
    # Re-raise instead of calling exit(): the cell then fails with the original
    # traceback and "Run All" stops here, rather than the interpreter/kernel
    # session being shut down.
    raise

## Data Exploration and Visualization - Performing initial analysis

#### Display the first 5 rows of the dataframe

In [0]:
# Preview the first five records as a quick sanity check of the load.
preview_rows = df.head()
print("\nInitial Data Head:", preview_rows, sep="\n")

#### Data Information & schema

In [0]:
# Print a summary of the frame: its columns, non-null counts and dtypes.
df.info()

#### Descriptive Statistics: count, mean, std, and five-number summary for all numeric columns

In [0]:
# Summary statistics for the numeric columns, printed as plain text.
summary_stats = df.describe()
print(summary_stats)

#### Checking for Missing Values

In [0]:
# Number of missing entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

## Data cleaning

#### Checking for Duplicate CustomerIDs

In [0]:
print('Check for Duplicate CustomerID')

# True when at least one CustomerID value occurs more than once.
has_repeated_ids = df['CustomerID'].duplicated().any()

if not has_repeated_ids:
    print('No Duplicate User id found')
else:
    print("\nWarning: Duplicate user_ids found. Considering unique user_ids for analysis.")
    # Keeps the first row seen for each CustomerID; modifies df in place.
    df.drop_duplicates(subset=['CustomerID'], inplace=True)
    print(f"Removed duplicates. New shape: {df.shape}")

### Bar chart for the distribution of Target Variable (Churn)

In [0]:

# Bar chart of how many customers fall into each class of the target column.
# The bar order is pinned explicitly: without `order`, the bars follow the order
# in which the labels occur in the data, so the fixed tick labels set below
# could end up attached to the wrong bars.
churn_order = ['No', 'Yes']

plt.figure(figsize=(7, 5))
sns.countplot(x='Churn', data=df, order=churn_order, palette='viridis')
plt.title('Distribution of Target Variable (will_churn)')
plt.xlabel('Will Churn')
plt.ylabel('Number of Customers')
# Tick 0 is the 'No' bar and tick 1 is the 'Yes' bar, matching churn_order.
plt.xticks(ticks=[0, 1], labels=['No Churn (FALSE)', 'Churn (TRUE)'])
plt.show()

### Checking the Distribution of Target Variable (Churn) 

In [0]:
# Row count per target label.
churn_counts = df['Churn'].value_counts()
print(f"\nDistribution of 'will_churn':\n{churn_counts}")

total_rows = len(df)

# Fraction of all rows labelled 'Yes'.
churn_share = churn_counts['Yes'] / total_rows
print(f"Churn percentage: {churn_share * 100:.2f}%")

# Fraction of all rows labelled 'No'.
stay_share = churn_counts['No'] / total_rows
print(f"\nNo Churn percentage: {stay_share * 100:.2f}%")

# Treated as balanced only when the churn share lies within [0.4, 0.6].
if 0.4 <= churn_share <= 0.6:
    print("The dataset appears to be relatively balanced.")
else:
    print("The dataset appears to be imbalanced.")

### Histograms for numerical features

In [0]:

numerical_features = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']

# One row of panels, one histogram (with KDE overlay) per numeric column.
hist_fig, hist_axes = plt.subplots(
    1, len(numerical_features), figsize=(15, 5), squeeze=False
)

for panel, feature in zip(hist_axes[0], numerical_features):
    sns.histplot(df[feature], kde=True, bins=20, color='skyblue', ax=panel)
    panel.set_title(f'Distribution of {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')

hist_fig.tight_layout()
plt.show()


### Bar charts for categorical features 

In [0]:
# Every object-typed column except the identifier is plotted as categorical.
# Filtering (rather than list.remove) avoids a ValueError when CustomerID is
# not an object column.
categorical_cols = [
    name for name in df.select_dtypes(include=['object']).columns
    if name != 'CustomerID'
]

# Size the grid from the number of columns found, so that more than four
# categorical columns no longer overflow a fixed 2x2 layout.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(categorical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(14, 5 * n_grid_rows), squeeze=False
)
axes = axes.flatten()

# Plot each categorical variable
for ax, col in zip(axes, categorical_cols):
    sns.countplot(x=col, data=df, palette='pastel', ax=ax)
    ax.set_title(f'Distribution of {col}', fontsize=14)
    ax.set_xlabel(col)
    ax.set_ylabel('Number of Customers')
    ax.tick_params(axis='x', rotation=30)  # Rotate x-axis labels

# Hide grid cells that have no column to show.
for unused_ax in axes[len(categorical_cols):]:
    unused_ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()

### Correlation heatmap for numerical features

In [0]:
# Numeric frame used only for the correlation heatmap; astype returns a new
# object, so df itself is left untouched.
# The columns are cast to float rather than int: an int cast truncates the
# fractional part of the charge columns (which shifts the correlations) and
# raises if a column contains missing values.
df_corr = df[numerical_features].astype(
    {'Tenure': float, 'MonthlyCharges': float, 'TotalCharges': float}
)

In [0]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df_corr.corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()

## Data Preprocessing

### In Preprocessing step we will do following operations:

* Define features (X) and target (y)
* Handle Categorical Data: Convert categorical features into numerical format using One-Hot Encoding.
* Feature Scaling: Apply StandardScaler to the numerical features.
* Divide data into a training set and a testing set.

#### Define features (X) and target (y)

In [0]:
print("Droping user_id as it iss an identifier and not a predictive feature")

# Target variable: the Churn label of every row.
y = df['Churn']

# Features: every column except the identifier and the target.
X = df.drop(columns=['CustomerID', 'Churn'])

#### Convert categorical features into numerical format using One-Hot Encoding

In [0]:
from sklearn.preprocessing import OneHotEncoder
import joblib




In [0]:
from sklearn.preprocessing import OneHotEncoder
import joblib

# Categorical columns that are expanded into indicator columns.
categorical_cols = ['Gender', 'ContractType', 'PaymentMethod']

# A single encoder both produces the training features and is saved to disk,
# so the persisted encoder yields exactly the column layout the models are
# trained on. Previously the saved encoder kept every category while the
# training features were built separately with the first category dropped.
# drop='first' removes one level per column to avoid multicollinearity;
# sparse_output requires scikit-learn 1.2+.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
encoded_values = encoder.fit_transform(X[categorical_cols])

# Save the fitted encoder so the same transformation can be reapplied later.
joblib.dump(encoder, 'encoder.pkl')

# Rebuild X: the untouched columns first, then the indicator columns.
# index=X.index keeps the rows aligned even when the index has gaps
# (for example after duplicate rows were dropped).
encoded_df = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X.index,
)
X = pd.concat([X.drop(columns=categorical_cols), encoded_df], axis=1)

print("\nFeatures after One-Hot Encoding and Boolean Conversion:")
print(X.head())
print(f"Shape after encoding: {X.shape}")


In [0]:
# Show the encoded feature matrix as the cell output.
X

#### Scaling of numerical features

Explanation for Feature Scaling:

Feature scaling is crucial because many machine learning algorithms (like Logistic Regression, SVMs, Neural Networks, and even distance-based algorithms like K-Nearest Neighbors) are sensitive to the magnitude and range of input features.
If features have vastly different scales, the feature with a larger range might dominate the cost function or distance calculations, leading to suboptimal model performance.
StandardScaler transforms the data to have a mean of 0 and a standard deviation of 1. This ensures that all features contribute equally to the model, preventing features with larger numerical values from disproportionately influencing the model's learning process.
For tree-based models like Random Forest, scaling is less critical but can sometimes still offer minor benefits or consistency in pipelines.


In [0]:
# Only the continuous columns are standardized; the one-hot indicator columns
# are left as they are.
features_to_scale = ['Age', 'Tenure', 'MonthlyCharges', 'TotalCharges']

# NOTE(review): the scaler is fitted on the full feature matrix in this cell,
# before any train/test split, and it is not written to disk here — confirm
# whether it should be fitted on the training rows only and saved with the model.
scaler = StandardScaler()
scaled_values = scaler.fit(X[features_to_scale]).transform(X[features_to_scale])
X[features_to_scale] = scaled_values

print("\nFeatures after Scaling (first 5 rows of scaled columns):")
print(X[features_to_scale].head())

### Data Split: into a training set and a testing set.

In [0]:
# Hold out 25% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

print(f"\nTraining set shape (X_train, y_train): {X_train.shape}, {y_train.shape}")
print(f"Testing set shape (X_test, y_test): {X_test.shape}, {y_test.shape}")

## Model Building and Training

### Model 1 (Baseline): Training a Logistic Regression model.

In [0]:
print("\nTraining Logistic Regression Model...")

# Baseline classifier: liblinear solver with a fixed seed; fit() returns the
# estimator itself, so construction and training are chained.
logistic_model = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)

print("Logistic Regression Model Trained.")

### Model 2 (Advanced): Training a Random Forest Classifier.

In [0]:
print("\nTraining Random Forest Classifier Model...")

# Ensemble of 100 trees with a fixed seed; fit() returns the estimator itself.
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

print("Random Forest Classifier Model Trained.")

## Models Evaluation - LR & RFC

###  Making predictions on test data using both trained models.

In [0]:
# prediction by logistic regression model
y_pred_logistic = logistic_model.predict(X_test)

# prediction by random forest model 
y_pred_rf = random_forest_model.predict(X_test)

### Calculating Accuracy, Precision, Recall, and F1-Score for each model.

### Logistic Regression Model Evaluation

In [0]:
print("\n--- Logistic Regression Model Evaluation ---")

accuracy_lr = accuracy_score(y_test, y_pred_logistic)
# Precision, recall and F1 are all computed with "Yes" as the positive class.
precision_lr, recall_lr, f1_lr = (
    score_fn(y_test, y_pred_logistic, pos_label="Yes")
    for score_fn in (precision_score, recall_score, f1_score)
)

lr_report_lines = [
    f"Accuracy: {accuracy_lr:.4f}",
    f"Precision: {precision_lr:.4f}",
    f"Recall: {recall_lr:.4f}",
    f"F1-Score: {f1_lr:.4f}",
]
print("\n".join(lr_report_lines))

print("\nConfusion Matrix (Logistic Regression):")
lr_confusion = confusion_matrix(y_test, y_pred_logistic)
print(lr_confusion)

### Random Forest Classifier Model Evaluation

In [0]:
print("\n--- Random Forest Classifier Model Evaluation ---")

# "Yes" is the positive class for precision, recall and F1.
rf_score_kwargs = {"pos_label": "Yes"}

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, **rf_score_kwargs)
recall_rf = recall_score(y_test, y_pred_rf, **rf_score_kwargs)
f1_rf = f1_score(y_test, y_pred_rf, **rf_score_kwargs)

print(
    f"Accuracy: {accuracy_rf:.4f}",
    f"Precision: {precision_rf:.4f}",
    f"Recall: {recall_rf:.4f}",
    f"F1-Score: {f1_rf:.4f}",
    "\nConfusion Matrix (Random Forest):",
    sep="\n",
)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print(rf_confusion)

#### Identifying which metric is most important in the context of the given problem
* Is it more costly for Streamify to mistakenly predict a customer will stay (false negative)
* or to mistakenly predict a customer will leave (false positive)?

In the context of Streamify's customer churn, Below is the cost of different types of errors:
  1. False Negative (FN): The model predicts a customer will NOT churn, but they actually DO churn. **Cost**: Streamify loses a customer without intervention. This means lost revenue, potential negative word-of-mouth, and a missed opportunity to retain them (e.g., through targeted promotions or support). This is generally considered a high cost.
  2. False Positive (FP): The model predicts a customer WILL churn, but they actually DO NOT churn. **Cost**: Streamify invests resources (e.g., discounts, special offers, personalized outreach) in a customer who would have stayed anyway. This is a wasted resource but does not directly lead to customer loss.

Given these considerations:

* **False Negatives (missing actual churners) are generally more costly** for Streamify.
* **Losing a customer directly impacts revenue and growth**. Streamify wants to identify as many potential churners as possible so that it can intervene.
* **Recall** is the metric that measures the proportion of actual positive cases (actual churners) that were correctly identified by the model (TP / (TP + FN)).

**A high Recall means the model is good at catching most of the customers who are going to churn**. Therefore, **Recall is the most important metric for Streamify in this scenario**.
While Precision is also important (to avoid wasting resources on FPs), maximizing Recall ensures Streamify doesn't miss out on opportunities to retain valuable customers.

The F1-Score provides a balance between Precision and Recall, which can also be a good overall indicator, especially if there's a slight imbalance.

In [0]:
# Show the two recall scores one below the other for comparison.
print(
    "\n--- Model Selection Based on Recall ---",
    f"Logistic Regression Recall: {recall_lr:.4f}",
    f"Random Forest Recall: {recall_rf:.4f}",
    sep="\n",
)

#### Based on Recall, the **Logistic Regression Model** is the better model as it has a higher recall score.

In [0]:

# Choose the winner from the measured recall scores instead of hard-coding it,
# so the printed conclusion cannot contradict the numbers reported earlier.
candidate_models = {
    "Logistic Regression": (
        logistic_model,
        {
            "Accuracy": accuracy_lr,
            "Precision": precision_lr,
            "Recall": recall_lr,
            "F1-Score": f1_lr,
        },
    ),
    "Random Forest": (
        random_forest_model,
        {
            "Accuracy": accuracy_rf,
            "Precision": precision_rf,
            "Recall": recall_rf,
            "F1-Score": f1_rf,
        },
    ),
}

# max() returns the first maximal entry, so Logistic Regression (listed first)
# is kept when both recall scores are equal.
best_model_name = max(
    candidate_models, key=lambda name: candidate_models[name][1]["Recall"]
)
best_model, best_model_metrics = candidate_models[best_model_name]

if recall_lr == recall_rf:
    selection_reason = "both models have the same recall score and it is the simpler model"
else:
    selection_reason = "it has a higher recall score"

print(f"\nBased on Recall, the **{best_model_name} Model** is the better model as {selection_reason}.")

print(f"\nSelected Best Model: {best_model_name}")
print(best_model_metrics)

## Final Report and Recommendations

**Feature importance**: Feature importances are typically extracted from tree-based models (like Random Forest). Since we have chosen Logistic Regression as the best model, we cannot directly plot feature importances in the same way.
For Logistic Regression, coefficients can indicate feature importance, but they are interpreted differently (impact on log-odds).

### Conclusion: 

This assignment aimed to predict customer churn for Streamify using a machine learning approach. We performed comprehensive data exploration, preprocessing, model building, and evaluation.

Based on our analysis, **Logistic Regression** is chosen as the best model.

The primary reason for this choice was its superior performance in **Recall**, which is identified as the most critical metric for Streamify. Maximizing Recall helps Streamify identify as many potential churners as possible, allowing for timely intervention and customer retention efforts, thereby minimizing revenue loss from unaddressed churn.

Performance of the Logistic Regression on the test set:

In [0]:
# Show the metrics dictionary of the selected model as the cell output.
best_model_metrics

## Saving the model as a pickle file

In [0]:
import joblib

# Write the fitted logistic regression model to disk with joblib.
# NOTE(review): the file name says "linear_regression" although the object is a
# LogisticRegression classifier. It is left unchanged because other code may
# load this exact path — confirm before renaming.
joblib.dump(logistic_model, "linear_regression_churn_model.pkl")

In [0]:
# Show the training feature matrix as the cell output.
X_train