<a href="https://colab.research.google.com/github/samsomsabu/Advanced-Data-Analytics-works/blob/main/PA_Lab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required Libraries
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Load Dataset
# The CSV location is kept in one named constant so it can be changed in a
# single place when the notebook runs somewhere other than under /content.
DATA_PATH = '/content/customer_churn_dataset-training-master.csv'
data = pd.read_csv(DATA_PATH)


In [None]:
# Drop 'CustomerID' as it's irrelevant to the prediction, and drop rows whose
# target label is missing (they cannot be used for supervised training).
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop() would raise KeyError.
data = data.drop(columns='CustomerID', errors='ignore')
data = data.dropna(subset=['Churn'])

# Split data into features and target
X = data.drop(columns='Churn')
y = data['Churn']

# Column groups are derived from X (features only), so the target is never
# part of either group. Missing feature values are NOT filled here; that is
# left to the imputers inside the modelling pipeline.
# include='number' matches every numeric dtype. Selecting only 'float64' would
# leave integer-typed columns out of both groups, and ColumnTransformer drops
# columns it was not told about by default.
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Split the data into training (70%) and testing (30%) sets; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Per-type preprocessing sub-pipelines.

# Numeric features: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' stops the encoder from raising on categories
# it did not see while fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)



In [None]:
# Route each column group to its own sub-pipeline:
#   'num' -> numeric_transformer applied to numeric_cols
#   'cat' -> categorical_transformer applied to categorical_cols
column_routes = [
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)



In [None]:
# Each pipeline receives its own unfitted copy of the preprocessing recipe.
# Passing the same `preprocessor` object to both would make the two pipelines
# share one transformer instance, so fitting the second pipeline would refit
# (and overwrite) the preprocessing state held by the first.

# Define Logistic Regression pipeline
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(random_state=42))])

# Define Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42))])

# Fit both models on the training split only
logreg_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)



In [None]:
# Predictions: class labels for the held-out test features, one array per
# fitted pipeline (Logistic Regression first, then Random Forest).
y_pred_logreg, y_pred_rf = (
    fitted_pipeline.predict(X_test)
    for fitted_pipeline in (logreg_pipeline, rf_pipeline)
)



In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Model Evaluation
def evaluate_model(y_true, y_pred, y_score=None):
    """Print accuracy, precision, recall, F1 and ROC-AUC for a binary classifier.

    Parameters
    ----------
    y_true : array-like of numbers
        Ground-truth labels. Entries that are NaN are removed, together with
        the predictions at the same positions, before anything is scored.
    y_pred : array-like of numbers
        Predicted labels, same length as ``y_true``.
    y_score : array-like of numbers, optional
        One positive-class score per sample (for example a predicted
        probability). When given it is used for ROC-AUC; when omitted ROC-AUC
        is computed from ``y_pred``, which only reflects a single operating
        point of the classifier.

    Returns
    -------
    None
        The metrics are printed, one per line, with four decimals.

    Raises
    ------
    ValueError
        If the inputs differ in length or no labelled sample remains.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_score is not None:
        y_score = np.asarray(y_score, dtype=float).ravel()

    if len(y_true) != len(y_pred) or (y_score is not None and len(y_score) != len(y_true)):
        raise ValueError("y_true, y_pred and y_score must all have the same length.")

    # Missing labels are handled FIRST. If thresholding ran first, NaN > 0.5
    # would evaluate to False and every unlabelled row would silently be
    # scored as class 0.
    labelled = ~np.isnan(y_true)
    if not labelled.all():
        print("Warning: y_true contains NaN values. Removing NaN values before evaluation.")
        y_true, y_pred = y_true[labelled], y_pred[labelled]
        if y_score is not None:
            y_score = y_score[labelled]

    if y_true.size == 0:
        raise ValueError("evaluate_model needs at least one labelled sample.")

    def _to_binary(values, name):
        # Values already in {0, 1} pass through; anything else is cut at 0.5.
        if np.isin(values, [0, 1]).all():
            return values.astype(int)
        print(f"Warning: {name} contains continuous values. Converting to binary (0/1) using a threshold.")
        return (values > 0.5).astype(int)  # Adjust threshold as needed

    y_true = _to_binary(y_true, "y_true")
    y_pred = _to_binary(y_pred, "y_pred")

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Prefer continuous scores for ROC-AUC; fall back to hard labels so that
    # existing two-argument calls keep producing the same number as before.
    auc = roc_auc_score(y_true, y_pred if y_score is None else y_score)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC: {auc:.4f}")

In [None]:
# Logistic Regression: heading, then the test-set metric printout.
print("Logistic Regression Performance:")
evaluate_model(y_true=y_test, y_pred=y_pred_logreg)



Logistic Regression Performance:
Accuracy: 0.8958
Precision: 0.9268
Recall: 0.8865
F1 Score: 0.9062
ROC-AUC: 0.8973


In [None]:
# Random Forest: heading, then the test-set metric printout.
print("Random Forest Performance:")
evaluate_model(y_true=y_test, y_pred=y_pred_rf)


Random Forest Performance:
Accuracy: 0.9988
Precision: 1.0000
Recall: 0.9979
F1 Score: 0.9989
ROC-AUC: 0.9989


In [None]:
# One table of metric name -> scoring function, applied to each model's
# predictions, replaces two hand-copied dictionaries that had to be kept in
# sync. Key order matches the order the metrics are reported in.
# NOTE(review): ROC-AUC is computed from hard class labels here, so it reflects
# a single operating point; score it from predicted probabilities if a
# ranking-based AUC is required.
METRIC_FUNCS = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
    'ROC-AUC': roc_auc_score,
}

logreg_metrics = {name: score(y_test, y_pred_logreg) for name, score in METRIC_FUNCS.items()}
rf_metrics = {name: score(y_test, y_pred_rf) for name, score in METRIC_FUNCS.items()}

# Print the metrics for easier comparison
print("Logistic Regression Metrics:", logreg_metrics)
print("Random Forest Metrics:", rf_metrics)

# Compare the models based on F1 Score. A tie is reported as a tie rather than
# being credited to Logistic Regression.
rf_f1 = rf_metrics['F1 Score']
logreg_f1 = logreg_metrics['F1 Score']
if rf_f1 > logreg_f1:
    print("\nRandom Forest performs better than Logistic Regression based on F1 Score, which is better for imbalanced datasets.")
elif rf_f1 < logreg_f1:
    print("\nLogistic Regression performs better than Random Forest based on F1 Score.")
else:
    print("\nRandom Forest and Logistic Regression perform equally well based on F1 Score.")

Logistic Regression Metrics: {'Accuracy': 0.8957958412098299, 'Precision': 0.9267824077296838, 'Recall': 0.8865095219070449, 'F1 Score': 0.9061987380629879, 'ROC-AUC': 0.897252311688302}
Random Forest Metrics: {'Accuracy': 0.9987977315689981, 'Precision': 0.9999733102021752, 'Recall': 0.9979091756558797, 'F1 Score': 0.9989401766372271, 'ROC-AUC': 0.9989370930763652}

Random Forest performs better than Logistic Regression based on F1 Score, which is better for imbalanced datasets.


## Customer Churn Prediction Report

**1. Introduction:**
This report summarizes the process and findings of a customer churn prediction project using a dataset of customer information.

**2. Data Preprocessing:**
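- The 'CustomerID' column was dropped as it was not relevant for churn prediction.
- Rows with a missing 'Churn' label were removed, since they cannot be used for training or evaluation.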
- Missing values were handled by imputing the median for numerical features and the most frequent value for categorical features.
- Numerical features were standardized using StandardScaler.
- Categorical features were encoded using one-hot encoding.

**3. Model Selection and Training:**
- Two models were trained: Logistic Regression and Random Forest.
- Both models were trained using a pipeline that incorporated the data preprocessing steps mentioned above.

**4. Model Evaluation:**
- The models were evaluated on a held-out test split (30% of the data) using accuracy, precision, recall, F1-score, and ROC-AUC.
- Results for Logistic Regression and Random Forest models were compared to determine which model performed better in terms of each evaluation metric.


### Model Comparison and Explanation

Based on the metrics provided, **Random Forest** outperforms **Logistic Regression** in every key metric:

#### Logistic Regression:
- **Accuracy**: 89.58%
- **Precision**: 92.68%
- **Recall**: 88.65%
- **F1 Score**: 90.62%
- **ROC-AUC**: 89.73%

#### Random Forest:
- **Accuracy**: 99.88%
- **Precision**: 99.99%
- **Recall**: 99.79%
- **F1 Score**: 99.89%
- **ROC-AUC**: 99.89%

### Explanation:
- **F1 Score**: The **F1 Score** balances precision and recall, which makes it more informative than accuracy alone when the classes are unevenly distributed. Note that churners are not the minority class in this test split: the reported accuracy, precision and recall together imply that roughly 57% of the test customers churned. Random Forest's F1 Score of 99.89% is significantly higher than Logistic Regression's 90.62%, indicating that Random Forest identifies churned customers much more reliably.
  
- **Precision & Recall**: Random Forest shows near-perfect precision and recall, meaning that it correctly identifies almost all churners and very rarely makes incorrect positive predictions (false positives). Logistic Regression, while performing well, has slightly lower recall and precision, which results in a lower F1 score.

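- **ROC-AUC**: This metric evaluates how well the model can distinguish between classes. In this notebook it is computed from the predicted class labels rather than from predicted probabilities, so it equals the average of the true-positive and true-negative rates (balanced accuracy) at a single decision threshold. A score close to 1 (such as Random Forest’s 99.89%) suggests excellent classification ability. Logistic Regression, with an ROC-AUC of 89.73%, also performs well but is not as effective at separating churners from non-churners as Random Forest.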

### Conclusion:
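The **Random Forest** model clearly outperforms **Logistic Regression** on every reported metric, including F1 Score and ROC-AUC, making it the better choice for this customer churn prediction task, where identifying churners is crucial. The classes in this data are only mildly imbalanced, so the advantage does not depend on the choice of metric. Because near-perfect scores are unusual, they should be confirmed on an independent dataset before the model is relied upon.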