<a href="https://colab.research.google.com/github/rithuzwar/Customer-Churn-Prediction-and-Analysis/blob/main/PROJECT_1_CHURN_PREDICTION_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

df = pd.read_csv('Churn_Modelling.csv')
print(df.head())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [2]:
df = df.drop(['RowNumber', 'Surname'], axis=1)

print(df.head())

   CustomerId  CreditScore Geography  Gender  Age  Tenure    Balance  \
0    15634602          619    France  Female   42       2       0.00   
1    15647311          608     Spain  Female   41       1   83807.86   
2    15619304          502    France  Female   42       8  159660.80   
3    15701354          699    France  Female   39       1       0.00   
4    15737888          850     Spain  Female   43       2  125510.82   

   NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0              1          1               1        101348.88       1  
1              1          0               1        112542.58       0  
2              3          1               0        113931.57       1  
3              2          0               0         93826.63       0  
4              1          1               1         79084.10       0  


In [4]:
# Features and Target
X = df.drop('Exited', axis=1)  # All columns except the target
y = df['Exited']               # Target column

In [5]:
from sklearn.model_selection import train_test_split

# Split into training and testing sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train_raw.shape, X_test_raw.shape)


(8000, 11) (2000, 11)


In [6]:
# One-hot encode Geography and Gender
X_train_encoded = pd.get_dummies(X_train_raw, drop_first=True)
X_test_encoded = pd.get_dummies(X_test_raw, drop_first=True)

# Reindex test set columns to match train columns (in case dummy columns mismatch)
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

print(X_train_encoded.head())


      CustomerId  CreditScore  Age  Tenure    Balance  NumOfProducts  \
9254    15601116          686   32       6       0.00              2   
1561    15766374          632   42       4  119624.60              2   
1670    15716994          559   24       3  114739.92              1   
6087    15730759          561   27       9  135637.00              1   
6669    15797900          517   56       9  142147.32              1   

      HasCrCard  IsActiveMember  EstimatedSalary  Geography_Germany  \
9254          1               1        179093.26              False   
1561          1               1        195978.86               True   
1670          1               0         85891.02              False   
6087          1               0        153080.40              False   
6669          0               0         39488.04              False   

      Geography_Spain  Gender_Male  
9254            False         True  
1561            False         True  
1670             True        

In [7]:
#3.2
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Scale the encoded data
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

print(X_train_scaled[:5])  # Preview scaled data


[[-1.24645641  0.35649971 -0.6557859   0.34567966 -1.21847056  0.80843615
   0.64920267  0.97481699  1.36766974 -0.57946723 -0.57638802  0.91324755]
 [ 1.05381124 -0.20389777  0.29493847 -0.3483691   0.69683765  0.80843615
   0.64920267  0.97481699  1.6612541   1.72572313 -0.57638802  0.91324755]
 [ 0.3664786  -0.96147213 -1.41636539 -0.69539349  0.61862909 -0.91668767
   0.64920267 -1.02583358 -0.25280688 -0.57946723  1.73494238  0.91324755]
 [ 0.5580771  -0.94071667 -1.13114808  1.38675281  0.95321202 -0.91668767
   0.64920267 -1.02583358  0.91539272 -0.57946723 -0.57638802 -1.09499335]
 [ 1.49262956 -1.39733684  1.62595257  1.38675281  1.05744869 -0.91668767
  -1.54035103 -1.02583358 -1.05960019 -0.57946723 -0.57638802  0.91324755]]


In [8]:
#4.1
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
model = LogisticRegression(max_iter=5000, random_state=42)
model.fit(X_train_scaled, y_train)

print("Model training complete. Ready for predictions!")


Model training complete. Ready for predictions!


In [10]:
# Make predictions
y_pred = model.predict(X_test_scaled)

# Predict probabilities (for churn probability insights)
probs = model.predict_proba(X_test_scaled)[:, 1]


In [11]:
# Create results DataFrame using unscaled, human-readable test set
results_df = X_test_raw.copy()

# Add Actual Churn
results_df['Actual_Churn'] = y_test.reset_index(drop=True)

# Add Predicted Churn
import pandas as pd
results_df['Predicted_Churn'] = pd.Series(y_pred).reset_index(drop=True)

# Add Churn Probability
results_df['Churn_Probability'] = probs

# Save to CSV
results_df.to_csv('rf_churn_results_for_powerbi.csv', index=False)

print("Results exported successfully for Power BI! 🚀")


Results exported successfully for Power BI! 🚀


In [13]:
#SMOTE
# 1) Import Counter
from collections import Counter

# 2) Apply SMOTE to the scaled training data
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# 3) Now you can inspect the class distribution
print(f"Resampled dataset shape: {Counter(y_train_res)}")

Resampled dataset shape: Counter({0: 6356, 1: 6356})


In [14]:
#XG BOOST
# 4) Train XGBoost
from xgboost import XGBClassifier

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_res, y_train_res)

# 5) Predict on the (scaled) test set
y_pred_xgb   = xgb_model.predict(X_test_scaled)
probs_xgb    = xgb_model.predict_proba(X_test_scaled)[:, 1]

# 6) Evaluate
from sklearn.metrics import classification_report, accuracy_score

print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")


Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.91      1607
           1       0.66      0.58      0.62       393

    accuracy                           0.86      2000
   macro avg       0.78      0.75      0.77      2000
weighted avg       0.85      0.86      0.86      2000

XGBoost Accuracy: 0.8590


In [15]:
results_df = X_test_raw.copy()
results_df['Actual_Churn']      = y_test.reset_index(drop=True)
results_df['Predicted_Churn']   = pd.Series(y_pred_xgb).reset_index(drop=True)
results_df['Churn_Probability'] = probs_xgb
results_df.to_csv('xgb_churn_results_for_powerbi.csv', index=False)
