<a href="https://www.kaggle.com/code/shruthiiiee/bank-customer-churn-prediction?scriptVersionId=144197321" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Introduction**
In banking, keeping customers is crucial. This notebook uses machine learning to build a predictive model that identifies customers who are likely to leave (churn). The goal is to give the bank actionable insights so that it can reach out to at-risk customers before they churn.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw customer churn records from the Kaggle input directory.
df = pd.read_csv(
    '/kaggle/input/bank-customer-churn/Customer-Churn-Records.csv'
)

In [None]:
# Preview the first five rows (5 is the pandas default, spelled out here).
df.head(n=5)

In [None]:
# Display pandas' default summary statistics for the DataFrame.
df.describe()

In [None]:
# Print the column names, non-null counts and dtypes.
df.info()

In [None]:
# Count missing values per column; the bare name on the last line makes the
# notebook display the result.
null = df.isna().sum()
null

In [None]:
# Preview the last five rows (5 is the pandas default, spelled out here).
df.tail(n=5)

In [None]:
# Remove pure identifier columns in place; they are not used by any later cell.
id_columns = ['RowNumber', 'CustomerId', 'Surname']
df.drop(columns=id_columns, inplace=True)

In [None]:
 Countries = df['Geography'].unique() 
 print(Countries)

In [None]:
# List the distinct values of the Card Type column, in order of first appearance.
Types = pd.unique(df['Card Type'])
print(Types)

In [None]:
# Integer codes for the three text columns. Each encoded copy is stored next to
# the original column under a '<name>_n' label; values missing from a mapping
# become NaN.
gender_codes = {'Female': 0, 'Male': 1}
geography_codes = {'France': 0, 'Spain': 1, 'Germany': 2}
card_type_codes = {'DIAMOND': 0, 'GOLD': 1, 'SILVER': 2, 'PLATINUM': 3}

df['Gender_n'] = df['Gender'].map(gender_codes)
df['Geography_n'] = df['Geography'].map(geography_codes)
df['Card Type_n'] = df['Card Type'].map(card_type_codes)

In [None]:
# Tenure cut-offs used to bucket customers.
new_range = 3
intermediate_range = 7
long_term_range = 8  # NOTE(review): defined but not referenced by the binning below

# Left-closed bins (right=False):
#   [0, 3) -> New, [3, 7) -> Intermediate, [7, max + 1) -> Long-term.
# The upper edge is max + 1 so the largest tenure still falls inside the last bin.
tenure_bin_edges = [0, new_range, intermediate_range, df['Tenure'].max() + 1]
tenure_bin_labels = ['New', 'Intermediate', 'Long-term']
df['TenureGroup'] = pd.cut(
    df['Tenure'],
    bins=tenure_bin_edges,
    labels=tenure_bin_labels,
    right=False,
)

In [None]:
# Replace the tenure bucket labels with ordinal integer codes.
tenure_group_codes = {'New': 0, 'Intermediate': 1, 'Long-term': 2}
df['TenureGroup'] = df['TenureGroup'].map(tenure_group_codes)

In [None]:
# Preview the first five rows again, now including the derived columns.
df.head(n=5)

# **Exploratory Data Analysis**

In [None]:
# Bar chart of retained (Exited = 0) vs. churned (Exited = 1) customers.
#
# value_counts() orders its result by descending frequency, so on its own the
# positional colours and tick labels below would swap if churned customers ever
# became the majority. sort_index() pins the order to 0, 1 so that each bar
# always gets the label that belongs to its class.
churn_counts = df['Exited'].value_counts().sort_index()
colors = ['#7B68EE', '#483D8B']
plt.figure(figsize=(8, 6))
plt.bar(churn_counts.index, churn_counts.values, color=colors)
plt.xlabel('Churn (Exited)')
plt.ylabel('Count')
plt.xticks(churn_counts.index, labels=['Not Churned', 'Churned'])
plt.title('Count of Customers Churned vs. Not Churned')
plt.show()

In [None]:
# Pairwise correlation between the continuous/count features, drawn as an
# annotated heatmap.
correlation_columns = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]
correlation_matrix = df[correlation_columns].corr()
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Scatter-matrix of three numeric features, coloured by churn status.
pairplot_columns = ['CreditScore', 'Age', 'Balance', 'Exited']
sns.pairplot(df[pairplot_columns], hue='Exited', palette='husl')
plt.show()

In [None]:
# Box plot comparing the CreditScore spread of retained (0) and churned (1)
# customers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Exited', y='CreditScore', palette='Set1', ax=ax)
ax.set_xlabel('Churn (Exited)')
ax.set_ylabel('CreditScore')
ax.set_title('CreditScore Distribution by Churn')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Churned', 'Churned'])
plt.show()

In [None]:
# Box plot comparing the Tenure spread of retained (0) and churned (1)
# customers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Exited', y='Tenure', palette='Set1', ax=ax)
ax.set_xlabel('Churn (Exited)')
ax.set_ylabel('Tenure')
ax.set_title('Tenure Distribution by Churn')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Churned', 'Churned'])
plt.show()

In [None]:
# Box plot comparing the Age spread of retained (0) and churned (1) customers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Exited', y='Age', palette='Set1', ax=ax)
ax.set_xlabel('Churn (Exited)')
ax.set_ylabel('Age')
ax.set_title('Age Distribution by Churn')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Churned', 'Churned'])
plt.show()

In [None]:
# Box plot comparing the number of products held by retained (0) and churned
# (1) customers.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=df, x='Exited', y='NumOfProducts', palette='Set1', ax=ax)
ax.set_xlabel('Churn (Exited)')
ax.set_ylabel('Num Of Products')
ax.set_title('Number of products distribution by Churn')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Churned', 'Churned'])
plt.show()

In [None]:
# Box plot comparing the Balance spread of retained (0) and churned (1)
# customers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Exited', y='Balance', palette='Set1', ax=ax)
ax.set_xlabel('Churn (Exited)')
ax.set_ylabel('Balance')
ax.set_title('Balance Distribution by Churn')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Churned', 'Churned'])
plt.show()

In [None]:
# Box plot comparing the EstimatedSalary spread of retained (0) and churned (1)
# customers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Exited', y='EstimatedSalary', palette='Set1', ax=ax)
ax.set_xlabel('Churn (Exited)')
ax.set_ylabel('Estimated Salary')
ax.set_title('Estimated Salary Distribution by Churn')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Churned', 'Churned'])
plt.show()

In [None]:
# Customer counts per IsActiveMember value, split by churn status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='IsActiveMember', hue='Exited', palette='Set1', ax=ax)
ax.set_xlabel('Active Membership')
ax.set_ylabel('Count')
ax.set_title('Active Membership Distribution by Churn')
ax.legend(['Not Churned', 'Churned'])
ax.set_xticks([0, 1])
ax.set_xticklabels(['Inactive', 'Active'])
plt.show()

In [None]:
# Customer counts per gender, split by churn status.
plt.figure(figsize=(10, 6))
# Pin the category order explicitly. For a text column seaborn otherwise orders
# the bars by first appearance in the data, so hard-coding the tick labels as
# ['Female', 'Male'] afterwards could silently mislabel the bars. With `order`
# set, the tick labels come straight from the data and always match.
sns.countplot(
    data=df,
    x='Gender',
    hue='Exited',
    palette='Set1',
    order=['Female', 'Male'],
)
plt.xlabel('Gender')
plt.ylabel('Count')
plt.title('Gender Distribution by Churn')
plt.legend(['Not Churned', 'Churned'])
plt.show()

In [None]:
# Customer counts per Complain value, split by churn status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Complain', hue='Exited', palette='Set1', ax=ax)
ax.set_xlabel('Customer Complaint')
ax.set_ylabel('Count')
ax.set_title('Churn by Customer Complaint')
ax.legend(['Not Churned', 'Churned'])
ax.set_xticks([0, 1])
ax.set_xticklabels(['No Complaint', 'Complaint'])
plt.show()

In [None]:
# Customer counts per card type, split by churn status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Card Type', hue='Exited', palette='Set1', ax=ax)
ax.set_xlabel('Card Type')
ax.set_ylabel('Count')
ax.set_title('Churn by Card Type')
ax.legend(['Not Churned', 'Churned'])
plt.show()

In [None]:
# Churn rate per gender: churned customers (column 1) divided by all customers
# in the group. Rates are printed in order of first appearance in the data.
gender_churn_counts = df.groupby(['Gender', 'Exited']).size().unstack()
gender_churn_rates = gender_churn_counts[1] / gender_churn_counts.sum(axis=1)
for gender in df['Gender'].unique():
    churn_rate = gender_churn_rates.loc[gender]
    print(f'Churn Rate - {gender}: {churn_rate:.2%}')


In [None]:
# Churn rate per country: churned customers (column 1) divided by all customers
# in the group. Rates are printed in order of first appearance in the data.
geography_churn_counts = df.groupby(['Geography', 'Exited']).size().unstack()
geography_churn_rates = geography_churn_counts[1] / geography_churn_counts.sum(axis=1)
for country in df['Geography'].unique():
    churn_rate = geography_churn_rates.loc[country]
    print(f'Churn Rate - {country}: {churn_rate:.2%}')

In [None]:
# Churn rate per number of products held.
#
# fill_value=0 matters here: when a product count has no customers in one of
# the two classes (e.g. a group in which every customer churned), a plain
# unstack() leaves NaN in that cell, which turns the table into floats and can
# make a rate come out as NaN. Filling with 0 keeps the counts integral and the
# rates well defined.
num_of_products_churn_counts = (
    df.groupby(['NumOfProducts', 'Exited']).size().unstack(fill_value=0)
)
for num_products in df['NumOfProducts'].unique():
    total_customers = num_of_products_churn_counts.loc[num_products].sum()
    # Single .loc[row, column] lookup instead of chained indexing.
    churn_rate = num_of_products_churn_counts.loc[num_products, 1] / total_customers
    print(f'Churn Rate - {num_products} Products: {churn_rate:.2%}')

In [None]:
# Average credit score of retained (Exited = 0) vs. churned (Exited = 1)
# customers.
#
# These values are group means, not churn rates, so the printed label says
# "Average Credit Score". Groups are looked up by their Exited label rather
# than by list position.
credit_score_churn_counts = df.groupby(['Exited'])['CreditScore']
average_credit_score = credit_score_churn_counts.mean()
print(f'Average Credit Score - Not Churned: {average_credit_score.loc[0]:.2f}')
print(f'Average Credit Score - Churned: {average_credit_score.loc[1]:.2f}')

In [None]:
# Churn rate per card type: churned customers (column 1) divided by all
# customers in the group. Rates are printed in order of first appearance.
card_type_churn_counts = df.groupby(['Card Type', 'Exited']).size().unstack()
card_type_churn_rates = card_type_churn_counts[1] / card_type_churn_counts.sum(axis=1)
for card_type in df['Card Type'].unique():
    churn_rate = card_type_churn_rates.loc[card_type]
    print(f'Churn Rate - {card_type}: {churn_rate:.2%}')


In [None]:
# Average account balance of retained (Exited = 0) vs. churned (Exited = 1)
# customers.
#
# These values are group means, not churn rates, so the printed label says
# "Average Balance". Groups are looked up by their Exited label rather than by
# list position.
balance_churn_counts = df.groupby(['Exited'])['Balance']
average_balance = balance_churn_counts.mean()
print(f'Average Balance - Not Churned: {average_balance.loc[0]:.2f}')
print(f'Average Balance - Churned: {average_balance.loc[1]:.2f}')

In [None]:
# Average age of retained (Exited = 0) vs. churned (Exited = 1) customers.
#
# These values are group means, not churn rates, so the printed label says
# "Average Age". Groups are looked up by their Exited label rather than by
# list position.
age_churn_counts = df.groupby(['Exited'])['Age']
average_age = age_churn_counts.mean()
print(f'Average Age - Not Churned: {average_age.loc[0]:.2f}')
print(f'Average Age - Churned: {average_age.loc[1]:.2f}')

In [None]:
# Churn rate per Complain value: churned customers (column 1) divided by all
# customers in the group. Rates are printed in order of first appearance.
complaint_churn_counts = df.groupby(['Complain', 'Exited']).size().unstack()
complaint_churn_rates = complaint_churn_counts[1] / complaint_churn_counts.sum(axis=1)
for complaint in df['Complain'].unique():
    churn_rate = complaint_churn_rates.loc[complaint]
    print(f'Churn Rate - Complaint: {complaint} - {churn_rate:.2%}')


In [None]:
# Absolute class sizes: how many customers left and how many stayed.
exited_flags = df['Exited']
churned_count = (exited_flags == 1).sum()
not_churned_count = (exited_flags == 0).sum()

print(f'Number of Customers Churned: {churned_count}')
print(f'Number of Customers Not Churned: {not_churned_count}')

In [None]:
# Churn rate per Tenure value: churned customers (column 1) divided by all
# customers in the group. Rates are printed in order of first appearance.
tenure_churn_counts = df.groupby(['Tenure', 'Exited']).size().unstack()
tenure_churn_rates = tenure_churn_counts[1] / tenure_churn_counts.sum(axis=1)
for tenure in df['Tenure'].unique():
    churn_rate = tenure_churn_rates.loc[tenure]
    print(f'Churn Rate - Tenure {tenure}: {churn_rate:.2%}')

# **Result Of EDA**

## Gender:

The churn rate for female customers is significantly higher (25.07%) compared to male customers (16.47%). This suggests that gender may play a role in customer churn, with female customers being more likely to churn.

## Geography:

The churn rates vary by geography. Germany has the highest churn rate (32.44%), followed by Spain (16.67%) and France (16.17%). This indicates that customers from different countries may have varying tendencies to churn, with German customers having the highest likelihood of churning.

## Number of Products:

The number of products a customer holds is also associated with churn. Customers with 4 products have a 100% churn rate, but this group is relatively small, so the result may not be representative. Customers with 1 product have a relatively high churn rate (27.71%), indicating that those with fewer products are more likely to churn. However, customers with 2 products have a lower churn rate (7.60%).

## Credit Score:

The average credit score is slightly lower for churned customers (645.41) compared to those who did not churn (651.84). While there is a small difference, it may suggest that customers with slightly lower credit scores are more likely to churn.

## Card Type:

Churn rates do not show significant differences among different card types (DIAMOND, GOLD, SILVER, PLATINUM). This implies that the type of card does not strongly influence customer churn in this dataset.

## Tenure:

The churn rates by tenure indicate that customers with a tenure of 0 months have the highest churn rate at 23.00%, while customers with a tenure of 7 months have the lowest churn rate at 17.22%. Overall, there is variation in churn rates across different tenure periods, with shorter tenures generally exhibiting higher churn rates.

## Balance:

Churned customers have a higher average balance (91109.48) compared to those who did not churn (72742.75). This suggests that customers with higher account balances are more likely to churn, which might be counterintuitive.

## Age:

Churned customers have a higher average age (44.84) compared to those who did not churn (37.41). This gap of roughly 7.4 years implies that older customers are more likely to churn.

## Complaint:

Customers who made a complaint (Complaint: 1) have an extremely high churn rate (99.51%) compared to those with no complaint (Complaint: 0, 0.05%). This indicates that customers who express dissatisfaction through complaints are almost guaranteed to churn.

## Tenure
The churn rates by tenure indicate that customers with a tenure of 0 months have the highest churn rate at 23.00%, while customers with a tenure of 7 months have the lowest churn rate at 17.22%. Overall, there is variation in churn rates across different tenure periods, with shorter tenures generally exhibiting higher churn rates.

# **Building the Churn Prediction Model**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train and evaluate a random-forest churn classifier.
#
# Columns of X that are listed in neither feature list below (for example the
# derived '*_n' and 'TenureGroup' columns) are dropped by ColumnTransformer,
# whose default is remainder='drop'.
X = df.drop('Exited', axis=1)
y = df['Exited']

# Continuous / count-like columns, standardised before modelling.
# 'Point Earned' is treated as a numeric score here rather than being one-hot
# encoded value by value: one-hot encoding a many-valued number creates one
# sparse column per distinct value and encodes unseen test values as all zeros.
# NOTE(review): assumes 'Point Earned' is an integer column -- confirm with df.info().
numerical_features = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
    'Point Earned',
]

# Low-cardinality columns, one-hot encoded.
# NOTE(review): 'Complain' almost perfectly separates churned from retained
# customers in the EDA above. If complaints are only recorded at or after the
# time a customer leaves, this is target leakage and the scores printed below
# overstate real-world performance. Re-run without 'Complain' to check.
categorical_features = [
    'Geography',
    'Gender',
    'HasCrCard',
    'IsActiveMember',
    'Complain',
    'Satisfaction Score',
    'Card Type',
]

# stratify=y keeps the churned / not-churned proportions the same in the train
# and test sets, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# The `sparse=False` argument is intentionally omitted: it was deprecated in
# scikit-learn 1.2 (renamed to `sparse_output`) and removed in 1.4, where it
# raises a TypeError. The encoder default works on every version, and the
# random forest accepts either a dense or a sparse feature matrix.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))  # Handle unknown categories
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model = RandomForestClassifier(n_estimators=100, random_state=42)

# Preprocessing lives inside the pipeline so the scaler and encoder are fitted
# on the training split only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report)
print('Confusion Matrix:\n', conf_matrix)

# **Conclusion**

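* The model achieved very high accuracy on the test data (approximately 100%).
* The classification report indicates high precision, recall, and F1-score for both classes (0 and 1), suggesting that the model is performing exceptionally well.
* The confusion matrix shows that there are very few misclassifications, with only a small number of false positives and false negatives.
* Caveat: these near-perfect scores should be read with caution. The `Complain` feature on its own almost perfectly separates the two classes (99.51% churn with a complaint vs. 0.05% without; see the EDA results above), so the model is largely reproducing that single signal. If complaints are only recorded at or after the moment a customer leaves, this is target leakage, and the model should be re-evaluated without `Complain` to measure how well churn can be predicted ahead of time.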