In [37]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
import io
import os
import zipfile

In [38]:
# Install the Kaggle API library and its dependencies.
!pip install kaggle --upgrade --quiet

In [None]:
# Upload your kaggle.json file.
# files.upload() returns a mapping of uploaded filename -> raw file bytes. Leaving
# the call as the cell's last expression would echo that mapping (and therefore the
# API key inside kaggle.json) into the saved cell output, so only the filenames
# are kept and shown here.
print("Please upload your kaggle.json API file.")
uploaded_names = sorted(files.upload())
print("Uploaded:", uploaded_names)

In [40]:
# Set up the Kaggle API client: place the uploaded key at ~/.kaggle/kaggle.json.
# Create the credentials directory; -p makes this a no-op when it already exists.
!mkdir -p ~/.kaggle
# Move (not copy) the key out of the working directory. Because the file is moved,
# re-running this cell without uploading again makes this command fail.
!mv kaggle.json ~/.kaggle/
# Restrict the key file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [41]:
# Kaggle dataset identifier in "<owner>/<dataset>" form.
DATASET_SLUG = "gauravtopre/bank-customer-churn-dataset"
# Shell command that downloads the dataset archive with the Kaggle CLI.
DATASET_API_COMMAND = f"kaggle datasets download {DATASET_SLUG}"
# The name of the zip file to expect (the dataset part of the slug plus ".zip").
ZIP_FILE_NAME = DATASET_SLUG.split("/")[-1] + ".zip"
# The directory to extract the files into.
EXTRACTION_DIR = "./bank-customer-dataset"

In [None]:
print("\nDownloading dataset...")
!$DATASET_API_COMMAND

In [None]:
# Unpack the downloaded archive into EXTRACTION_DIR, then delete the archive.
# If the archive is missing, only a message is printed; nothing is raised.
archive_present = os.path.exists(ZIP_FILE_NAME)
if not archive_present:
    print(f"Error: {ZIP_FILE_NAME} not found. Please check the dataset API command.")
else:
    os.makedirs(EXTRACTION_DIR, exist_ok=True)
    with zipfile.ZipFile(ZIP_FILE_NAME, mode='r') as archive:
        archive.extractall(path=EXTRACTION_DIR)
    print(f"Extraction of {ZIP_FILE_NAME} completed to {EXTRACTION_DIR}/")
    # The zip is removed after extraction, so a second run takes the error branch.
    os.remove(ZIP_FILE_NAME)

In [None]:
# Verify the files are extracted by listing the extraction directory.
print("\nExtracted files are available at:", EXTRACTION_DIR)
print("Listing contents of the extraction directory:")
# {EXTRACTION_DIR} is interpolated from the Python variable before the shell runs.
!ls -F {EXTRACTION_DIR}

In [45]:
# EXTRACTION_DIR is defined in the configuration cell and populated by the unzip cell.
# Build the path to the single CSV file inside it.
csv_file_name = 'Bank Customer Churn Prediction.csv'
dataset_file_path = os.path.join(EXTRACTION_DIR, csv_file_name)

In [46]:
# Load the CSV at dataset_file_path into a pandas DataFrame.
data = pd.read_csv(dataset_file_path)

In [None]:
# Preview the first rows of the dataset (shown as the cell's last expression).
data.head()

In [None]:
# Correlation heatmap restricted to the numeric columns of the dataset.
numeric_data = data.select_dtypes(include=np.number)
corr = numeric_data.corr()
fig, ax = plt.subplots()
sns.heatmap(corr, ax=ax)
plt.show()

In [None]:
# Box plot of age grouped by the churn label.
# Author's observation: churn and age appear to be significantly related.
sns.boxplot(data=data, x='churn', y='age')
plt.show()

In [None]:
# Box plot of account balance grouped by the churn label.
sns.boxplot(x='churn',y='balance',data=data)
# Render explicitly so the cell output is the figure, not the Axes repr
# (matches the other plotting cells that end with plt.show()).
plt.show()

In [None]:
# Box plot of credit score grouped by the churn label.
sns.boxplot(x='churn',y='credit_score',data=data)
# Render explicitly so the cell output is the figure, not the Axes repr
# (matches the other plotting cells that end with plt.show()).
plt.show()

In [None]:
# Scatter plot of balance (x) against estimated salary (y).
# Note: churn is not part of this plot; both axes are continuous customer attributes.
plt.scatter(data['balance'],data['estimated_salary'])
plt.xlabel('balance')
plt.ylabel('estimated_salary')
plt.show()

In [None]:
# Label-encode the categorical gender column into integer codes.
# This is label encoding (one integer per category), not one-hot encoding; the
# codes are stored in a new 'encoded' column next to the original.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
gender_codes = encoder.fit_transform(data['gender'])
data['encoded'] = gender_codes
data[['churn', 'encoded']].head()

In [None]:
# Number of rows per category in the raw gender column.
data['gender'].value_counts()

In [None]:
# Number of rows per integer code; comparing with the raw gender counts shows
# which code corresponds to which category.
data['encoded'].value_counts()

In [None]:
# Count churned customers separately for male and female customers.
# Vectorised boolean masks replace the row-by-row loop: they do not depend on the
# DataFrame having a default 0..n-1 index and avoid per-row label lookups.
# NOTE(review): assumes encoded == 1 means male and encoded == 0 means female,
# as the M_/F_ variable names imply — confirm against the encoder's classes.
churned = data['churn'] == 1
M_churn = int((churned & (data['encoded'] == 1)).sum())
F_churn = int((churned & (data['encoded'] == 0)).sum())
print(M_churn,F_churn)

In [None]:
# Bar chart of the number of churned customers per gender.
# The bars are absolute counts, not rates: comparing churn *rates* would require
# dividing each count by the number of customers of that gender.
plt.bar(['M','F'],[M_churn,F_churn])
plt.xlabel('gender')
plt.ylabel('churned customers')
plt.show()

In [58]:
# Columns used as model inputs; 'encoded' is the integer-coded gender column.
feature_columns = [
    'age',
    'credit_score',
    'encoded',
    'tenure',
    'balance',
    'products_number',
    'credit_card',
    'active_member',
    'estimated_salary',
]
# Target is the churn label; features are the columns listed above.
y = data['churn']
x = data[feature_columns]

In [59]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical across runs.
from sklearn.model_selection import train_test_split

x_tr, x_ts, y_tr, y_ts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a standard scaler on the training split only.
# Fitting just learns the scaling statistics: x_tr and x_ts are not modified here,
# so scaler.transform(...) must be called wherever scaled features are needed.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_tr)
scaler

In [None]:
# Model selection and training.
# The scaler fitted on the training split is applied to both splits here. Fitting
# it alone leaves x_tr / x_ts untouched, and with a regularised model, features on
# very different scales would otherwise be penalised unevenly.
from sklearn.linear_model import LogisticRegression

x_tr_scaled = scaler.transform(x_tr)
x_ts_scaled = scaler.transform(x_ts)

# C is the inverse of the regularisation strength, so this is lambda = 0.01.
model = LogisticRegression(C=1/0.01, solver='liblinear')
model.fit(x_tr_scaled, y_tr)
score = model.score(x_ts_scaled, y_ts)
print(f"model score is: {score:.4f}")

In [None]:
# Model coefficients, labelled with the feature each one belongs to.
# The labels are taken from x.columns rather than a hand-copied list, so they
# cannot drift out of sync with the features the model was actually given.
# coef_ holds one row of weights for a binary target; row 0 is that row.
# Weights are only comparable across features when the inputs share a scale.
pd.Series(model.coef_[0], index=x.columns, name='coefficient')

In [76]:
# Sweep the regularisation strength lambda and record the score for each value.
# Every candidate gets a fresh scaler + logistic-regression pipeline (C = 1/lambda)
# fitted on the training split and scored on the test split.
# NOTE(review): lambda is being selected on x_ts / y_ts, the same split used to
# report model quality; consider a validation split or cross-validation instead.
from sklearn.pipeline import make_pipeline

lambda_grid = np.arange(0.01, 100, 0.1)
test_score = []
for lam in lambda_grid:
    pipe = make_pipeline(
        StandardScaler(),
        LogisticRegression(C=1/lam, solver='liblinear'),
    )
    pipe.fit(x_tr, y_tr)
    scores = pipe.score(x_ts, y_ts)
    test_score.append(scores)

In [None]:
# Plot the test score values against the lambda value that produced each one.
# Without explicit x values the axis would show list positions, not lambdas.
# The grid must mirror the one iterated over when test_score was filled in.
lambda_values = np.arange(0.01, 100, 0.1)
plt.plot(lambda_values, test_score)
plt.xlabel('lambda (C = 1/lambda)')
plt.ylabel('test score')
plt.show()

In [None]:
# Value of lambda for the highest model score.
# np.argmax returns the *position* of the best score in test_score, not a lambda,
# so the position is mapped back onto the grid the tuning loop iterated over
# (np.arange(0.01, 100, 0.1)).
best_index = int(np.argmax(test_score))
value = float(np.arange(0.01, 100, 0.1)[best_index])
print(f"model value is: {value:.4f}")