# Kaggle Competition



In [4]:
import pandas as pd

# Read Datasets

In [5]:
path = "/kaggle/Bank_Churn/train.csv"
path2 = "/kaggle/Bank_Churn/test.csv"

In [6]:
train_df = pd.read_csv(path)
test_df = pd.read_csv(path2)

In [7]:
train_df.columns

Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [8]:
test_df.columns

Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary'],
      dtype='object')

Check if any of the columns have null values

In [9]:
train_df.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [10]:
# Find duplicate records
duplicate_records = train_df.duplicated()
print("Duplicate records:")
print(train_df[duplicate_records])

# Find NA records
na_records = train_df.isna().any(axis=1)
print("NA records:")
print(train_df[na_records])

Duplicate records:
Empty DataFrame
Columns: [id, CustomerId, Surname, CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary, Exited]
Index: []
NA records:
Empty DataFrame
Columns: [id, CustomerId, Surname, CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary, Exited]
Index: []


Drop the unwanted columns that doesn't add any value to the final model

In [11]:
columns_to_drop = ['CustomerId', 'Surname']
train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

Since we have categorical columns like `Geography` and `Gender`, we need to convert them to a number,
use the `OneHotEncoder`.

In [12]:
from sklearn.preprocessing import OneHotEncoder

# Create an instance of LabelEncoder
ohe = OneHotEncoder()
categorical_columns = train_df.select_dtypes(include=['object']).columns.tolist()  # Geography, Gender

train_df = pd.get_dummies(train_df, columns=categorical_columns)
test_df = pd.get_dummies(test_df, columns=categorical_columns)

print("Train DF")
print(train_df.head())

print("\n")
print("Test DF")
print(test_df.head())

Train DF
   id  CreditScore   Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0   0          668  33.0       3       0.00              2        1.0   
1   1          627  33.0       1       0.00              2        1.0   
2   2          678  40.0      10       0.00              2        1.0   
3   3          581  34.0       2  148882.54              1        1.0   
4   4          716  33.0       5       0.00              2        1.0   

   IsActiveMember  EstimatedSalary  Exited  Geography_France  \
0             0.0        181449.97       0              True   
1             1.0         49503.50       0              True   
2             0.0        184866.69       0              True   
3             1.0         84560.88       0              True   
4             1.0         15068.83       0             False   

   Geography_Germany  Geography_Spain  Gender_Female  Gender_Male  
0              False            False          False         True  
1              False           

Few columns have varying/extreme data like too high or too low values like the Balance etc, to avoid steering our model into those direction, we need to normalize the data.

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Normalize the given columns
columns_to_normalize = ['EstimatedSalary', 'Balance', 'Age', 'CreditScore', 'Tenure', 'NumOfProducts']
train_df[columns_to_normalize] = scaler.fit_transform(train_df[columns_to_normalize])
test_df[columns_to_normalize] = scaler.fit_transform(test_df[columns_to_normalize])

# Print the updated dataframe
print("Train DF")
print(train_df.head())

print("\n")
print("Test DF")
print(test_df.head())

Train DF
   id  CreditScore       Age  Tenure   Balance  NumOfProducts  HasCrCard  \
0   0        0.636  0.202703     0.3  0.000000       0.333333        1.0   
1   1        0.554  0.202703     0.1  0.000000       0.333333        1.0   
2   2        0.656  0.297297     1.0  0.000000       0.333333        1.0   
3   3        0.462  0.216216     0.2  0.593398       0.000000        1.0   
4   4        0.732  0.202703     0.5  0.000000       0.333333        1.0   

   IsActiveMember  EstimatedSalary  Exited  Geography_France  \
0             0.0         0.907279       0              True   
1             1.0         0.247483       0              True   
2             0.0         0.924364       0              True   
3             1.0         0.422787       0              True   
4             1.0         0.075293       0             False   

   Geography_Germany  Geography_Spain  Gender_Female  Gender_Male  
0              False            False          False         True  
1            

There are many Binary Classifiers available like Random Forest, Decision Tree etc, the following Gradient Boosting Classifier was selected after trail and error as it was leading to better predictions

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

y = train_df["Exited"]
X = train_df.drop(columns=["Exited"])

# Load your tabular data and split it into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Find the ideal learning rate
# learn_rate = [0.5, 0.25, 0.21, 0.1, 0.05]
# result = []
# for lr in learn_rate:
#     clf = GradientBoostingClassifier(learning_rate = lr) 
#     clf.fit(X_train, y_train)
#     y_pred = clf.predict_proba(X_test)
#     y_pred = [i[1] for i in y_pred]
#     print(f"For {lr}: Gradient Boosting ROC score = {roc_auc_score(y_test,y_pred)}")
#     result.append(roc_auc_score(y_test,y_pred))

In [15]:
# Based on the above loop, we get better predictions with 0.21 learning_rate.
clf = GradientBoostingClassifier(learning_rate=0.21) 
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)
y_pred = [i[1] for i in y_pred]
print(f"Gradient Boosting ROC score = {roc_auc_score(y_test,y_pred)}")

Gradient Boosting ROC score = 0.8911337447871497


In [16]:
y_pred[:10]

[0.12253318659259739,
 0.02214654305396264,
 0.8507372624115774,
 0.7349567281848711,
 0.1555120179437907,
 0.020715142477013726,
 0.13632070098943475,
 0.010383029862619785,
 0.6131931197863975,
 0.36748588166902707]

# Time to execute on the test data

In [17]:
X_test_data = test_df

In [18]:
X_test_data.head()

Unnamed: 0,id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,165034,0.472,0.067568,0.2,0.0,0.333333,0.0,1.0,0.804903,True,False,False,True,False
1,165035,0.666,0.378378,0.2,0.0,0.0,1.0,0.0,0.362723,True,False,False,True,False
2,165036,0.612,0.216216,0.7,0.0,0.333333,1.0,0.0,0.694419,True,False,False,True,False
3,165037,0.662,0.243243,0.8,0.0,0.0,1.0,0.0,0.569654,True,False,False,False,True
4,165038,0.804,0.27027,1.0,0.483318,0.0,1.0,0.0,0.697164,False,True,False,False,True


In [19]:
Y = clf.predict_proba(X_test_data)

In [20]:
Y_pred = [i[1] for i in Y]

In [21]:
submission = pd.read_csv("/kaggle/Bank_Churn/sample_submission.csv")

In [22]:
submission["Exited"] = Y_pred

In [23]:
submission.head()

Unnamed: 0,id,Exited
0,165034,0.134034
1,165035,0.964785
2,165036,0.123307
3,165037,0.554399
4,165038,0.696403


In [24]:
submission.to_csv('submission.csv',index=False)