In [184]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,roc_auc_score

In [185]:
# Directory holding the competition CSVs; change this single value to run the
# notebook against a different copy of the data.
DATA_DIR="/kaggle/input/mse-1-dataset"
train=pd.read_csv(f"{DATA_DIR}/train.csv")
test=pd.read_csv(f"{DATA_DIR}/test.csv")

In [186]:
# Preview the first ten rows of the training frame to check column names and value formats.
train.head(10)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15658852.0,Chidimma,651.0,France,Female,35.0,4.0,132271.3,1.0,1.0,0.0,86587.37,0.0
1,1,15615176.0,Chukwubuikem,642.0,France,Male,35.0,7.0,0.0,2.0,1.0,1.0,83917.49,0.0
2,2,15771543.0,Macleod,662.0,France,Female,42.0,4.0,0.0,3.0,0.0,0.0,52337.97,1.0
3,3,15776824.0,Hs?,659.0,France,Female,38.0,5.0,121702.73,1.0,0.0,0.0,73564.44,0.0
4,4,15676937.0,Ts'ui,584.0,Spain,Male,47.0,7.0,0.0,2.0,1.0,1.0,86619.77,0.0
5,5,15791851.0,Iloerika,685.0,France,Male,29.0,6.0,0.0,2.0,1.0,1.0,184317.74,0.0
6,6,15807909.0,Chukwudi,590.0,France,Male,33.0,8.0,0.0,2.0,0.0,1.0,175324.74,0.0
7,7,15683483.0,Ch'in,713.0,Spain,Male,40.0,4.0,0.0,2.0,0.0,1.0,164822.47,0.0
8,8,15765093.0,Smith,759.0,France,Female,50.0,4.0,0.0,1.0,0.0,1.0,174094.24,1.0
9,9,15654765.0,Palerma,479.0,France,Female,38.0,3.0,0.0,2.0,0.0,1.0,116548.02,0.0


In [187]:
# Missing-value count for every column of the training data.
train.isna().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [188]:
# Missing-value count for every column of the competition test data.
test.isna().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [189]:
# Drop the identifier columns so they are not used as features. 'id' is kept in
# `test` for now because the submission file needs it.
# errors='ignore' keeps this cell safe to re-run: `train`/`test` are reassigned
# in place, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
train=train.drop(columns=['id','CustomerId'],errors='ignore')
test=test.drop(columns=['CustomerId'],errors='ignore')

In [190]:
# Set the ids aside for the submission file, then remove them from the feature frame.
# The guard makes re-running this cell a no-op (test_id keeps the value captured on
# the first run) instead of a KeyError once 'id' has already been dropped.
if 'id' in test.columns:
    test_id=test['id']
    test=test.drop(columns=['id'])

In [191]:
# Column dtypes and non-null counts of the training frame after the identifier columns were dropped.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Surname          15000 non-null  object 
 1   CreditScore      15000 non-null  float64
 2   Geography        15000 non-null  object 
 3   Gender           15000 non-null  object 
 4   Age              15000 non-null  float64
 5   Tenure           15000 non-null  float64
 6   Balance          15000 non-null  float64
 7   NumOfProducts    15000 non-null  float64
 8   HasCrCard        15000 non-null  float64
 9   IsActiveMember   15000 non-null  float64
 10  EstimatedSalary  15000 non-null  float64
 11  Exited           15000 non-null  float64
dtypes: float64(9), object(3)
memory usage: 1.4+ MB


In [192]:
# Separate the label column ("Exited") from the predictor columns.
y=train["Exited"]
X=train.drop("Exited",axis=1)

In [193]:
# Split the predictors by dtype so each group gets its own preprocessing branch.
# "number" matches every numeric dtype (not only int64/float64), and every
# remaining column is treated as categorical, so no column ends up outside both
# lists and gets silently dropped by the ColumnTransformer built from them.
numeric_features=X.select_dtypes(include="number").columns
categorical_features=X.select_dtypes(exclude="number").columns

In [194]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric predictor columns.
X.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,658.9346,37.808133,5.052467,43350.165993,1.584333,0.784133,0.492267,117663.8
std,72.308524,8.254114,2.780904,59981.452231,0.531012,0.411436,0.499957,46983.7
min,431.0,18.0,0.0,0.0,1.0,0.0,0.0,11.55
25%,603.0,32.0,3.0,0.0,1.0,1.0,0.0,83052.03
50%,662.0,37.0,5.0,0.0,2.0,1.0,0.0,122949.5
75%,709.0,42.0,7.0,110148.5275,2.0,1.0,1.0,155743.5
max,850.0,74.0,10.0,207034.96,4.0,1.0,1.0,1640376.0


In [195]:
# Numeric branch: fill missing values with the column mean, then standardise.
mean_imputer=SimpleImputer(strategy='mean')
standardizer=StandardScaler()
numeric_pipeline=Pipeline(steps=[('impute',mean_imputer),('scaler',standardizer)])

In [196]:
# Categorical branch: fill missing values with the most frequent category, then one-hot encode.
# handle_unknown='ignore' lets transform accept categories that were not seen during fit.
mode_imputer=SimpleImputer(strategy='most_frequent')
one_hot=OneHotEncoder(handle_unknown='ignore')
categorical_pipeline=Pipeline(steps=[('impute',mode_imputer),('encoder',one_hot)])

In [197]:
# Route each column group through its own preprocessing branch.
column_branches=[
    ('num',numeric_pipeline,numeric_features),
    ('cat',categorical_pipeline,categorical_features),
]
preprocessing=ColumnTransformer(transformers=column_branches)

In [198]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
# Note: X_test/y_test are this validation hold-out, not the competition `test` frame.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [199]:
# Gradient-boosting hyperparameters collected in one place: many shallow trees
# with a small learning rate, each tree fitted on an 80% row subsample.
gb_params=dict(
    n_estimators=920,
    learning_rate=0.02,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=0.8,
    random_state=42,
)
model=GradientBoostingClassifier(**gb_params)

In [200]:
# Chain preprocessing and the classifier so both are fitted and applied together.
pipeline_steps=[('preprocessing',preprocessing),('model',model)]
pipeline=Pipeline(steps=pipeline_steps)

In [201]:
# Fit the preprocessing steps and the classifier on the training split only;
# the hold-out rows (X_test/y_test) are kept for validation.
pipeline.fit(X_train,y_train)

In [202]:
# Class probabilities for the validation hold-out; column 1 is the second class
# (presumably Exited == 1 - confirm against pipeline.classes_).
val_proba = pipeline.predict_proba(X_test)
y_pred_proba = val_proba[:, 1]

In [203]:
# ROC-AUC of the predicted probabilities against the hold-out labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

In [204]:
# Report the ROC-AUC measured on the validation hold-out.
print("ROC-AUC Score:", roc_auc)

ROC-AUC Score: 0.9378369926976932


In [206]:
# Score the competition test rows with the fitted pipeline and keep the
# probability column for the second class (same column used for validation).
test_proba=pipeline.predict_proba(test)
y_final=test_proba[:,1]

In [207]:
# Quick look at the predicted probabilities before they are written to the submission file.
print(y_final)

[0.01076926 0.43620002 0.93039284 ... 0.01105927 0.01474306 0.01489591]


In [208]:
# Assemble the two-column submission (id, predicted probability) and write it
# to disk without the DataFrame index.
submission=pd.DataFrame({"id":test_id}).assign(Exited=y_final)
submission.to_csv("submission1.csv",index=False)