In [None]:
import os

# Walk the Kaggle input mount and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

# Data Analysis

In [None]:
# Read the churn CSV from the Kaggle input mount into a DataFrame.
csv_path = '/kaggle/input/churn-modelling/Churn_Modelling.csv'
data = pd.read_csv(csv_path)

In [None]:
# Number of rows in the dataset.
data.shape[0]

In [None]:
# Preview the first five rows.
data.head(n=5)

In [None]:
# Preview the last five rows.
data.tail(n=5)

In [None]:
# How many rows fall into each value of the target column.
data.loc[:, 'Exited'].value_counts()

In [None]:
# Show the column labels available in the dataset.
column_index = data.columns
print(column_index)

#### Features: CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary

#### Target: Exited

In [None]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

# EDA
### Let's look at how the features are related

In [None]:
# Number of records for each Geography value.
# The column is referenced by name (not passed as a Series) so the call matches
# the other plots in this section, which all use `data=` plus column names.
ax = sns.countplot(x="Geography", data=data)
ax.set_title("Number of records by Geography");  # trailing ';' hides the text repr

In [None]:
# Age distribution for each IsActiveMember value.
ax = sns.boxplot(x="IsActiveMember", y="Age", data=data)
ax.set_title("Age distribution by IsActiveMember");  # trailing ';' hides the text repr

In [None]:
# Age distribution for each value of the target column.
ax = sns.boxplot(x="Exited", y="Age", data=data)
ax.set_title("Age distribution by Exited");  # trailing ';' hides the text repr

In [None]:
# Number of records for each NumOfProducts value.
ax = sns.countplot(x="NumOfProducts", data=data)
ax.set_title("Number of records by NumOfProducts");  # trailing ';' hides the text repr

In [None]:
# EstimatedSalary distribution for each NumOfProducts value.
ax = sns.boxplot(x="NumOfProducts", y="EstimatedSalary", data=data)
ax.set_title("EstimatedSalary distribution by NumOfProducts");  # trailing ';' hides the text repr

In [None]:
# Balance plotted against Age, one point per row.
ax = sns.scatterplot(x="Age", y="Balance", data=data)
ax.set_title("Balance vs. Age");  # trailing ';' hides the text repr

In [None]:
# Number of records for each HasCrCard value, with one bar per Geography value.
ax = sns.countplot(x="HasCrCard", data=data, hue="Geography")
ax.set_title("Number of records by HasCrCard, split by Geography");  # trailing ';' hides the text repr

In [None]:
# Number of records for each value of the target column.
ax = sns.countplot(x="Exited", data=data)
ax.set_title("Number of records by Exited");  # trailing ';' hides the text repr

# Data Cleaning

### Separating the categorical and numerical features

In [None]:
# Column groups used to split the raw frame. Each list is written once and reused
# in the drops below.
identifier_cols = ['RowNumber', 'CustomerId', 'Surname']
categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Tenure', 'NumOfProducts']
numeric_cols = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']
target_cols = ['Exited']

# One-hot encode the categorical columns, then remove everything that is not a dummy column.
encoded = pd.get_dummies(data, columns=categorical_cols)
cat_data = encoded.drop(columns=identifier_cols + numeric_cols + target_cols)

# Remove identifiers, categorical columns and the target from the raw frame.
num_data = data.drop(columns=identifier_cols + categorical_cols + target_cols)

In [None]:
cat_data.head()

In [None]:
num_data.head()

In [None]:
# Feature matrix: the one-hot columns followed by the numeric columns, joined side by side.
feature_blocks = [cat_data, num_data]
X = pd.concat(feature_blocks, axis='columns')

# Target vector: the 'Exited' column of the raw frame.
y = data['Exited']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# The target is passed as `y` (defined together with `X`) rather than re-selecting
# data['Exited'], so the features and target come from the same pair of variables.
# NOTE(review): if the two classes are imbalanced, consider adding stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=46)

# Model Training
### Treating RandomForestClassifier() as the baseline model

In [None]:
# Random forest with a fixed seed and otherwise default hyperparameters,
# fitted on the training split.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X=X_train, y=y_train)

In [None]:
# Random-forest class predictions for the held-out rows.
y_pred = rfc.predict(X=X_test)

In [None]:
# Text classification report followed by the confusion matrix for the current `y_pred`.
print(classification_report(y_true=y_test, y_pred=y_pred))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

### Training on XGBoost 

In [None]:
# Fit an XGBoost binary classifier on the training split.
# 'logloss' is the binary log-loss metric that pairs with objective='binary:logistic';
# 'mlogloss' is the multiclass variant and does not apply to a two-class target.
xgb = XGBClassifier(
    learning_rate=0.1,
    random_state=0,
    objective='binary:logistic',
    eval_metric='logloss',
)
xgb.fit(X_train, y_train)

# Overwrites the random-forest predictions held in `y_pred` with the XGBoost ones.
y_pred = xgb.predict(X_test)

In [None]:
# Text classification report followed by the confusion matrix for the current `y_pred`.
print(classification_report(y_true=y_test, y_pred=y_pred))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
# Candidate settings for the grid search: four learning rates, each paired with
# the 'error' evaluation metric.
param_grid = dict(
    learning_rate=[0.01, 0.02, 0.05, 0.1],
    eval_metric=['error'],
)

### Determining the best hyperparameters using GridSearchCV

In [None]:
# Exhaustive search over `param_grid` using 10-fold cross-validation on the
# training split, scored by accuracy.
grid = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', cv=10)
grid.fit(X_train, y_train)

best_accuracy = grid.best_score_  # mean cross-validated accuracy of the best candidate
best_param = grid.best_params_    # parameter setting that produced that score

print('Best Accuracy: {:.2f} %'.format(best_accuracy*100))
# Report the winning setting too; the accuracy alone does not say which candidate won.
print('Best Parameters:', best_param)