### Importing the libraries

In [None]:
# Some Important Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Evaluation
from sklearn.metrics import classification_report

# Algorithms
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Prevent Warnings
import warnings
warnings.filterwarnings('ignore')

### Load and Prepare Data

In [None]:
# Location of the churn dataset, relative to the notebook's working directory
csv_path = '../input/churn-modelling/Churn_Modelling.csv'

# Read the raw data and preview the first rows
df = pd.read_csv(csv_path)
df.head()

### EDA

In [None]:
# Size of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Overview of the columns: names, dtypes and non-null counts
df.info()

- There are no **missing values** in the data.


In [None]:
# Keep only the mean / min / max rows of the summary statistics
# and show them rounded to two decimals
description = df.describe().loc[['mean', 'min', 'max']]
description.round(2)

### Age
- **Mean** age of people is about <span style="color: teal">39</span>

- **The youngest** person is <span style="color: teal">18</span> years old

- **The oldest** person is <span style="color: teal">92</span> years old

### Estimated Salary
- **Mean** estimated salary of people is about <span style="color: teal">$100090</span>

- **Max** estimated salary of people is about <span style="color: teal">$199992</span>

- **Min** estimated salary of people is about <span style="color: teal">$12</span>

### Notes
- 20% of people exited
- 71% of people have a credit card

### Visualization

In [None]:
# Correlation heatmap of the numeric features.
# The raw frame still holds text columns (e.g. Surname, Geography, Gender);
# pandas >= 2.0 no longer drops them silently inside DataFrame.corr() and
# raises instead, so the numeric columns are selected explicitly first.
numeric_df = df.select_dtypes(include = 'number')

plt.figure(figsize = (16, 9), dpi = 200)
sns.heatmap(numeric_df.corr(), cmap = 'mako', annot = True)
plt.show()

In [None]:
# Number of customers per gender
gender = df['Gender']

plt.figure(dpi = 90)
sns.histplot(gender, shrink = 0.9, alpha = 0.6, color = 'teal')
plt.show()

In [None]:
# Number of active vs. passive members.
# The 0/1 flag is mapped to readable labels before plotting.
membership_labels = {1: 'Active', 0: 'Passive'}
active = df['IsActiveMember'].map(membership_labels)

plt.figure(dpi = 90)
sns.histplot(active, shrink = 0.9, alpha = 0.6, color = 'teal')
plt.show()

- There are more males than females

In [None]:
# Age distribution (histogram normalised to a density, with a KDE overlay).
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with kde=True and stat='density' is its documented replacement.
plt.figure(dpi = 90)
sns.histplot(df.Age, kde = True, stat = 'density', color = 'teal')
plt.show()

- Age follows a roughly **normal** distribution with a **positive skew**

In [None]:
# Estimated-salary distribution (histogram normalised to a density, with a KDE overlay).
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with kde=True and stat='density' is its documented replacement.
plt.figure(dpi = 90)
sns.histplot(df.EstimatedSalary, kde = True, stat = 'density', color = 'teal')
plt.show()

- Estimated salary follows a roughly **uniform** distribution

In [None]:
# Gender split of the customers within each country
plt.figure(dpi = 90)
sns.histplot(
    data = df,
    x = 'Geography',
    hue = 'Gender',
    palette = 'mako',
    alpha = 0.6,
    shrink = 0.9,
)
plt.show()

In [None]:
# Age distribution per country (one KDE curve per Geography value).
# kdeplot's `shade` argument is deprecated; `fill` is the supported
# spelling and draws the same filled area under each curve.
fig = sns.FacetGrid(df, aspect = 5, hue = 'Geography', palette = 'mako')
fig.map(sns.kdeplot, 'Age', fill = True, alpha = 0.3)
fig.add_legend()
plt.show()

In [None]:
# Estimated-salary distribution per country (one KDE curve per Geography value).
# kdeplot's `shade` argument is deprecated; `fill` is the supported
# spelling and draws the same filled area under each curve.
fig = sns.FacetGrid(df, aspect = 5, hue = 'Geography', palette = 'mako')
fig.map(sns.kdeplot, 'EstimatedSalary', fill = True, alpha = 0.3)
fig.add_legend()
plt.show()

In [None]:
# Scatter plot of age against estimated salary.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
plt.figure(dpi = 90)
sns.scatterplot(x = df.Age, y = df.EstimatedSalary, color = 'teal', alpha = 0.3)
plt.show()

- There is no visible relationship between Age and Salary

### Data Preprocessing

In [None]:
def clean(df):
    """Return a model-ready version of the raw churn frame.

    The identifier-like columns ``RowNumber``, ``CustomerId`` and ``Surname``
    are removed, and ``Geography`` and ``Gender`` are one-hot encoded with the
    first level of each dropped. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the five columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the remaining original columns followed by the
        dummy columns.
    """
    identifier_cols = ['RowNumber', 'CustomerId', 'Surname']
    categorical_cols = ['Geography', 'Gender']

    without_ids = df.drop(columns = identifier_cols)
    return pd.get_dummies(without_ids, columns = categorical_cols, drop_first = True)

In [None]:
# Apply the preprocessing to the raw frame and preview the result
clean_df = clean(df)
clean_df.head()

### Prepare Data for Machine learning

In [None]:
# Target is the 'Exited' flag; every other column is a feature
Y = clean_df['Exited']
X = clean_df.drop(columns = 'Exited')

# Hold out a test set. No test_size is given, so scikit-learn's default
# applies; random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 10)

In [None]:
# Feature scaling
# The scaler learns its statistics from the training set only; the test set
# is transformed with those same statistics.
stdScale = StandardScaler().fit(x_train)
x_train = stdScale.transform(x_train)
x_test = stdScale.transform(x_test)

### Train model

**1. XGBoost**

In [None]:
# XGBoost classifier with a fixed seed, fitted on the scaled training data
xgb = XGBClassifier(random_state = 10).fit(x_train, y_train)
xgb

In [None]:
# Cross-validated search over the regularisation strength C and the penalty type.
grid = {"C": np.logspace(-3,3,7), "penalty": ["l1", "l2"]}

# LogisticRegression's default 'lbfgs' solver does not support the l1 penalty,
# so with the default solver every l1 candidate fails to fit and is scored NaN
# (the resulting warnings are silenced by the global warnings filter).
# 'liblinear' supports both l1 and l2, so the whole grid is actually evaluated.
logReg = LogisticRegression(solver = 'liblinear')
gridSearch = GridSearchCV(logReg, grid)
gridSearch.fit(x_train, y_train)

gridSearch.best_params_

In [None]:
# Use the model selected by the grid search instead of hard-coding its
# hyper-parameters, so this cell cannot drift out of sync with the search.
# GridSearchCV (refit=True by default) has already refitted best_estimator_
# on the full training set, so no further fit call is needed.
logReg = gridSearch.best_estimator_
logReg

### Test the model and show the metrics

**XGBoost Evaluation**

In [None]:
# XGBoost performance on the training set.
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports the support of the predictions.
y_pred = xgb.predict(x_train)
print(classification_report(y_train, y_pred))

In [None]:
# XGBoost performance on the held-out test set.
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports the support of the predictions.
y_pred = xgb.predict(x_test)
print(classification_report(y_test, y_pred))

**Logistic Regression Evaluation**

In [None]:
# Logistic-regression performance on the training set.
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports the support of the predictions.
y_pred = logReg.predict(x_train)
print(classification_report(y_train, y_pred))

In [None]:
# Logistic-regression performance on the held-out test set.
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports the support of the predictions.
y_pred = logReg.predict(x_test)
print(classification_report(y_test, y_pred))

**XGBoost** performs better than logistic regression on our data