# Problem: 

**Şirketi terk edecek müşterileri tahmin edebilecek bir makine öğrenmesi modeli geliştirebilir misiniz?**

- Amaç bir bankanın müşterilerinin bankayı terk etme ya da terk etmeme durumunun tahmin edilmesidir.

- Müşteri terkini tanımlayan olay müşterinin banka hesabını kapatmasıdır.

**Veri Seti Hikayesi:**

- 10000 gözlemden ve 12 değişkenden oluşmaktadır. 
- Bağımsız değişkenler müşterilere ilişkin bilgiler barındırmaktadır.
- Bağımlı değişken müşteri terk durumunu ifade etmektedir.

**Değişkenler:**

- Surname : Soy isim
- CreditScore : Kredi skoru
- Geography : Ülke (Germany/France/Spain)
- Gender : Cinsiyet (Female/Male)
- Age : Yaş
- Tenure : Kaç yıllık müşteri
- Balance : Bakiye
- NumOfProducts : Kullanılan banka ürünü sayısı
- HasCrCard : Kredi kartı durumu (0=No,1=Yes)
- IsActiveMember : Aktif üyelik durumu (0=No,1=Yes)
- EstimatedSalary : Tahmini maaş
- Exited : Terk mi değil mi? (0=No,1=Yes)


#### Kütüphane Import İşlemleri ve Kütüphane Ayarları

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.base import clone

from sklearn.linear_model import LogisticRegression  
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

import warnings
# Silence the noisiest warning categories one by one ...
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 
# ... and then every remaining category as well (no category is given here).
warnings.simplefilter(action = "ignore") 

%config InlineBackend.figure_format = 'retina'


# Display all columns and rows, and print floats with four decimals.
pd.set_option('display.max_columns', None); pd.set_option('display.max_rows', None);
pd.set_option('display.float_format', lambda x: '%.4f' % x)


#### Metod Tanımlamaları

In [None]:
def read_data(path="../input/churn-predictions-personal/Churn_Predictions.csv"):
    """Load the churn data set and drop the identifier columns.

    Parameters
    ----------
    path : str, path-like or file-like, optional
        Source handed to ``pandas.read_csv``. Defaults to the Kaggle input
        path used by this notebook, so ``read_data()`` keeps working.

    Returns
    -------
    pandas.DataFrame
        The data without the ``RowNumber``, ``Surname`` and ``CustomerId``
        columns.
    """
    df = pd.read_csv(path)
    return df.drop(["RowNumber", "Surname", "CustomerId"], axis=1)

def convert_bool(df,cols):
    """Cast every column named in ``cols`` to ``bool``.

    The conversion is written back into ``df`` itself; the same frame is
    returned for convenience.
    """
    for column_name in cols:
        as_bool = df[column_name].astype('bool')
        df[column_name] = as_bool
    return df

def cat_countplot(fig,axarr,boyut,cols,hue,data=None):
    """Draw one count plot per column on a square grid of axes.

    Parameters
    ----------
    fig : matplotlib figure
        Not used by the function; kept so existing calls stay valid.
    axarr : 2-D array of axes
        Grid indexed as ``axarr[i][j]``.
    boyut : sequence of int
        Indices used for both the row and the column of the grid.
    cols : sequence of str
        Columns to plot, assigned to the grid cells row by row. The list is
        no longer modified; surplus columns or surplus cells are ignored.
    hue : str
        Column used to split the bars.
    data : pandas.DataFrame, optional
        Frame to plot from. When omitted, the notebook-level ``df`` is used,
        as before.
    """
    frame = df if data is None else data
    cells = [(i, j) for i in boyut for j in boyut]
    # zip pairs each cell with the next column without touching ``cols``.
    for (i, j), column in zip(cells, cols):
        sns.countplot(x = column, hue = hue, data = frame, ax=axarr[i][j])
                
def churn_countplot():
    """Plot the four categorical variables on a 2x2 grid, split by ``Exited``."""
    categorical = ["Geography","Gender","HasCrCard","IsActiveMember"]
    grid_index = [0,1]
    figure, axes = plt.subplots(2, 2, figsize=(20, 12))
    cat_countplot(figure,axes,grid_index,categorical,"Exited")
    
def num_boxplot(fig,axarr,boyut,cols,hue,data=None):
    """Draw one box plot per column on a grid of axes, grouped by ``hue``.

    Parameters
    ----------
    fig : matplotlib figure
        Not used by the function; kept so existing calls stay valid.
    axarr : 2-D array of axes
        Grid indexed as ``axarr[i][j]``.
    boyut : sequence of int
        Indices used for both rows and columns. Column indices that do not
        exist in a grid row are skipped, which is what the former
        ``j != 2`` guard did for the 3x2 grid used in this notebook.
    cols : sequence of str
        Columns to plot, assigned to the grid cells row by row. The list is
        no longer modified.
    hue : str
        Grouping column, used for both the x axis and the colour.
    data : pandas.DataFrame, optional
        Frame to plot from. When omitted, the notebook-level ``df`` is used,
        as before.
    """
    frame = df if data is None else data
    cells = [(i, j) for i in boyut for j in boyut if j < len(axarr[i])]
    # zip pairs each cell with the next column without touching ``cols``.
    for (i, j), column in zip(cells, cols):
        sns.boxplot(y=column, x = hue, hue = hue, data = frame, ax=axarr[i][j])

def churn_boxplot():
    """Plot the six numeric variables on a 3x2 grid, grouped by ``Exited``."""
    numeric = ["CreditScore","Age","Tenure","Balance","NumOfProducts","EstimatedSalary"]
    grid_index = [0,1,2]
    figure, axes = plt.subplots(3, 2, figsize=(20, 12))
    num_boxplot(figure,axes,grid_index,numeric,"Exited")

def feature_engineering(df):
    """Append five ratio features to ``df`` (in place) and return it.

    Divisions by ``Tenure`` use ``Tenure + 0.01`` as the denominator.
    """
    tenure_offset = df["Tenure"] + 0.01
    df['BalanceBySalary'] = df["Balance"] / df["EstimatedSalary"]
    df['BalanceByTenure'] = df["Balance"] / tenure_offset
    df['TenureByAge'] = df["Tenure"] / df["Age"]
    df['CreditScoreByAge'] = df["CreditScore"] / df["Age"]
    df['CreditScoreByTenure'] = df["CreditScore"] / tenure_offset
    return df

def get_catvar(df) :
    """Split the ``object``/``bool`` columns off ``df``.

    Returns a tuple ``(remaining_frame, categorical_frame, column_names)``;
    the input frame itself is left untouched.
    """
    cat_col = [name for name, kind in df.dtypes.items()
               if kind == 'object' or kind == 'bool']
    picked = df[cat_col]
    kat_df = pd.DataFrame(picked, index = picked.index)
    remaining = df.drop(cat_col, axis = 1)
    return remaining, kat_df, cat_col

def get_numvar(cat_col,df):
    """Split every column not listed in ``cat_col`` off ``df``.

    Returns a tuple ``(remaining_frame, numeric_frame, column_names)``;
    the input frame itself is left untouched.
    """
    num_col = [name for name in df.columns if name not in cat_col]
    picked = df[num_col]
    num_df = pd.DataFrame(picked, index = picked.index)
    remaining = df.drop(num_col, axis = 1)
    return remaining, num_df, num_col

def data_encoding(kat_df,columns) :
    """One-hot encode ``columns`` of ``kat_df``, dropping the first level of each."""
    encoded = pd.get_dummies(kat_df, columns = columns, drop_first = True)
    return encoded

def one_hot_encoding(kat_df):
    """One-hot encode the ``Gender`` and ``Geography`` columns of ``kat_df``."""
    return data_encoding(kat_df, ['Gender', 'Geography'])

def get_outlier_col(num_df) :
    """Return the columns of ``num_df`` that contain high-side outliers.

    For every column the 5th and 95th percentiles are taken as Q1 and Q3
    and a value counts as an outlier when it exceeds ``Q3 + 1.5 * (Q3 - Q1)``.
    Only the upper fence is checked.

    Parameters
    ----------
    num_df : pandas.DataFrame
        Frame of numeric columns.

    Returns
    -------
    list
        Names of the columns with at least one value above the upper fence,
        in column order.
    """
    outlier_col = []
    for feature in num_df:
        values = num_df[feature]
        q_low = values.quantile(0.05)
        q_high = values.quantile(0.95)
        upper = q_high + 1.5 * (q_high - q_low)

        # Test the comparison mask itself. Testing the truthiness of the
        # filtered rows misses outliers whose value is 0.
        if (values > upper).any():
            outlier_col.append(feature)

    return outlier_col

def outlier_boxplot(fig,axarr,boyut,num_df,aykiri_cols) :
    """Draw one box plot per outlier column, each on its own axis.

    Parameters
    ----------
    fig : matplotlib figure
        Not used by the function; kept so existing calls stay valid.
    axarr : sequence of axes
        Axes indexed as ``axarr[i]``.
    boyut : sequence of int
        Axis indices to fill, in order.
    num_df : pandas.DataFrame
        Frame holding the columns to plot.
    aykiri_cols : sequence of str
        Columns to plot. The list is no longer modified; surplus columns or
        surplus axes are ignored.
    """
    # zip pairs each axis with the next column without touching the list.
    for i, column in zip(boyut, aykiri_cols):
        sns.boxplot(x = num_df[column], ax = axarr[i])
            
def draw_outlier_boxplot(num_df,aykiri_cols):
    """Create a 3x1 figure and box-plot the given outlier columns on it."""
    axis_index = [0,1,2]
    figure, axes = plt.subplots(3, 1, figsize=(10, 15))
    outlier_boxplot(figure,axes,axis_index,num_df,aykiri_cols)
    
def handle_outlier(outlier_cols,num_df) : 
    """Cap the high-side outliers of the given columns at the upper fence.

    The fence is ``Q3 + 1.5 * (Q3 - Q1)`` with the 5th and 95th percentiles
    used as Q1 and Q3. ``num_df`` is modified in place and returned.
    """
    for column in outlier_cols:
        low_q = num_df[column].quantile(0.05)
        high_q = num_df[column].quantile(0.95)
        spread = high_q - low_q
        cap = high_q + 1.5*spread
        above_cap = num_df[column] > cap
        num_df.loc[above_cap, column] = cap
    return num_df

def eliminate_outlier_via_lof(num_df):
    """Drop the rows with the lowest Local Outlier Factor scores.

    The 12th smallest negative outlier factor is used as the threshold;
    only rows scoring strictly above it are kept. The labels returned by
    ``fit_predict`` are not used.

    Returns a tuple ``(removed_index, kept_frame)``.
    """
    detector = LocalOutlierFactor(n_neighbors = 20, contamination = 0.1)
    detector.fit_predict(num_df)
    scores = detector.negative_outlier_factor_
    threshold = np.sort(scores)[11]
    keep_mask = scores > threshold
    removed_index = num_df[~keep_mask].index
    kept = num_df[keep_mask]
    return removed_index, kept

def standardization(num_df):
    """Scale every column of ``num_df`` with a ``RobustScaler``.

    Returns a new frame with the same columns and index as the input.
    """
    scaler = RobustScaler().fit(num_df)
    scaled_values = scaler.transform(num_df)
    return pd.DataFrame(scaled_values, columns=num_df.columns, index = num_df.index)

def show_classification_ratio(gecici_df):
    """Print the percentage of each ``Exited`` class and draw its count plot."""
    class_share = gecici_df["Exited"].value_counts()*100/len(gecici_df)
    print(class_share)
    count_axes = sns.countplot(x = 'Exited', data = gecici_df)
    print(count_axes)
    
def handle_imbalanced_data(gecici_df):
    """Oversample the data with SMOTE.

    ``Exited`` is taken as the target and all other columns as features.
    Returns the resampled ``(X, y)`` pair.
    """
    target = gecici_df["Exited"]
    features = gecici_df.drop("Exited", axis = 1)
    sampler = SMOTE(random_state = 23456)
    X, y = sampler.fit_resample(features, target)
    return X,y

def create_model_object(models):
    """Create the seven base classifiers and append them to ``models``.

    ``models`` receives ``(name, estimator)`` tuples in the order LR, KNN,
    SVM, CART, RF, LGBM, XGBM. The list and the individual estimators are
    returned.
    """
    lr = LogisticRegression(random_state = 12345)
    knn = KNeighborsClassifier()
    svm = SVC(gamma='auto',random_state = 12345)
    cart = DecisionTreeClassifier(random_state = 12345)
    rf = RandomForestClassifier(random_state = 12345)
    lgbm = LGBMClassifier(random_state = 12345)
    xgbm = XGBClassifier(random_state = 12345)

    named_models = [
        ('LR', lr),
        ('KNN', knn),
        ('SVM', svm),
        ('CART', cart),
        ('RF', rf),
        ("LGBM", lgbm),
        ('XGBM', xgbm),
    ]
    for entry in named_models:
        models.append(entry)
    return models, lr, knn, svm, cart, rf, lgbm, xgbm

def calculate_base_cv_acc(models,X_train,y_train,X_test,y_test,sonuc_ilkel_cv,sonuc_ilkel_acc):
    """Score the untuned models by 10-fold CV and on the hold-out set.

    Parameters
    ----------
    models : list of (str, estimator)
        Models to evaluate; every estimator is fitted on the training data.
    X_train, y_train : training features and target
    X_test, y_test : hold-out features and target
    sonuc_ilkel_cv : list
        Receives ``(name, cv_mean, cv_std)`` tuples (appended in place).
    sonuc_ilkel_acc : list
        Receives ``(name, accuracy)`` tuples (appended in place).

    Returns
    -------
    pandas.DataFrame
        One row per model with the CV mean, CV standard deviation and
        hold-out accuracy.
    """
    # KFold only accepts random_state together with shuffle=True; recent
    # scikit-learn releases raise ValueError otherwise. With a fixed seed
    # one splitter can be shared by all models.
    kfold = KFold(n_splits = 10, shuffle = True, random_state = 12345)
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring= "accuracy")
        sonuc_ilkel_cv.append((name,cv_results.mean(),cv_results.std()))
    sonuc_ilkel_cv_df = pd.DataFrame(sonuc_ilkel_cv, columns = ["Model Ismi", "Ilkel CV Skor","Ilkel CV SSapma"])
    sonuc_ilkel_cv_df = sonuc_ilkel_cv_df.set_index("Model Ismi")
    sonuc_ilkel_cv_df.sort_values('Ilkel CV Skor', ascending=True, inplace = True)

    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        sonuc_ilkel_acc.append((name,acc))
    sonuc_ilkel_acc_df = pd.DataFrame(sonuc_ilkel_acc, columns = ["Model Ismi", "Ilkel ACC Skor"])
    sonuc_ilkel_acc_df = sonuc_ilkel_acc_df.set_index("Model Ismi")
    sonuc_ilkel_acc_df.sort_values('Ilkel ACC Skor', ascending=True, inplace = True)

    # The two tables are aligned on the model name.
    sonuc_ilkel = pd.concat([sonuc_ilkel_cv_df,sonuc_ilkel_acc_df], axis = 1)

    return sonuc_ilkel

def feat_imp(fig,axarr,boyut,fi_models,X):
    """Draw a feature-importance bar plot for each listed model.

    Parameters
    ----------
    fig : matplotlib figure
        Not used by the function; kept so existing calls stay valid.
    axarr : sequence of axes
        Axes indexed as ``axarr[i]``.
    boyut : sequence of int
        Positions in ``fi_models`` (and ``axarr``) to plot.
    fi_models : list of (str, estimator)
        Fitted models exposing ``feature_importances_``.
    X : pandas.DataFrame
        Frame whose column names label the importances.
    """
    for i in boyut:
        model_name, model = fi_models[i]
        feature_imp = pd.Series(model.feature_importances_,index=X.columns).sort_values(ascending=False)
        ax = sns.barplot(x=feature_imp, y=feature_imp.index, ax = axarr[i])
        ax.set_xlabel('Significance Score Of Variables')
        ax.set_ylabel('Variables')
        # The title comes from the list that was passed in, not from the
        # notebook-level ``fi_models_base``.
        ax.set_title(model_name + " Variable Severity Levels")
            
def draw_feat_imp(fi_models,X):
    """Create a 4x1 figure and plot the feature importances of four models."""
    axis_index = [0,1,2,3]
    figure, axes = plt.subplots(4, 1, figsize=(10, 25))
    feat_imp(figure,axes,axis_index,fi_models,X)
    
def compML_tuned(params, model, alg, model_ismi, X_train, y_train, X_test, y_test):
    """Grid-search a model, refit the best setting and score it.

    Parameters
    ----------
    params : dict
        Parameter grid for ``GridSearchCV``.
    model : estimator
        Estimator instance searched over.
    alg : callable
        Estimator class; called with the best parameters to build the
        final model.
    model_ismi : str
        Model name used in the printed messages and result tuples.
    X_train, y_train : training features and target
    X_test, y_test : hold-out features and target

    Returns
    -------
    tuple
        ``(model_tuned, y_pred)``: the refitted model and its hold-out
        predictions.

    Notes
    -----
    Appends to the notebook-level lists ``sonuc_tuned_cv`` and
    ``sonuc_tuned_acc``.
    """
    # KFold only accepts random_state together with shuffle=True; recent
    # scikit-learn releases raise ValueError otherwise.
    kfold = KFold(n_splits = 10, shuffle = True, random_state = 12345)
    cv_model = GridSearchCV(model,params, cv = kfold, n_jobs = -1, verbose = 2).fit(X_train,y_train)
    # The final estimator is built from the winning parameters only, so
    # settings of ``model`` outside the grid are not carried over.
    model_tuned = alg(**cv_model.best_params_).fit(X_train,y_train)
    cv_tuned = cross_val_score(model_tuned, X_train, y_train, cv = kfold).mean()
    print(model_ismi + " CV Tuned: " + str(cv_tuned))
    sonuc_tuned_cv.append((model_ismi,cv_model.best_params_,cv_tuned))

    y_pred = model_tuned.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    sonuc_tuned_acc.append((model_ismi,acc))
    print(model_ismi + " ACC: " + str(acc))
    return model_tuned, y_pred

def define_params():
    """Return the hyper-parameter grids used for tuning.

    Returns
    -------
    tuple of dict
        Grids in the order LR, KNN, SVM, CART, RF, LGBM, XGBM.

    Notes
    -----
    Every list holds distinct values: a repeated value only makes
    ``GridSearchCV`` fit the same configuration again.
    """
    lr_params = {'C': [0.1,0.5,1,10,50,100],
                 'max_iter': [250],
                 'fit_intercept':[True],
                 'intercept_scaling':[1],
                 'penalty':['l2'],
                 'tol':[0.00001,0.0001,0.000001]}

    knn_params = {'n_neighbors': np.arange(1,50,1)}

    svm_params = {'C': [0.5,100,150],
                  'gamma': [0.1,0.01,0.001],
                  'probability':[True],
                  'kernel': ['rbf']}

    cart_params = {"max_depth": [2,3,4,5,10,20, 100, 1000,None],
                  "min_samples_split": [2,10,5,30,50]}

    rf_params = {'max_depth': [3, 5, 6, 7, 8],
                 'max_features': [2,4,6,7,8,9],
                 'n_estimators':[50,100],
                 'min_samples_split': [3, 5, 6, 7]}

    # NOTE(review): 'gamma' looks like an XGBoost parameter name; confirm
    # that LightGBM actually uses it before spending grid time on it.
    lgbm_params = {'max_depth': [5,6,7,8],
                   'gamma': [0.01,0.001],
                   'min_child_weight':[1,5,10],
                   'learning_rate': [0.05,0.1, 0.2, 0.3],
                   'n_estimators':[5,10,20,100]}

    xgbm_params = {'max_depth': [5,6,7,8],
                   'gamma': [0.01,0.001],
                   'min_child_weight':[1,5,10],
                   'learning_rate': [0.05,0.1, 0.2, 0.3],
                   'n_estimators':[5,10,20,100]}

    return lr_params, knn_params, svm_params, cart_params, rf_params, lgbm_params, xgbm_params

def calculate_tuned_cv_acc(sonuc_tuned_cv, sonuc_tuned_acc):
    """Combine the tuned CV scores and hold-out accuracies into one table.

    ``sonuc_tuned_cv`` holds ``(name, best_params, cv_score)`` tuples and
    ``sonuc_tuned_acc`` holds ``(name, accuracy)`` tuples. The result is
    indexed by model name.
    """
    cv_table = (
        pd.DataFrame(sonuc_tuned_cv, columns = ["Model Ismi","Best Params", "Tune Edilmis CV Skor"])
        .set_index("Model Ismi")
        .sort_values('Tune Edilmis CV Skor', ascending=True)
    )
    acc_table = (
        pd.DataFrame(sonuc_tuned_acc, columns = ["Model Ismi","Tune Edilmis ACC Skor"])
        .set_index("Model Ismi")
        .sort_values('Tune Edilmis ACC Skor', ascending=True)
    )
    return pd.concat([cv_table, acc_table], axis = 1)

def calculate_result(sonuc_ilkel,sonuc_tuned):
    """Place the base and tuned result tables side by side."""
    return pd.concat([sonuc_ilkel,sonuc_tuned], axis = 1)

#### 1). Veriyi Okuma

In [None]:
# Load the data and take an independent copy of the freshly loaded frame.
df = read_data()
df_copy = df.copy()

#### 2). Keşifçi Veri Analizi

In [None]:
# Structural information about the data set
df.info()

In [None]:
# Data type of each variable
df.dtypes

In [None]:
# Which values does NumOfProducts take?
df['NumOfProducts'].unique().tolist()

In [None]:
# Descriptive statistics
df.describe([0.10,0.25,0.50,0.75,0.90,0.95,0.99]).T

In [None]:
# Correlate the numeric columns only: the text columns (Geography, Gender)
# make DataFrame.corr raise on pandas >= 2.0.
df.select_dtypes(include="number").corr()

### Tip Dönüşümleri

#### Bool tipine dönüşmesi gereken değişkenler : 
- HasCrCard 
- IsActiveMember
- Exited

In [None]:
# Type conversions
cols = ["HasCrCard","IsActiveMember","Exited"]
df = convert_bool(df,cols)

In [None]:
# Did the type conversion succeed? It did.
df.dtypes

In [None]:
# The bool-typed variables are expected to be absent from this summary.
df.describe([0.10,0.25,0.50,0.75,0.90,0.95,0.99]).T

In [None]:
# Shape of the data set
df.shape

In [None]:
# How many missing observations does each variable have?
df.isnull().sum()

In [None]:
# Number of unique values per variable
df.nunique()

#### Veri Görselleştirme

In [None]:
# Count plots of churn status for the key categorical variables.
churn_countplot()

In [None]:
# Box plots of churn status for the continuous variables.
churn_boxplot()

#### 3). Değişken Mühendisliği (Feature Engineering)

In [None]:
# Add the ratio features and preview the result.
df = feature_engineering(df)
df.head()

#### 4). Veri Ön İşleme

In [None]:
# Missing values
df.isnull().sum()

In [None]:
# Variable types
df.dtypes

In [None]:
# Categorical variables (object/bool columns are moved into kat_df)
df, kat_df, cat_col = get_catvar(df)

# Numeric variables (all remaining columns are moved into num_df)
df, num_df, num_col = get_numvar(cat_col,df)

In [None]:
df.head()

In [None]:
kat_df.head()

In [None]:
cat_col

In [None]:
num_df.head()

In [None]:
num_col

In [None]:
# One-hot encoding
kat_df = one_hot_encoding(kat_df)

In [None]:
kat_df.dtypes

In [None]:
# Type conversions for the dummy columns
cols = ["Gender_Male","Geography_Germany","Geography_Spain"]
kat_df = convert_bool(kat_df,cols)

In [None]:
kat_df.dtypes

In [None]:
kat_df.head()

#### Aykırı Gözlem

#### a). Klasik Yöntemle Aykırı Gözlem Analizi

In [None]:
# Find the columns with high-side outliers and plot them.
# A copy of the list is taken so the column names are still available for
# capping even if the plotting helper alters the list it is given.
aykiri_cols = get_outlier_col(num_df)
outlier_cols = aykiri_cols.copy()
draw_outlier_boxplot(num_df,aykiri_cols)

In [None]:
# Cap the outliers, then plot the same columns again to check the result.
num_df = handle_outlier(outlier_cols,num_df)
draw_outlier_boxplot(num_df,outlier_cols)

#### b). LOF ile Aykırı Gözlem Analizi

In [None]:
# Remove the rows flagged by LOF from the numeric frame.
aykiri_index,num_df = eliminate_outlier_via_lof(num_df)

# Drop the rows with an outlier index from the categorical variables
kat_df = kat_df.drop(aykiri_index)

# Drop the rows with an outlier index from df
df = df.drop(aykiri_index)

# All three frames should now have the same number of rows.
print(num_df.shape)
print(kat_df.shape)
print(df.shape)

#### Değişken Standardizasyonu

In [None]:
# Preview both frames before scaling.
num_df.head()

In [None]:
kat_df.head()

In [None]:
# Scale the numeric variables.
num_df = standardization(num_df)

In [None]:
num_df.head()

In [None]:
kat_df.head()

In [None]:
# Join the numeric and categorical variables before building the models
gecici_df = pd.concat([num_df,kat_df], axis=1)

In [None]:
gecici_df.head()

#### Dengesiz Veri Problemi İnceleme

In [None]:
# What would happen if the imbalance were ignored? A copy is kept to examine that further below.
gecici_df_copy = gecici_df.copy()

In [None]:
# Is there a class-imbalance problem? There is.
show_classification_ratio(gecici_df)

In [None]:
#Dengesiz Veri Problemi Çözümü 
X,y = handle_imbalanced_data(gecici_df)

In [None]:
print(X.shape)
print(y.shape)

In [None]:
#dengesiz veri problemi çözülmüş mü? Çözülmüş.
gecici_df = pd.concat([X,y], axis = 1)
show_classification_ratio(gecici_df)

#### 5). Model Kurma

In [None]:
X.dtypes

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=12345)

#### Base Model

In [None]:
# Containers filled by the modelling helpers below:
# (name, estimator) pairs, base/tuned score tuples, and the model lists
# used for the feature-importance plots.
models = []
sonuc_ilkel_cv = []
sonuc_ilkel_acc = []
models_tuned = []
sonuc_tuned_cv= []
sonuc_tuned_acc = []
fi_models_base = []
fi_models_tuned = []

In [None]:
# Create the model objects and compute the base (untuned) scores
models, lr, knn, svm, cart, rf, lgbm, xgbm = create_model_object(models)

# What would happen if the imbalance were ignored? An unfitted clone is kept to examine that further below.
xgbm_copy = clone(xgbm)


sonuc_ilkel = calculate_base_cv_acc(models,X_train,y_train,X_test,y_test,sonuc_ilkel_cv, sonuc_ilkel_acc)
sonuc_ilkel

In [None]:
# Show the feature importances of the base models
fi_models_base.append(('LGBM',lgbm))
fi_models_base.append(('CART',cart))
fi_models_base.append(('RF',rf))
fi_models_base.append(('XGBM',xgbm))

draw_feat_imp(fi_models_base,X)

#### Model Tuning

In [None]:
# Set up the parameter grids for the models
lr_params, knn_params, svm_params, cart_params, rf_params, lgbm_params, xgbm_params = define_params()

#### XGBM

In [None]:
# Each call below grid-searches one model; compML_tuned also records the
# scores in sonuc_tuned_cv and sonuc_tuned_acc.
xgbm_tuned, y_pred_xgbm_tuned = compML_tuned(xgbm_params,xgbm, XGBClassifier,'XGBM', X_train, y_train, X_test, y_test)

#### LGBM

In [None]:
lgbm_tuned, y_pred_lgbm_tuned = compML_tuned(lgbm_params,lgbm,LGBMClassifier,'LGBM', X_train, y_train, X_test, y_test)

#### Random Forests

In [None]:
rf_tuned, y_pred_rf_tuned = compML_tuned(rf_params,rf,RandomForestClassifier,'RF', X_train, y_train, X_test, y_test)

#### KNN

In [None]:
knn_tuned, y_pred_knn_tuned = compML_tuned(knn_params,knn,KNeighborsClassifier,'KNN', X_train, y_train, X_test, y_test)

#### CART

In [None]:
cart_tuned, y_pred_cart_tuned = compML_tuned(cart_params,cart,DecisionTreeClassifier,'CART', X_train, y_train, X_test, y_test)

#### SVM

In [None]:
svm_tuned, y_pred_svm_tuned = compML_tuned(svm_params,svm,SVC,'SVM', X_train, y_train, X_test, y_test)

#### Logistic Regression

In [None]:
lr_tuned, y_pred_lr_tuned = compML_tuned(lr_params,lr,LogisticRegression,'LR', X_train, y_train, X_test, y_test)

In [None]:
# Show the feature importances of the tuned models
fi_models_tuned.append(('LGBM',lgbm_tuned))
fi_models_tuned.append(('CART',cart_tuned))
fi_models_tuned.append(('RF',rf_tuned))
fi_models_tuned.append(('XGBM',xgbm_tuned))

draw_feat_imp(fi_models_tuned,X)

In [None]:
# Table of the tuned CV scores and hold-out accuracies
sonuc_tuned = calculate_tuned_cv_acc(sonuc_tuned_cv, sonuc_tuned_acc)
sonuc_tuned

In [None]:
# Base and tuned results side by side
sonuc = calculate_result(sonuc_ilkel,sonuc_tuned)
sonuc

In [None]:
# Best parameters found for XGBM
sonuc["Best Params"]["XGBM"]

#### Dengesiz Veri Problemine Bakmasaydık XGBM için Confusion Matrix Nasıl Değişirdi?

In [None]:
# Features and target taken from the copy made before SMOTE was applied.
y_copy = gecici_df_copy["Exited"]
X_copy = gecici_df_copy.drop("Exited", axis = 1)

In [None]:
X_train_copy, X_test_copy, y_train_copy, y_test_copy = train_test_split(X_copy, y_copy, test_size=0.20, random_state = 12345)

In [None]:
# Create the model object and compute the base score.
# The new model is named XGBM_COPY; it is built without applying SMOTE to the data set.
# In the author's recorded run the base ACC score was 0.9051 for XGBM and dropped to 0.8609 for XGBM_COPY.
models_copy = []
sonuc_ilkel_cv_copy = []
sonuc_ilkel_acc_copy = []
models_copy.append(('XGBM_COPY', xgbm_copy))
sonuc_ilkel_copy = calculate_base_cv_acc(models_copy,X_train_copy,y_train_copy,X_test_copy,y_test_copy,sonuc_ilkel_cv_copy,sonuc_ilkel_acc_copy)
sonuc_ilkel_copy

In [None]:
# Model tuning
xgbm_tuned_copy, y_pred_xgbm_tuned_copy = compML_tuned(xgbm_params,xgbm_copy, XGBClassifier,'XGBM_COPY', X_train_copy, y_train_copy, X_test_copy, y_test_copy)

In [None]:
# Confusion matrix when the class imbalance is ignored
conf_xgbm_copy = confusion_matrix(y_test_copy, y_pred_xgbm_tuned_copy)
conf_xgbm_copy

In [None]:
# Confusion matrix after SMOTE was used for the imbalanced data
conf_xgbm = confusion_matrix(y_test,y_pred_xgbm_tuned)
conf_xgbm