# Отток клиентов

Нужно спрогнозировать, уйдёт клиент из банка в ближайшее время или нет по предоставленым историческим данным о поведении клиентов и расторжении договоров с банком. 

Нужно довести метрику *F1*-меру до 0.59. 


Источник данных: [https://www.kaggle.com/barelydedicated/bank-customer-churn-modeling](https://www.kaggle.com/barelydedicated/bank-customer-churn-modeling)

## Подготовка данных

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [37]:
pd.options.mode.chained_assignment = None


In [38]:
df = pd.read_csv('/datasets/Churn.csv')

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             9091 non-null float64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


Первое, что доставит нам много неудобств это заголовки с заглавными буквами, приведем все заголовки к нижнему регистру. И заполним пропуски в колонке tenure.

In [40]:
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,rownumber,customerid,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [41]:
df['tenure'] = df['tenure'].fillna(df['tenure'].median())

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
rownumber          10000 non-null int64
customerid         10000 non-null int64
surname            10000 non-null object
creditscore        10000 non-null int64
geography          10000 non-null object
gender             10000 non-null object
age                10000 non-null int64
tenure             10000 non-null float64
balance            10000 non-null float64
numofproducts      10000 non-null int64
hascrcard          10000 non-null int64
isactivemember     10000 non-null int64
estimatedsalary    10000 non-null float64
exited             10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [43]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rownumber,10000.0,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
customerid,10000.0,15690940.0,71936.186123,15565701.0,15628528.25,15690740.0,15753230.0,15815690.0
creditscore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
tenure,10000.0,4.9979,2.76001,0.0,3.0,5.0,7.0,10.0
balance,10000.0,76485.89,62397.405202,0.0,0.0,97198.54,127644.2,250898.09
numofproducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
hascrcard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
isactivemember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
estimatedsalary,10000.0,100090.2,57510.492818,11.58,51002.11,100193.9,149388.2,199992.48


In [44]:
df.corr()

Unnamed: 0,rownumber,customerid,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
rownumber,1.0,0.004202,0.00584,0.000783,-0.006979,-0.009067,0.007246,0.000599,0.012044,-0.005988,-0.016571
customerid,0.004202,1.0,0.005308,0.009497,-0.020331,-0.012419,0.016972,-0.014025,0.001665,0.015271,-0.006248
creditscore,0.00584,0.005308,1.0,-0.003965,-6e-05,0.006268,0.012238,-0.005458,0.025651,-0.001384,-0.027094
age,0.000783,0.009497,-0.003965,1.0,-0.012606,0.028308,-0.03068,-0.011721,0.085472,-0.007201,0.285323
tenure,-0.006979,-0.020331,-6e-05,-0.012606,1.0,-0.007535,0.011409,0.025979,-0.030681,0.010049,-0.015989
balance,-0.009067,-0.012419,0.006268,0.028308,-0.007535,1.0,-0.30418,-0.014858,-0.010084,0.012797,0.118533
numofproducts,0.007246,0.016972,0.012238,-0.03068,0.011409,-0.30418,1.0,0.003183,0.009612,0.014204,-0.04782
hascrcard,0.000599,-0.014025,-0.005458,-0.011721,0.025979,-0.014858,0.003183,1.0,-0.011866,-0.009933,-0.007138
isactivemember,0.012044,0.001665,0.025651,0.085472,-0.030681,-0.010084,0.009612,-0.011866,1.0,-0.011421,-0.156128
estimatedsalary,-0.005988,0.015271,-0.001384,-0.007201,0.010049,0.012797,0.014204,-0.009933,-0.011421,1.0,0.012097


In [45]:
df['geography'].value_counts()

France     5014
Germany    2509
Spain      2477
Name: geography, dtype: int64

In [46]:
df=df.drop(['surname','rownumber','customerid'],1)
df_ohe = pd.get_dummies(df, drop_first=True)
df_ohe.head()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited,geography_Germany,geography_Spain,gender_Male
0,619,42,2.0,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1.0,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8.0,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1.0,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2.0,125510.82,1,1,1,79084.1,0,0,1,0


Познакомились и загрузили данные для модели. Проверели их взаимосвязь и значения, заменили пропуски вданных.Удалили колонку с фамилией, так она не несет информации об уходе клиента. Можем приступать к задачам модели.Провели прямое кодирование для категориальных колонок.

## Исследование задачи

In [47]:
features = df_ohe.drop('exited', axis=1)
target = df_ohe['exited']

In [48]:
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.4, random_state=12345)
features_test, features_valid, target_test, target_valid = train_test_split(
   features_valid, target_valid, test_size=0.5, random_state=12345)
print('Валидационная выборка',len(features_valid))
print('Тестовая выборка',len(features_test))
print('обучающая выборка',len(features_train))

Валидационная выборка 2000
Тестовая выборка 2000
обучающая выборка 6000


Показали модели признаки и цель обучения. Разбили датасет на обучающую, валидационную и тестовую выборку.

In [49]:
numeric = ['creditscore','age','tenure','balance','numofproducts','estimatedsalary']
scaler = StandardScaler()
scaler.fit(features_train[numeric])
features_train[numeric] = scaler.transform(features_train[numeric])
features_valid[numeric] = scaler.transform(features_valid[numeric])
features_test[numeric] = scaler.transform(features_test[numeric])

У данных разных масштаб, проведем стандратизацию признаков.

In [50]:
model = LogisticRegression(random_state=12345,solver='liblinear')
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
print(f1_score(target_valid,predicted_valid))

0.27478260869565213


In [51]:
model.score(features_valid,target_valid)

0.7915

In [52]:
model = DecisionTreeClassifier(random_state=12345)
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
print(f1_score(target_valid,predicted_valid))

0.4760736196319018


In [53]:
model.score(features_valid,target_valid)

0.7865

Получення модель не подходит по метрике f1, она слишком малаа, посмотрим насколько построенная модель хуже случайной на метрике roc-auc и перейдем к балансу классов.

In [54]:
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

roc_auc_score(target_valid,probabilities_one_valid)

0.666536995312343

## Борьба с дисбалансом

In [55]:
model = LogisticRegression(random_state=12345,solver='liblinear',class_weight='balanced')
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
f1_score(target_valid,predicted_valid)

0.4788245462402766

In [56]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345)
    
    return features_upsampled, target_upsampled

features_upsampled, target_upsampled = upsample(features_train, target_train, 5)
model = LogisticRegression(random_state=12345, solver='liblinear') 
model.fit(features_upsampled, target_upsampled)
predicted_valid =model.predict(features_valid)
f1_score(target_valid,predicted_valid)

0.4834996162701458

In [57]:
model = DecisionTreeClassifier(random_state=12345,class_weight='balanced')
model.fit(features_train, target_train)
predicted_valid = model.predict(features_valid)
f1_score(target_valid,predicted_valid)

0.48284313725490197

In [58]:
features_upsampled, target_upsampled = upsample(features_train, target_train, 10)
model = DecisionTreeClassifier(random_state=12345)
model.fit(features_upsampled, target_upsampled)
predicted_valid =model.predict(features_valid)
f1_score(target_valid,predicted_valid)

0.4944099378881988

In [59]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    
    return features_downsampled, target_downsampled

features_downsampled, target_downsampled = downsample(features_train, target_train, 0.2)
model = LogisticRegression(random_state=12345, solver='liblinear') 
model.fit(features_downsampled, target_downsampled)
predicted_valid =model.predict(features_valid)
f1_score(target_valid, predicted_valid)

0.4820747520976354

In [60]:
features_upsampled, target_upsampled = upsample(features_train, target_train,7)
best_model = None
best_result = 0
for depth in range(1,30):
    model = DecisionTreeClassifier(random_state=12345,max_depth = depth)
    model.fit(features_upsampled, target_upsampled)
    predicted_valid =model.predict(features_valid)
    result = f1_score(target_valid,predicted_valid)
    if result > best_result:
        best_model = model
        best_result = result
        depth_best = depth
print('F1:',best_result)
print('depth',depth_best)

F1: 0.5746388443017656
depth 5


In [61]:
features_upsampled, target_upsampled = upsample(features_train, target_train, 7)
best_model = None
best_result = 0
for est in range(1, 15):
    for depth in range(1,20):
        model = RandomForestClassifier(random_state=12345, n_estimators= est,max_depth = depth)
        model.fit(features_upsampled, target_upsampled)
        predicted_valid =model.predict(features_valid)
        result = f1_score(target_valid,predicted_valid) 
        if result > best_result:
            best_model =  model
            best_result = result
            best = est
            max_depth = depth
model.fit(features_upsampled, target_upsampled)
predicted_valid =model.predict(features_valid)
print('F1:',best_result)
print('depth',depth_best)
print('est',best)

F1: 0.5926748057713651
depth 5
est 13


Подобрав метод борьбы с дисбалансом и гиперпараметры модели удалось достичь требуемого значения метрики F1. 

## Тестирование модели

Првоерим значения метрики на тестовой выборке, подготовленной из датасета на первых шагах работы.

In [62]:
predicted =model.predict(features_test)
print('F1:',f1_score(target_test,predicted))
print('Accuracy',accuracy_score(target_test,predicted))

F1: 0.582010582010582
Accuracy 0.842


In [63]:
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

roc_auc_score(target_valid,probabilities_one_valid)

0.8176985658198301

In [64]:
probabilities_test = model.predict_proba(features_test)
probabilities_one_test = probabilities_test[:, 1]

roc_auc_score(target_test,probabilities_one_test)

0.8073709011063459

Провели тестирование подготовленной модели, получили достточно хорошие показатели метрик.

## Вывод

Провери преварительный анализ данных, подготовили данные к анализу, провели прямое кодирование ,и поиск модели для требуемого значения метрики. Все эти действия позволили спрогнозировать отток дааных клиентов банка на основе представленнего датасета.После построение модели проверили такую метрику как roc-auc, данная метрика показывает, что модель лучше рандомных значений, также значение точности на тестовой выборке показало хороший результат.
