# Отток клиентов

# 1. Подготовка данных

In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle


data = pd.read_csv('/datasets/Churn.csv')

print(data.info())
data['Tenure'] = data['Tenure'].fillna(value=0) # меняем пропуски на 0

# дропнем фамилию, индекс и айди - не нужны, нельзя рассчитать уход по ним
data = data.drop(['Surname', 'RowNumber', 'CustomerId'], axis = 1)
# gender & geography переведем к числ.знач. прямым кодированием
data = pd.get_dummies(data, drop_first=True)

print(data.info())



#делим на выборки
data_valid_train, data_test = train_test_split(data, test_size=0.2, random_state=12345)
data_train, data_valid = train_test_split(data_valid_train, test_size=0.25, random_state=12345)



# признаки, целевые признаки 
features_train = data_train.drop(['Exited'], axis=1)
target_train = data_train['Exited']
features_test = data_test.drop(['Exited'], axis=1)
target_test = data_test['Exited']
features_valid = data_valid.drop(['Exited'], axis=1)
target_valid = data_valid['Exited']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             9091 non-null float64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
CreditScore          10000 non-null int64
Age                  10000 non-null int64
Tenure               10000 non-null float64
Balance              

Пропуски в столбце Tenure меняем на 0, тк при отсутствии информации считаем, что Tenure=0.
Столбцы с фамилией, индексом и айди убираем, по ним нельзя определить, уйдет клиент или нет.
Столбцы Gender, Geography приведены к численным значениям с помощью прямого кодирования.
Убраны лишние столбцы,  чтобы избежать дамми-ловушек.
Полученные данные разбиты на выборки в пропорции 6:2:2.


# 2. Исследование задачи

In [3]:

model11 = DecisionTreeClassifier(random_state=12345)
model11.fit(features_train, target_train)
train_predictions11 = model11.predict(features_valid)
probabilities_valid11 = model11.predict_proba(features_valid)
probabilities_one_valid11 = probabilities_valid11[:, 1]
print(confusion_matrix(target_valid, train_predictions11))
print()


[[1384  225]
 [ 196  195]]



Для исследования была взята модель случайного дерева с гиперпараметрами по умолчанию. Уровень ложноположительных\отрицательных находится примерно такой же как уровень истиннно положительных. необходимо сбалансировать классы.


# 3. Борьба с дисбалансом

In [5]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345)
    
    return features_upsampled, target_upsampled

features_upsampled, target_upsampled = upsample(features_train, target_train, 10)

model1u = DecisionTreeClassifier(random_state=12345)
model1u.fit(features_upsampled, target_upsampled)
predicted_valid1u = model1u.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid1u))

model2u = RandomForestClassifier(random_state=12345)
model2u.fit(features_upsampled, target_upsampled)
predicted_valid2u = model2u.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid2u))


model3u=LogisticRegression(random_state=12345, solver='liblinear')
model3u.fit(features_upsampled, target_upsampled)
predicted_valid3u = model3u.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid3u))

F1: 0.4704336399474376




F1: 0.5492957746478873
F1: 0.3527239150507849


Значения f1 при применении upsample для моделей с гиперпараметрами по умолчанию.

In [6]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    
    return features_downsampled, target_downsampled

features_downsampled, target_downsampled = downsample(features_train, target_train, 0.1)


model1d = DecisionTreeClassifier(random_state=12345)
model1d.fit(features_downsampled, target_downsampled)
predicted_valid1d = model1d.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid1d))

model2d = RandomForestClassifier(random_state=12345)
model2d.fit(features_downsampled, target_downsampled)
predicted_valid2d = model2d.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid2d))


model3d=LogisticRegression(random_state=12345, solver='liblinear')
model3d.fit(features_downsampled, target_downsampled)
predicted_valid3d = model3d.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid3d))

F1: 0.42985487214927437
F1: 0.46464646464646475
F1: 0.3417890520694259




Значения f1 при применении downsample для моделей с гиперпараметрами по умолчанию.

In [7]:
model1b = DecisionTreeClassifier(random_state=12345,class_weight='balanced')
model1b.fit(features_train, target_train)
predicted_valid1b = model1b.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid1b))

model2b = RandomForestClassifier(random_state=12345,class_weight='balanced')
model2b.fit(features_train, target_train)
predicted_valid2b = model2b.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid2b))


model3b=LogisticRegression(random_state=12345, class_weight='balanced')
model3b.fit(features_train, target_train)
predicted_valid3b = model3b.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid3b))

F1: 0.46293888166449937
F1: 0.5354838709677419
F1: 0.4754521963824289




Значения f1 при применении балансировки классов для моделей с гиперпараметрами по умолчанию.

# Вывод: наибольшее значение f1 у сл. леса и метода upsample. Используем их.


In [8]:
#Настраиваем upsample 
for i in range(1, 16):
    features_upsampled, target_upsampled = upsample(features_train, target_train, i)
    model2up = RandomForestClassifier(random_state=12345)
    model2up.fit(features_upsampled, target_upsampled)
    predicted_valid2up = model2up.predict(features_valid)
    print('i=',i, "F1:", f1_score(target_valid, predicted_valid2up))




i= 1 F1: 0.5445859872611465
i= 2 F1: 0.552437223042836




i= 3 F1: 0.5833333333333334




i= 4 F1: 0.5459770114942529




i= 5 F1: 0.559322033898305




i= 6 F1: 0.5542857142857143




i= 7 F1: 0.5438848920863308




i= 8 F1: 0.556497175141243




i= 9 F1: 0.5459770114942529




i= 10 F1: 0.5492957746478873




i= 11 F1: 0.5470085470085471




i= 12 F1: 0.5828571428571429




i= 13 F1: 0.5374449339207048




i= 14 F1: 0.5415472779369628




i= 15 F1: 0.5500705218617771


Максимальное значение f1 достигается, если повторяем процедуру 3 раза

In [9]:
#Настраиваем случайный лес, где repeat = 3

features_upsampled, target_upsampled = upsample(features_train, target_train, 3)

for estimators in range(10, 101, 10):
    for depth in range(1,11,1):
        model22 = RandomForestClassifier(n_estimators=estimators, max_depth=depth, random_state=12345, class_weight='balanced')
        model22.fit(features_upsampled, target_upsampled)
        train_predictions22 = model22.predict(features_valid)
        probabilities_valid22 = model22.predict_proba(features_valid)
        probabilities_one_valid22 = probabilities_valid22[:, 1]
        print("n_estimators =", estimators,"depth =",depth,'f1:',f1_score(target_valid, train_predictions22), 'auc-roc:', roc_auc_score(target_valid,probabilities_one_valid22))
    


n_estimators = 10 depth = 1 f1: 0.4407582938388625 auc-roc: 0.7528822051154075
n_estimators = 10 depth = 2 f1: 0.535031847133758 auc-roc: 0.8075840977621085
n_estimators = 10 depth = 3 f1: 0.5263157894736843 auc-roc: 0.8163105867093506
n_estimators = 10 depth = 4 f1: 0.5633187772925764 auc-roc: 0.8263587651938663
n_estimators = 10 depth = 5 f1: 0.5576923076923077 auc-roc: 0.8366493461491388
n_estimators = 10 depth = 6 f1: 0.5634095634095635 auc-roc: 0.841013385384959
n_estimators = 10 depth = 7 f1: 0.5611814345991561 auc-roc: 0.8404506937479237
n_estimators = 10 depth = 8 f1: 0.5829694323144105 auc-roc: 0.8437910792711714
n_estimators = 10 depth = 9 f1: 0.5711111111111112 auc-roc: 0.8401669636428084
n_estimators = 10 depth = 10 f1: 0.5694444444444444 auc-roc: 0.8330729162527282
n_estimators = 20 depth = 1 f1: 0.5206349206349206 auc-roc: 0.7846520292663233
n_estimators = 20 depth = 2 f1: 0.5113207547169812 auc-roc: 0.7999567649363634
n_estimators = 20 depth = 3 f1: 0.5192127460168696 au

Макс значение при n_estimators = 100 depth = 9 f1: 0.593258426966292 auc-roc: 0.8533059723200223.
Используем для финальной модели.

# 4. Тестирование модели

In [10]:
model = RandomForestClassifier(n_estimators=100,max_depth=9, random_state=12345)
model.fit(features_upsampled, target_upsampled)
test_predictions = model.predict(features_test)
probabilities_valid = model.predict_proba(features_test)
probabilities_one_valid = probabilities_valid[:, 1]
print('f1:',f1_score(target_test, test_predictions), 'auc-roc:', roc_auc_score(target_test,probabilities_one_valid))


f1: 0.6416382252559726 auc-roc: 0.8679919186625594
