# 1. Подготовка данных

In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from  sklearn.utils import shuffle

scaler = StandardScaler()

In [2]:
df = pd.read_csv('/datasets/Churn.csv')
print(df.info())
print(df.describe())
df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
RowNumber          10000 non-null int64
CustomerId         10000 non-null int64
Surname            10000 non-null object
CreditScore        10000 non-null int64
Geography          10000 non-null object
Gender             10000 non-null object
Age                10000 non-null int64
Tenure             9091 non-null float64
Balance            10000 non-null float64
NumOfProducts      10000 non-null int64
HasCrCard          10000 non-null int64
IsActiveMember     10000 non-null int64
EstimatedSalary    10000 non-null float64
Exited             10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB
None
         RowNumber    CustomerId   CreditScore           Age       Tenure  \
count  10000.00000  1.000000e+04  10000.000000  10000.000000  9091.000000   
mean    5000.50000  1.569094e+07    650.528800     38.921800     4.997690   
std     2886.89568  7.1936

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
8573,8574,15794159,Highett,633,France,Female,26,8.0,124281.84,1,1,1,60116.57,0
4850,4851,15593094,Goddard,516,France,Male,27,9.0,0.0,1,1,0,142680.64,1
5899,5900,15793901,Capon,639,France,Female,27,2.0,0.0,2,0,0,125244.18,0
9712,9713,15701768,Tung,637,France,Male,32,3.0,0.0,2,1,1,197827.06,0
2994,2995,15710257,Matveyeva,625,France,Female,39,3.0,130786.92,1,0,1,121316.07,0


Столбец RowNumber явно лишний, уберем его. Фамилия и ID клиента тоже вряд ли будут полезны для модели (можно их оторвать на время работы с моделью и после прогноза присоединять обратно). С категориальными признаками справимся с помощью OHE, т.к. он не налагает ограничений на выбор алгоритма.<br>

Пропуски в столбце Tenure можно сбросить т.к. их всего около 10% или весь столбец, хотя как оценить насколько важен данный признак для прогноза. С какой-то стороны пол для данной задачи кажется совершенно неважным. А вот страна и кол-во недвижимости кажeтся более важными признаками. Лучше обучаться на данных, в которых уверен.<br>

Также потребуется масштабировать признаки.

In [3]:
df = df.drop(index=df[df['Tenure'].isnull()].index)
df_surname = df[['CustomerId', 'Surname']]

features = df.drop(['RowNumber', 'Surname', 'CustomerId', 'Exited'], axis=1)
target = df['Exited']

print(df['Geography'].value_counts())
print(df_surname.info(), '\n')
features.info()

True
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9091 entries, 0 to 9998
Data columns (total 2 columns):
CustomerId    9091 non-null int64
Surname       9091 non-null object
dtypes: int64(1), object(1)
memory usage: 213.1+ KB
None 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9091 entries, 0 to 9998
Data columns (total 10 columns):
CreditScore        9091 non-null int64
Geography          9091 non-null object
Gender             9091 non-null object
Age                9091 non-null int64
Tenure             9091 non-null float64
Balance            9091 non-null float64
NumOfProducts      9091 non-null int64
HasCrCard          9091 non-null int64
IsActiveMember     9091 non-null int64
EstimatedSalary    9091 non-null float64
dtypes: float64(3), int64(5), object(2)
memory usage: 781.3+ KB


In [4]:
features = pd.get_dummies(features, drop_first=True)
features.sample(5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
7248,605,57,2.0,0.0,3,1,0,66652.75,0,1,0
6001,775,30,5.0,0.0,1,1,0,193880.6,0,0,0
9079,516,27,1.0,0.0,1,0,1,112311.15,0,1,0
9446,544,30,4.0,73218.89,1,0,1,126796.69,0,0,1
4921,526,33,8.0,114634.63,2,1,0,110114.38,0,1,0
5825,607,62,8.0,108004.64,1,1,1,23386.77,0,1,0
5848,758,33,0.0,129142.54,2,1,1,26606.28,1,0,1
7168,507,33,7.0,0.0,2,1,1,85411.01,0,0,1
5012,575,40,5.0,0.0,2,1,1,122488.59,0,0,1
8879,809,43,2.0,0.0,2,1,1,132908.07,0,0,1


In [21]:
X_train_val, X_test, y_train_val, y_test = train_test_split(features, target, test_size=0.33, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

need_scale = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
scaler.fit(X_train[need_scale])
X_train[need_scale] = scaler.transform(X_train[need_scale])
X_val[need_scale] = scaler.transform(X_val[need_scale])

#при тестировании обучить scaler на X_train_val вместо X_train

X_train.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.p

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
count,4567.0,4567.0,4567.0,4567.0,4567.0,4567.0,4567.0,4567.0,4567.0,4567.0,4567.0
mean,-3.671734e-16,-3.9673400000000003e-17,1.244656e-16,5.873218000000001e-17,1.29133e-16,0.703963,0.508649,3.422803e-17,0.25312,0.2448,0.549814
std,1.000109,1.000109,1.000109,1.000109,1.000109,0.456557,0.49998,1.000109,0.434847,0.430015,0.497567
min,-3.084232,-2.003354,-1.716588,-1.247115,-0.878395,0.0,0.0,-1.737972,0.0,0.0,0.0
25%,-0.6855383,-0.6679238,-1.027932,-1.247115,-0.878395,0.0,0.0,-0.8417151,0.0,0.0,0.0
50%,0.01752705,-0.1909846,0.00505145,0.3304266,-0.878395,1.0,1.0,-0.01460033,0.0,0.0,1.0
75%,0.6792356,0.4767304,0.6937073,0.8102167,0.8462955,1.0,1.0,0.8507248,1.0,0.0,1.0
max,2.085366,4.673796,1.726691,2.798258,4.295676,1.0,1.0,1.746735,1.0,1.0,1.0


# 2. Исследование задачи

In [12]:
target.value_counts(normalize=True)

0    0.796062
1    0.203938
Name: Exited, dtype: float64

In [74]:
models = [RandomForestClassifier(n_estimators=100, random_state=42), 
          DecisionTreeClassifier(max_depth=3, random_state=42), 
          LogisticRegression(random_state=42, solver='liblinear')]

for model in models:
    model.fit(X_train, y_train)
    pred_val = model.predict(X_val)
    pred_prob = model.predict_proba(X_val)[:, 1]
    print(model)
    print('F1: ', f1_score(y_val, pred_val))
    print('AUC_ROC: ', roc_auc_score(y_val, pred_prob))
    print('accuracy: ', accuracy_score(y_val, pred_val), '\n')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
F1:  0.5555555555555556
AUC_ROC:  0.826543536201238
accuracy:  0.8634274458305975 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')
F1:  0.5269978401727862
AUC

При наблюдаемом дисбалансе классов метрика accuracy не может дать объективную оценку модели. Даже в самом худшем случае она стремится к значению вероятности самого частого класса. Что хоть и означает, что модель неслучайная, но говорит о явном переобучении.

# 3. Борьба с дисбалансом

In [69]:
models = [RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'), 
          DecisionTreeClassifier(max_depth=3, random_state=42, class_weight='balanced'), 
          LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')]

for model in models:
    model.fit(X_train, y_train)
    pred_val = model.predict(X_val)
    pred_prob = model.predict_proba(X_val)[:, 1]
    print(model)
    print('F1: ', f1_score(y_val, pred_val))
    print('AUC_ROC: ', roc_auc_score(y_val, pred_prob), '\n')

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)
F1:  0.5572354211663068
AUC_ROC:  0.8333801405317298 

DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')
F1:  0.4923857868020304
AUC_ROC:  0.77740974

Взвешивание классов очень положительно сказалось на логистической регрессии. Для остальных моделей этот вариант не показал желаемых результатов. Попробуем для них другие варианты устранения дисбаланса.

In [98]:
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(X_train, y_train)
pred_prob = model.predict_proba(X_val)[:, 1]

for threshold in np.arange(0, 0.55, 0.05):
    thres_val = pred_prob > threshold
    print('threshold={:.2f} - f1 - {}'.format(threshold, f1_score(y_val, thres_val)))

threshold=0.00 - f1 - 0.32545354590434306
threshold=0.05 - f1 - 0.35294117647058826
threshold=0.10 - f1 - 0.38990825688073394
threshold=0.15 - f1 - 0.4410058027079304
threshold=0.20 - f1 - 0.46722288438617404
threshold=0.25 - f1 - 0.47954866008462627
threshold=0.30 - f1 - 0.4713584288052373
threshold=0.35 - f1 - 0.4396396396396396
threshold=0.40 - f1 - 0.3864541832669323
threshold=0.45 - f1 - 0.34070796460176994
threshold=0.50 - f1 - 0.2973621103117506


Изменение порога дает некоторое улучшение, но в сравнении с взвешиванием, незначительное.

In [61]:
def upsample(features, target, repeat):
    
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]
    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    features_upsampled, target_upsampled = shuffle(features_upsampled, target_upsampled, random_state=42)
    
    return features_upsampled, target_upsampled



def downsample(features, target, fraction):
    
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]
    features_downsampled = pd.concat([features_zeros.sample(frac=fraction, random_state=42)] + [features_ones])
    target_downsampled = pd.concat([target_zeros.sample(frac=fraction, random_state=42)] + [target_ones])
    features_downsampled, target_downsampled = shuffle(features_downsampled, target_downsampled, random_state=42)
    
    return features_downsampled, target_downsampled

In [72]:
models = [RandomForestClassifier(n_estimators=100, random_state=42), 
          DecisionTreeClassifier(max_depth=3, random_state=42), 
          LogisticRegression(random_state=42, solver='liblinear')]

X_up_train, y_up_train = upsample(X_train, y_train, 10)

for model in models:
    model.fit(X_up_train, y_up_train)
    pred_val = model.predict(X_val)
    pred_prob = model.predict_proba(X_val)[:, 1]
    print(model)
    print('F1: ', f1_score(y_val, pred_val))
    print('AUC_ROC: ', roc_auc_score(y_val, pred_prob), '\n')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
F1:  0.5938697318007662
AUC_ROC:  0.8298406903235754 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')
F1:  0.4263877715205149
AUC_ROC:  0.7792049384347673 

L

In [75]:
X_dw_train, y_dw_train = downsample(X_train, y_train, 0.1)

for model in models:
    model.fit(X_dw_train, y_dw_train)
    pred_val = model.predict(X_val)
    pred_prob = model.predict_proba(X_val)[:, 1]
    print(model)
    print('F1: ', f1_score(y_val, pred_val))
    print('AUC_ROC: ', roc_auc_score(y_val, pred_prob), '\n')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
F1:  0.4495171202809483
AUC_ROC:  0.8167567016894645 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')
F1:  0.42088607594936706
AUC_ROC:  0.767318663406683 

L

Уменьшение кол-ва строк для леса и деревьев дало отрицательный результат, данных и так немного.<br>
Увеличение кол-ва строк положительно сказалось на случайном лесе, но дерево лучше срабатывало на несбалансированных данных.<br>
Логистическая регрессия получила значительные улучшения в качестве при использовании того или иного метода балансировки классов, но более всего от корректировки веса этих классов.<br>
осталось понять закономерно это или случайно и работает так только на этих данных.

А пока попробуем настроить Случайный Лес с увеличеной обучающей выборкой и Logit с взвешенными классами.

In [80]:
for d in [None, 12, 15, 18]: 
    for n in range(100, 1100, 100):
        
        rf_clf = RandomForestClassifier(max_depth=d, n_estimators=n, random_state=42)
        
        rf_clf.fit(X_up_train, y_up_train)
        pred_val = rf_clf.predict(X_val)
        pred_prob = rf_clf.predict_proba(X_val)[:, 1]
        print('D={} N={} F1: {}, AUC_ROC: {}'.format(d, n, f1_score(y_val, pred_val), roc_auc_score(y_val, pred_prob)))

D=None N=100 F1: 0.5938697318007662, AUC_ROC: 0.8298406903235754
D=None N=200 F1: 0.5875706214689266, AUC_ROC: 0.830183484217714
D=None N=300 F1: 0.5988700564971751, AUC_ROC: 0.8311526685609815
D=None N=400 F1: 0.5935727788279773, AUC_ROC: 0.831229762770105
D=None N=500 F1: 0.5909090909090909, AUC_ROC: 0.8308979823344126
D=None N=600 F1: 0.5893536121673004, AUC_ROC: 0.8324811669860571
D=None N=700 F1: 0.5920303605313093, AUC_ROC: 0.8332397189365405
D=12 N=100 F1: 0.6009104704097117, AUC_ROC: 0.8296328112954029
D=12 N=200 F1: 0.5975609756097561, AUC_ROC: 0.8290848917377035
D=12 N=300 F1: 0.6033690658499234, AUC_ROC: 0.830431287032754
D=12 N=400 F1: 0.6036585365853658, AUC_ROC: 0.8304367937619772
D=12 N=500 F1: 0.6021505376344086, AUC_ROC: 0.8297209189629727
D=12 N=600 F1: 0.6033690658499234, AUC_ROC: 0.8300733496332517
D=12 N=700 F1: 0.6042944785276074, AUC_ROC: 0.830475340866539
D=15 N=100 F1: 0.5884413309982488, AUC_ROC: 0.8300210357056322
D=15 N=200 F1: 0.5961871750433275, AUC_ROC: 0

D=12 N=700

In [88]:
for p in ['l1', 'l2']:
    for c in [1, 0.1, 0.01]:
            
        logit_clf = LogisticRegression(C=c, penalty=p, random_state=42, solver='liblinear', class_weight='balanced')
            
        logit_clf.fit(X_train, y_train)
        pred_val = logit_clf.predict(X_val)
        pred_prob = logit_clf.predict_proba(X_val)[:, 1]
            
        print('Penalty={} C={} F1: {}, AUC_ROC: {}'.
              format(p, c, f1_score(y_val, pred_val), roc_auc_score(y_val, pred_prob)))

Penalty=l1 C=1 F1: 0.4794007490636705, AUC_ROC: 0.7483672547853475
Penalty=l1 C=0.1 F1: 0.48415716096324457, AUC_ROC: 0.7480175774796801
Penalty=l1 C=0.01 F1: 0.48129675810473815, AUC_ROC: 0.7441532302473622
Penalty=l2 C=1 F1: 0.47880299251870323, AUC_ROC: 0.7483947884314632
Penalty=l2 C=0.1 F1: 0.4811083123425693, AUC_ROC: 0.7487967796647504
Penalty=l2 C=0.01 F1: 0.4897435897435898, AUC_ROC: 0.7504130046917333


Penalty=l2 C=0.01

Но Случайный лес лучше.

# 4. Тестирование модели

In [99]:
rf_clf = RandomForestClassifier(max_depth=12, n_estimators=700, random_state=42)

scaler.fit(X_train_val[need_scale])
X_train_val[need_scale] = scaler.transform(X_train_val[need_scale])
X_test[need_scale] = scaler.transform(X_test[need_scale])

X_up_train, y_up_train = upsample(X_train_val, y_train_val, 10)
rf_clf.fit(X_up_train, y_up_train)

pred_test = rf_clf.predict(X_test)
pred_prob = rf_clf.predict_proba(X_test)[:, 1]
print('F1: {}, AUC_ROC: {}'.format(f1_score(y_test, pred_test), roc_auc_score(y_test, pred_prob)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the 

F1: 0.5935754189944135, AUC_ROC: 0.8475595760203956
