In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from catboost import CatBoostClassifier
from catboost import Pool, cv

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve, log_loss
from sklearn.model_selection import GridSearchCV

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Notebook-wide configuration constants.

# Path of the input dataset.
CSV_FILE = "data.csv"

# Flag columns that get label-encoded (False -> 0, True -> 1, missing -> 2).
CAT_ENCODE = [
    'is_phone_bad',
    'is_email_bad',
    'is_first_paymant_by_paypal',
    'is_ip_bad',
]

# Columns handed to CatBoost as categorical features.
cat_features = [
    'is_phone_bad',
    'is_email_bad',
    'is_ip_bad',
    'is_first_paymant_by_paypal',
    'locale',
    'currency',
    'dont_send_docs',
    'is_inn_exist',
    'is_vkontakte_id_exist',
    'news_subscribe',
    'is_export_to_1c',
    'is_bik_exist',
]

In [None]:
#print(tqdm.__version__)

# Первичное изучение данных

In [None]:
# Load the raw dataset from the path configured in CSV_FILE and display it.
data = pd.read_csv(CSV_FILE)
data

In [None]:
# Print the frame dimensions, then show the number of missing values per column.
print('shape of data:', data.shape)
data.isna().sum(axis=0)

In [None]:
# Remove exact duplicate rows, then drop `postcountry`: that column is
# (almost) entirely unfilled, so it carries no usable signal.
data = data.drop_duplicates().drop(columns=['postcountry'])

In [None]:
# Dimensions of the frame at this point of the notebook.
data.shape

In [None]:
# Rows whose raw `score` is below 50 (the score is only rescaled by 1/100 further down).
data[data['score'] < 50]

In [None]:
# Number of distinct `country` values; `unique()` keeps NaN, so a missing value counts as one.
len(data['country'].unique())

In [None]:
# Row count per `country` value, most frequent first (missing values are not counted).
data['country'].value_counts()

In [None]:
# Row count per `email_domain` value, most frequent first (missing values are not counted).
data['email_domain'].value_counts()

# Предварительная обработка данных

In [None]:
# Missing values per column before any encoding is applied.
data.isna().sum(axis=0)

### is_phone_bad, is_email_bad, is_first_paymant_by_paypal, is_ip_bad - предварительное кодирование метками
None - 2     
True - 1   
False - 0

In [None]:
# Label-encode the tri-state flag columns: False -> 0, True -> 1, missing -> 2.
# The result is assigned back to the column rather than calling
# `data[col].replace(..., inplace=True)`: that chained in-place call operates on a
# temporary Series and does not update the DataFrame under pandas Copy-on-Write.
for column in CAT_ENCODE:
    data[column] = (
        data[column]
        .replace({False: 0, True: 1})
        .fillna(value=2)   # 2 marks "value was missing"
        .astype(int)
    )

In [None]:
# Distribution of the encoded flag (0 = False, 1 = True, 2 = missing).
data['is_phone_bad'].value_counts()

### country - пока проще дропнуть из-за смешанного англ. и русского + все равно нет нормальной корреляции

In [None]:
# `country` is removed from the feature set entirely.
data = data.drop(columns=['country'])

### email_domain  - частотное кодирование

In [None]:
# Frequency encoding: each e-mail domain is replaced by the share of all rows
# (missing domains included in the denominator) that carry that domain.
rows_per_domain = data.groupby('email_domain').size()
fe = rows_per_domain / len(data)
data['email_domain'] = data['email_domain'].map(fe)
data

In [None]:
# Missing values per column after the encoding steps above.
data.isna().sum(axis=0)

# Изучение экспертного скоринга

In [None]:
# Divide the expert score by 100 and show how often each value occurs.
# Note: re-running this cell divides by 100 again.
data['score'] = data['score'] / 100
data['score'].value_counts()

In [None]:
# Number of rows for each value of the `blocked` column.
data['blocked'].value_counts()

In [None]:
# Expert score above 0.9: bar chart of how many rows per score value are blocked vs. not blocked.
data_score = data[data['score'] > 0.9]
not_blocked = data_score.loc[data_score['blocked'] == 0, 'score'].value_counts()
blocked = data_score.loc[data_score['blocked'] == 1, 'score'].value_counts()
df = pd.DataFrame({'not blocked': not_blocked, 'blocked': blocked})
df.plot(kind='bar', figsize=(15, 15))
plt.title('Экспертный скоринг')
plt.show()

In [None]:
# Expert score below 1: bar chart of how many rows per score value are blocked vs. not blocked.
data_score = data[data['score'] < 1]
not_blocked = data_score.loc[data_score['blocked'] == 0, 'score'].value_counts()
blocked = data_score.loc[data_score['blocked'] == 1, 'score'].value_counts()
df = pd.DataFrame({'not blocked': not_blocked, 'blocked': blocked})
df.plot(kind='bar', figsize=(15, 15))
plt.title('Экспертный скоринг')
plt.show()

In [None]:
# Pairwise correlation of the numeric and boolean columns only.
# Non-numeric columns are excluded explicitly: since pandas 2.0 `DataFrame.corr()`
# no longer drops them silently and raises when it meets string data
# (NOTE(review): `locale` / `currency` are presumably string columns — confirm).
data.select_dtypes(include=['number', 'bool']).corr()

# Делим выборку на test и train

In [None]:
# Features: every column except the first one and the last two (selected by position).
# NOTE(review): presumably this skips an id column plus `score` and `blocked` —
# confirm against the actual column order of the CSV.
X = data.iloc[:, 1:-2]
# Target: the last column as a NumPy array (presumably `blocked` — confirm).
y = np.array(data.iloc[:, -1])

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state (same value the
# KFold splitter in this notebook uses) keeps the split, and every metric derived
# from it, identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
X_train

# Градиентный бустинг с catboost

In [None]:
# 5-fold cross-validation of a 150-iteration CatBoost Logloss model on the full
# feature matrix, tracking AUC and Accuracy as additional metrics.
cv_dataset = Pool(data=X,
                  label=y,
                  cat_features=cat_features)

# Tree depth is not set here, so CatBoost's default applies.
params = {"iterations": 150,
          "loss_function": "Logloss",
          "verbose": False,
          "custom_loss": ['AUC', 'Accuracy'],
          "roc_file": "roc-file"}

# `plot` is a boolean flag; the previous string "True" only worked because any
# non-empty string is truthy (so "False" would have enabled plotting as well).
scores = cv(cv_dataset,
            params,
            fold_count=5,
            plot=True)

In [None]:
# CatBoost classifier: 150 boosting iterations, Logloss objective, with AUC and
# Accuracy tracked as extra metrics.
# use_best_model=True relies on an eval_set being supplied to fit(); the model is
# then presumably truncated to the iteration that scores best on that set.
model = CatBoostClassifier(
#    thread_count = -1,
   iterations=150,
   custom_loss=['AUC', 'Accuracy'],
   loss_function="Logloss",
   use_best_model=True
)

In [None]:
# Train on the training split, treating the columns in `cat_features` as categorical.
# NOTE(review): X_test/y_test is used as the eval_set here and is also the data the
# score analysis below is computed on; with use_best_model=True the model choice has
# seen those rows, so a separate validation split may be preferable — confirm.
model.fit(
   X_train, y_train,
   cat_features=cat_features,
   eval_set=(X_test, y_test),
   plot=True
)

# Исследование результатов GB

In [None]:
# Probability of the first class (column 0 of predict_proba) for every test row,
# snapped to the grid 0, 0.1, ..., 1. A value moves up to the next grid point only
# when it is strictly greater than the x.x5 edge, so the bucket index equals the
# number of edges the probability exceeds.
first_class_proba = model.predict_proba(X_test)[:, 0]

bucket_edges = np.array([0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95])
bucket_values = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
bucket_index = (first_class_proba[:, None] > bucket_edges).sum(axis=1)

# Column 0 holds the bucketed score; `blocked` holds the true label of the same row.
data_score_proba = pd.DataFrame(bucket_values[bucket_index])
data_score_proba['blocked'] = y_test
data_score_proba

In [None]:
# Gradient-boosting score above 0.99: blocked vs. not blocked row counts per score value.
data_score = data_score_proba[data_score_proba[0] > 0.99]
not_blocked = data_score.loc[data_score['blocked'] == 0, 0].value_counts()
blocked = data_score.loc[data_score['blocked'] == 1, 0].value_counts()
df = pd.DataFrame({'not blocked': not_blocked, 'blocked': blocked})
df.plot(kind='bar', figsize=(15, 15))
plt.title('GB скоринг')
plt.show()

In [None]:
# Gradient-boosting score below 1: blocked vs. not blocked row counts per score value.
data_score = data_score_proba[data_score_proba[0] < 1]
not_blocked = data_score.loc[data_score['blocked'] == 0, 0].value_counts()
blocked = data_score.loc[data_score['blocked'] == 1, 0].value_counts()
df = pd.DataFrame({'not blocked': not_blocked, 'blocked': blocked})
df.plot(kind='bar', figsize=(15, 15))
plt.title('GB скоринг')
plt.show()

# Кодирование категориальных признаков для Random forest

In [None]:
# Start again from the raw CSV for the random-forest pipeline: remove duplicate
# rows and drop `postcountry` (almost entirely unfilled) together with `country`.
data = (
    pd.read_csv(CSV_FILE)
    .drop_duplicates()
    .drop(columns=['postcountry', 'country'])
)
data

In [None]:
# Frequency encoding: each e-mail domain is replaced by the share of all rows
# (missing domains included in the denominator) that carry that domain.
rows_per_domain = data.groupby('email_domain').size()
fe = rows_per_domain / len(data)
data['email_domain'] = data['email_domain'].map(fe)
data

In [None]:
# Features: every column except the first one and the last two (selected by position).
# NOTE(review): presumably this skips an id column plus `score` and `blocked` —
# confirm against the actual column order of the CSV.
X = data.iloc[:, 1:-2]
# Target: the last column as a NumPy array (presumably `blocked` — confirm).
y = np.array(data.iloc[:, -1])

In [None]:
# One-hot encode the categorical columns; dummy_na=True adds an explicit
# "missing" indicator column for each of them.
X = pd.get_dummies(X, dummy_na=True)
# Hold out 20% of the rows for evaluation. The fixed random_state (same value the
# KFold splitter in this notebook uses) keeps the split, and every metric derived
# from it, identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
X_train

In [None]:
# Missing values per column of the one-hot encoded feature matrix.
X.isna().sum(axis=0)

In [None]:
scaler = StandardScaler()
# Standardise the full feature matrix (statistics taken from all rows); X becomes a float NumPy array.
X = scaler.fit_transform(np.array(X, dtype = float))
# Re-fit the same scaler on the training rows only; this replaces the statistics learned on the line above.
X_train = scaler.fit_transform(np.array(X_train, dtype = float))
# Apply the training-set statistics to the held-out rows.
X_test = scaler.transform(np.array(X_test, dtype=float))
X_train

# Random forest с sklearn

In [None]:
# Cross-validated ROC-AUC of a random forest as a function of the number of trees.
#
# For every fold one forest with MAX_TREES trees is trained. Each tree's class-1
# probability on the validation rows is computed once, and the running mean over the
# first k trees gives the prediction of a k-tree forest, because a forest's
# predict_proba is the average of its trees' probabilities. This replaces the earlier
# approach of building unfitted RandomForestClassifier objects and hand-setting their
# fitted attributes (`n_classes_`, `estimators_`), which depended on scikit-learn
# internals and re-evaluated the same trees for every k.
# The target is assumed to be binary, as the original hard-coded two classes.
MAX_TREES = 150

values = np.arange(MAX_TREES) + 1   # 1, 2, ..., MAX_TREES
kf = KFold(n_splits=5, shuffle=True, random_state=1234)
global_scores = []

for train_indices, val_indices in tqdm(kf.split(X_train), total=kf.get_n_splits()):
    X_train_kf = X_train[train_indices]
    y_train_kf = y_train[train_indices]

    X_val_kf = X_train[val_indices]
    y_val_kf = y_train[val_indices]

    # Seeded so the quality curve is reproducible between runs.
    forest = RandomForestClassifier(n_estimators=MAX_TREES, random_state=1234)
    forest.fit(X_train_kf, y_train_kf)
    trees = forest.estimators_

    # Row k-1 of `running_mean` is the class-1 probability of the forest made of the first k trees.
    tree_probas = np.stack([tree.predict_proba(X_val_kf)[:, 1] for tree in trees], axis=0)
    running_mean = np.cumsum(tree_probas, axis=0) / values[:, None]

    scores = np.array([roc_auc_score(y_val_kf, proba) for proba in running_mean])
    global_scores.append(scores)

# Shape: (number of folds, MAX_TREES).
global_scores = np.stack(global_scores, axis=0)

In [None]:
# Mean ROC-AUC across folds for every forest size, with a +/- 2 standard deviation band.
mean_cross_val_score = global_scores.mean(axis=0)
std_cross_val_score = global_scores.std(axis=0)
band_low = mean_cross_val_score - 2 * std_cross_val_score
band_high = mean_cross_val_score + 2 * std_cross_val_score

fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title('Quality of random forest')

ax.plot(values, mean_cross_val_score, label='mean values', color='red', lw=3)
ax.fill_between(values,
                band_low,
                band_high,
                color='green',
                label='filled area between errors',
                alpha=0.5)

# Opaque white legend box with a black border.
legend_box = ax.legend(framealpha=1).get_frame()
legend_box.set_facecolor("white")
legend_box.set_edgecolor("black")

ax.set_xlabel('number of trees')
ax.set_ylabel('roc-auc')

plt.show()

In [None]:
# Final random forest, evaluated on the held-out split.
# n_estimators=40 is presumably read off the cross-validation curve above — confirm.
# random_state is fixed so the printed metrics are reproducible between runs.
forest = RandomForestClassifier(n_estimators=40, criterion='gini', n_jobs=-1, random_state=1234)
forest.fit(X_train, y_train)

# Predict once and reuse the results for all four metrics.
test_proba = forest.predict_proba(X_test)[:, 1]   # probability of the second class
test_pred = forest.predict(X_test)

print(roc_auc_score(y_test, test_proba))
print(f1_score(y_test, test_pred))
print(accuracy_score(y_test, test_pred))
print(log_loss(y_test, test_proba))

# Исследование результатов Random Forest

In [None]:
# Probability of the first class (column 0 of predict_proba) for every test row,
# snapped to the grid 0, 0.1, ..., 1. A value moves up to the next grid point only
# when it is strictly greater than the x.x5 edge, so the bucket index equals the
# number of edges the probability exceeds.
first_class_proba = forest.predict_proba(X_test)[:, 0]

bucket_edges = np.array([0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95])
bucket_values = np.array([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
bucket_index = (first_class_proba[:, None] > bucket_edges).sum(axis=1)

# Column 0 holds the bucketed score; `blocked` holds the true label of the same row.
data_score_proba = pd.DataFrame(bucket_values[bucket_index])
data_score_proba['blocked'] = y_test
data_score_proba

In [None]:
# Random-forest score above 0.99: blocked vs. not blocked row counts per score value.
data_score = data_score_proba[data_score_proba[0] > 0.99]
not_blocked = data_score.loc[data_score['blocked'] == 0, 0].value_counts()
blocked = data_score.loc[data_score['blocked'] == 1, 0].value_counts()
df = pd.DataFrame({'not blocked': not_blocked, 'blocked': blocked})
df.plot(kind='bar', figsize=(15, 15))
# The scores plotted here come from the random forest, so the chart is titled RF
# (the title used to read 'GB', copied from the gradient-boosting section).
plt.title('RF скоринг')
plt.show()

In [None]:
# Random-forest score below 1: blocked vs. not blocked row counts per score value.
data_score = data_score_proba[data_score_proba[0] < 1]
not_blocked = data_score.loc[data_score['blocked'] == 0, 0].value_counts()
blocked = data_score.loc[data_score['blocked'] == 1, 0].value_counts()
df = pd.DataFrame({'not blocked': not_blocked, 'blocked': blocked})
df.plot(kind='bar', figsize=(15, 15))
# The scores plotted here come from the random forest, so the chart is titled RF
# (the title used to read 'GB', copied from the gradient-boosting section).
plt.title('RF скоринг')
plt.show()