## Импорт необходимых библиотек

In [1]:
from catboost import CatBoostClassifier, Pool
import gc
import numpy as np
import pandas as pd
import random
from sklearn.metrics import recall_score
from tqdm.notebook import tqdm
import warnings

In [2]:
# Notebook-wide settings: show up to 256 columns when a frame is displayed
# and silence every warning category.
pd.options.display.max_columns = 256
warnings.simplefilter('ignore')

## Подготовка данных

Открываем файлы с данными

In [3]:
# Reference table keyed by index_oper (make_df looks rows up by that key).
postal_data = pd.read_csv(
    './data/postal_data.csv',
    index_col='index_oper',
)
postal_data

In [4]:
# Raw columns that are copied into the feature table unchanged.
numeric = [
    'priority',
    'weight',
    'transport_pay',
    'weight_mfi',
    'price_mfi',
    'dist_qty_oper_login_1',
    'total_qty_oper_login_1',
    'total_qty_oper_login_0',
    'total_qty_over_index_and_type',
    'total_qty_over_index',
]
# Flag columns; make_df copies them and also sums them into is_wrong_count.
is_wrong = [
    'is_wrong_sndr_name',
    'is_wrong_rcpn_name',
    'is_wrong_phone_number',
    'is_wrong_address',
]

train_data = pd.read_csv(
    './data/train_dataset_train.csv', index_col='id', low_memory=False
)
test_data = pd.read_csv(
    './data/test_dataset_test.csv', index_col='id', low_memory=False
)

# A single-space index_oper is treated as '0'; the column is then cast
# through float to int.
train_data['index_oper'] = (
    train_data['index_oper'].replace(' ', '0').astype(float).astype(int)
)
test_data['index_oper'] = (
    test_data['index_oper'].replace(' ', '0').astype(float).astype(int)
)

train_data

In [6]:
# Stack train rows on top of test rows; the group-level features below are
# computed over this combined frame.
data = pd.concat((train_data, test_data), axis=0)

In [7]:
# Keep only the "oper_type + oper_attr" values that occur at least 8 times
# in the training data, in descending order of frequency.
counts = train_data['oper_type + oper_attr'].value_counts()
oper_values = [value for value, n_rows in counts.items() if n_rows >= 8]
len(oper_values)

Кодирование столбца с типом и атрибутом операции

In [8]:
# One boolean column per frequent operation value: True where the row's
# "oper_type + oper_attr" equals that value.
for oper_value in tqdm(oper_values):
    data[f'is_oper_{oper_value}'] = data['oper_type + oper_attr'].eq(oper_value)

Столбцы, на основе которых будет выполняться группировка

In [9]:
# Grouping key: rows that agree on all of these columns are treated as one
# group by the groupby below.
cols = [
    # post office and shipment attributes
    'index_oper', 'type', 'priority', 'is_privatecategory', 'class',
    'is_in_yandex', 'is_return', 'weight',
    # mail classification
    'mailtype', 'mailctg', 'mailrank', 'directctg', 'transport_pay',
    'postmark',
    # mfi fields
    'name_mfi', 'weight_mfi', 'price_mfi',
    # operation counters
    'dist_qty_oper_login_1', 'total_qty_oper_login_1',
    'total_qty_oper_login_0', 'total_qty_over_index_and_type',
    'total_qty_over_index',
    # "is_wrong" flags
    'is_wrong_sndr_name', 'is_wrong_rcpn_name', 'is_wrong_phone_number',
    'is_wrong_address',
]

Группируем данные. Для каждой операции определяем, встретилась ли она у посылки на почтовом пункте

In [10]:
# The grouping keys are the same for every operation column, so the groupby
# is built once and reused; rebuilding it per iteration only repeated the
# same key factorisation over the full frame.
grouped = data.groupby(cols)

for value in tqdm(oper_values):
    column = f'is_oper_{value}'
    # Group-wise maximum of a boolean flag: True for every row of a group
    # in which the operation occurred at least once.
    series = grouped[column].transform('max')

    # Copy the group-level flag back to the original frames by row id.
    train_data[column] = series.loc[train_data.index]
    test_data[column] = series.loc[test_data.index]

Случайное разбиение на обучающую и валидационную выборки с сохранением распределения по классам

In [11]:
# Stratified 70/30 split: negatives and positives are shuffled and cut
# separately so both parts keep the class ratio of the full training set.
neg_idx = list(train_data.loc[train_data.label == 0].index)
pos_idx = list(train_data.loc[train_data.label == 1].index)

# Re-seed before each shuffle to keep the split reproducible. The former
# `random.shuffle(x, random=random.seed(56))` relied on the `random=`
# argument, which was removed in Python 3.11; seeding and then shuffling
# yields the same permutation on interpreters that still accepted it.
random.seed(56)
random.shuffle(neg_idx)
random.seed(56)
random.shuffle(pos_idx)

train_sz = 0.7
neg_cut = int(train_sz * len(neg_idx))
pos_cut = int(train_sz * len(pos_idx))
train_idx = neg_idx[:neg_cut] + pos_idx[:pos_cut]
val_idx = neg_idx[neg_cut:] + pos_idx[pos_cut:]

# Mix the two classes inside each part.
random.seed(56)
random.shuffle(train_idx)
random.seed(56)
random.shuffle(val_idx)
len(train_idx), len(val_idx)

Создание датасета с признаками

In [12]:
def YN_encoder(x):
    """Encode a yes/no flag: 'Y' -> 1, 'N' -> 0, anything else -> -1."""
    if x == 'Y':
        return 1
    return 0 if x == 'N' else -1
    
def get_eng_count(x):
    """Return how many characters of ``x`` are Latin letters a-z (case-insensitive)."""
    return sum('a' <= ch <= 'z' for ch in x.lower())

def get_ru_count(x):
    """Return how many characters of ``x`` are Russian letters (case-insensitive).

    'ё' is checked separately because it lies outside the 'а'..'я' range.
    """
    return sum(ch == 'ё' or 'а' <= ch <= 'я' for ch in x.lower())

def get_chinese_count(x):
    """Return how many characters of ``x`` lie in the range U+4E00..U+9FFF.

    Both ends of the range are included. The previous strict comparison
    skipped U+4E00 itself ('一') and U+9FFF, unlike the inclusive bounds
    used by the other character counters in this file.
    """
    # No case folding: characters in this range have no upper/lower forms.
    return sum('\u4e00' <= ch <= '\u9fff' for ch in x)

def get_digit_count(x):
    """Return how many characters of ``x`` are ASCII digits 0-9."""
    return sum('0' <= ch <= '9' for ch in x.lower())
    
def make_df(data):
    """Build the model feature table for the rows of ``data``.

    Reads the module-level names ``postal_data``, ``numeric``, ``is_wrong``,
    ``train_data``, ``index_oper_values`` and ``oper_values``.
    NOTE(review): ``index_oper_values`` is not defined in the visible part
    of this file -- confirm where it comes from.

    Side effect: the columns of ``postal_data`` are written into ``data``
    itself before any feature is computed.

    Returns a new DataFrame holding the features in insertion order, plus a
    ``label`` column when ``data`` has one.
    """
    # Attach the postal_data row matching each record's index_oper.
    data[postal_data.columns] = postal_data.loc[data.index_oper].values
    
    df = pd.DataFrame()
    
    # original numeric features, copied as-is
    df[numeric] = data[numeric]
        
    # latitude and longitude of the post office
    # (presumably supplied by postal_data above -- verify)
    df['geo_lat'] = data['geo_lat']
    df['geo_lon'] = data['geo_lon']
    
    # new numeric features; "+ 0.1" on both sides avoids division by zero
    # NOTE(review): 'total_over_inde/dist_1' looks like a typo for "index";
    # it is a feature name, so renaming it would change the model's inputs.
    df['total_over_inde/dist_1'] = (df['total_qty_over_index'] + 0.1) / (df['dist_qty_oper_login_1'] + 0.1)
    df['total_over_index_and_type/dist_1'] = (df['total_qty_over_index_and_type'] + 0.1) / (df['dist_qty_oper_login_1'] + 0.1)
    df['total_0/dist_1'] = (df['total_qty_oper_login_0'] + 0.1) / (df['dist_qty_oper_login_1'] + 0.1)
    df['total_1/dist_1'] = (df['total_qty_oper_login_1'] + 0.1) / (df['dist_qty_oper_login_1'] + 0.1)
    df['total_over_index/total_1'] = (df['total_qty_over_index'] + 0.1) / (df['total_qty_oper_login_1'] + 0.1)
    df['total_0/total_1'] = (df['total_qty_oper_login_0'] + 0.1) / (df['total_qty_oper_login_1'] + 0.1)
    df['total_over_index/total_0'] = (df['total_qty_over_index'] + 0.1) / (df['total_qty_oper_login_0'] + 0.1)
    
    # share of login_1 / login_0 operations in total_qty_over_index_and_type;
    # rows where that total is 0 get the fixed value 0.5 instead
    df['ratio_qty_oper_login_1'] = df['total_qty_oper_login_1'] / df['total_qty_over_index_and_type']
    df.loc[df.total_qty_over_index_and_type == 0, 'ratio_qty_oper_login_1'] = 0.5
    df['ratio_qty_oper_login_0'] = df['total_qty_oper_login_0'] / df['total_qty_over_index_and_type']
    df.loc[df.total_qty_over_index_and_type == 0, 'ratio_qty_oper_login_0'] = 0.5
    
    # numeric features derived from name_mfi: string length and number of
    # comma-separated parts; the count is forced to 0 where weight_mfi is 0,
    # and the per-item means are forced to 0 where the count is 0
    df['name_mfi_len'] = data.name_mfi.apply(lambda x: len(x))
    df['mfi_count'] = data.name_mfi.apply(lambda x: len(x.split(',')))
    df.loc[df.weight_mfi == 0, 'mfi_count'] = 0
    df['mfi_mean_weight'] = df['weight_mfi'] / df['mfi_count']
    df.loc[df.mfi_count == 0, 'mfi_mean_weight'] = 0
    df['mfi_mean_price'] = df['price_mfi'] / df['mfi_count']
    df.loc[df.mfi_count == 0, 'mfi_mean_price'] = 0
    
    # number of name_mfi characters belonging to each character group
    df['eng_count'] = data['name_mfi'].apply(get_eng_count)
    df['ru_count'] = data['name_mfi'].apply(get_ru_count)
    df['chinese_count'] = data['name_mfi'].apply(get_chinese_count)
    df['digit_count'] = data['name_mfi'].apply(get_digit_count)
    
    # the individual is_wrong flags plus their sum per row
    df['is_wrong_count'] = 0
    for col in is_wrong:
        df[col] = data[col]
        df['is_wrong_count'] += data[col]
        
    # encode the Yes/No columns with YN_encoder (1 / 0 / -1)
    df[f'is_privatecategory'] = data['is_privatecategory'].apply(YN_encoder)
    df[f'is_in_yandex'] = data['is_in_yandex'].apply(YN_encoder)
    df[f'is_return'] = data['is_return'].apply(YN_encoder)
    
    # One-hot encoding of the categorical features. The category lists come
    # from the global train_data / postal_data, not from `data`, so every
    # call produces the same set of columns.
    for value in train_data['type'].unique():
        df[f'is_type_{value}'] = (data['type'] == value)
        
    for value in train_data['class'].unique():
        df[f'is_class_{value}'] = (data['class'] == value)
        
    for value in train_data['mailtype'].unique():
        df[f'is_mailtype_{value}'] = (data['mailtype'] == value)
        
    for value in train_data['mailctg'].unique():
        df[f'is_mailctg_{value}'] = (data['mailctg'] == value)
        
    for value in train_data['directctg'].unique():
        df[f'is_directctg_{value}'] = (data['directctg'] == value)
        
    for value in train_data['postmark'].unique():
        df[f'is_postmark_{value}'] = (data['postmark'] == value)
    
    for value in postal_data['region'].unique():
        df[f'is_region_{value}'] = (data['region'] == value)
        
    for value in index_oper_values:
        df[f'is_index_oper_{value}'] = (data['index_oper'] == value)
        
    # which operation types (with attribute) were performed on the parcel;
    # the is_oper_* columns must already be present in `data`
    for value in oper_values:
        column = f'is_oper_{value}'
        df[column] = data[column]
        
    # the target is carried over only when the input has it
    if 'label' in data.columns:
        df['label'] = data['label']
    return df

# Feature tables for the two parts of the split, built in train, val order.
train_df, val_df = (
    make_df(train_data.loc[part_idx]) for part_idx in (train_idx, val_idx)
)
train_df

## Обучение модели

In [13]:
# Keyword arguments passed to CatBoostClassifier.
params = dict(
    task_type='CPU',
    loss_function='Logloss',
    eval_metric='AUC',
    iterations=1500,
    max_depth=10,
    learning_rate=0.04,
)

In [14]:
# CatBoost pools: every column except 'label' is a feature, 'label' is the target.
train_pool = Pool(data=train_df.drop(columns='label'), label=train_df['label'])

val_pool = Pool(data=val_df.drop(columns='label'), label=val_df['label'])

In [15]:
# Free memory before training.
gc.collect()

model_cb = CatBoostClassifier(random_seed=56, **params)
# The validation pool serves as eval_set, and use_best_model=True is set;
# progress is logged every 10 iterations.
model_cb.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True,
    verbose=10,
    plot=False,
)
model_cb.get_feature_importance(prettified=True)

In [16]:
# Column 1 of predict_proba: the predicted probability of the positive class
# for each validation row.
prediction = model_cb.predict_proba(val_pool)[:, 1]
prediction

Для валидации находим оптимальную границу, начиная с которой целевая переменная будет равна единице, и считаем Recall

In [17]:
# Macro-averaged recall on the validation part for thresholds 0.00 .. 0.24
# (step 0.01), stored as (score, threshold) pairs; the best pair is shown.
scores = [
    (
        recall_score(val_df['label'], prediction > step / 100, average='macro'),
        step / 100,
    )
    for step in tqdm(range(25))
]
max(scores)

## Получение предсказаний

In [18]:
# Build the test feature table. make_df also writes the postal_data columns
# into test_data itself, because the frame is passed in directly.
test_df = make_df(test_data)
test_df

In [19]:
# Positive-class probability for every test row; this rebinds `prediction`,
# which previously held the validation probabilities.
prediction = model_cb.predict_proba(test_df)[:, 1]
prediction

In [20]:
# Number of test rows whose probability exceeds the 0.005 cut-off.
(prediction > 0.005).sum()

In [21]:
# Write the submission: one row per test id, label 1 where the predicted
# probability exceeds 0.005.
# `lineterminator` is the current spelling of this to_csv keyword; the old
# `line_terminator` was deprecated in pandas 1.5 and removed in 2.0.
pd.DataFrame({
    'id': test_df.index,
    'label': (prediction > 0.005).astype(int),
}).to_csv('./submission.csv', index=False, lineterminator='\n')