In [1]:
import os
import xgboost as xgb
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import f1_score


import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from dateutil.relativedelta import relativedelta
from scipy import stats
from typing import List, Any
import matplotlib.pyplot as plt
%matplotlib inline

# CHANGE PATH
path = ''

pd.set_option('display.max_columns', 300)
SEED = 22

# If you want to blend models by seed, you can add more seeds
SEED_BAGGING = [22]

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Additional functions

In [2]:
# Read_csv
def read_csv(path: str, filename: str, **kwards) -> pd.DataFrame:
    return pd.read_csv(os.path.join(path, filename), **kwards).drop_duplicates()

# Load all files in folder
def load_folder(folder_name: str) -> pd.DataFrame:
    full_path = os.path.join(path, folder_name)
    files = os.listdir(full_path)
    result_df = pd.DataFrame([])
    for file in files:
        df = read_csv(full_path, file)
        result_df = pd.concat([result_df, df])
    return result_df

# Calculate mode
def mode(x):
    return stats.mode(x)[0][0]

# Check if data has duplicates
def has_duplicates(data:pd.DataFrame) -> bool:
    return data.shape[0] != data.phone_id.nunique()

# Plot Xgboost feature importances by seed
def plot_feature_importance(model, importance_type):
    mean_importance_all_models = []
    for seed in SEED_BAGGING:
        importance = model.model_bags[seed].get_score(importance_type=importance_type)
        mean_importance_all_models.append(importance)

    df_importance = pd.DataFrame(mean_importance_all_models)
    mean_importance_all_models = dict(df_importance.mean())
    for key in mean_importance_all_models.keys():
        mean_importance_all_models[key] = round(mean_importance_all_models[key],2)

    plt.rcParams['figure.figsize'] = (8, 10)
    xgb.plot_importance(mean_importance_all_models, importance_type=importance_type,  max_num_features=20)

## Load Data

In [3]:
data = read_csv(path, 'train/train.csv')
addresses = read_csv(path, 'misc/addresses.csv')
sample_submission = read_csv(path, 'sample_submission.csv', sep=';')
shipments = load_folder('shipments')

## Merge Data

In [4]:
shipments_addresses = shipments.merge(addresses, how='left', left_on='ship_address_id', right_on='id')
print(f'shipments shape: {shipments.shape}, shipments_addresses shape: {shipments_addresses.shape}')

shipments shape: (2260686, 20), shipments_addresses shape: (2260686, 22)


In [5]:
# Creat order_completed_at_month 
shipments_addresses['order_completed_at_month'] = shipments_addresses['order_completed_at'].apply(lambda x: x[:7])

# Transform data to timestamp format
shipments_addresses['order_created_at'] = \
    shipments_addresses['order_created_at'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

# Transform data to timestamp format
shipments_addresses['order_completed_at'] = \
    shipments_addresses['order_completed_at'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

# Calculate order time in minutes
shipments_addresses['order_time_in_minutes'] = \
    (shipments_addresses['order_completed_at'] - shipments_addresses['order_created_at']).apply(lambda x: x.seconds/3600)

In [6]:
# Add difference between targets (data is sorted by phone_id, order_completed_at).
data['target_diff'] = data['target'].diff()

## Generate Features

In [7]:
def plus_month(month: str, add: int) -> str:
    return str(
        datetime.strptime(month, "%Y-%m") + np.sign(add) * relativedelta(months=add)
    )[:7]

# Sorry for large function:)
def generate_features(
    key_target: pd.DataFrame,
    df: pd.DataFrame,
    shipments_addresses: pd.DataFrame,
    months_drop: List[str],
    target_month,
) -> pd.DataFrame:
    """
    Generate Features
    :param key_target: data which contains key and target
    :param df: data with all targets info
    :param shipments_addresses: shipments data with addresses as one dataframe
    :param months_drop: list of months that will be dropped
    :param target_month: target month
    :return: DataFrame
    """
    target_month_minus2 = plus_month(target_month, -2)
    target_month_plus1 = plus_month(target_month, 1)

    data = df[~df["order_completed_at"].isin(months_drop)].copy()

    # mean target
    key_target = key_target.merge(
        data.groupby("phone_id")["target"]
        .mean()
        .reset_index()
        .rename({"target": "mean_target"}, axis=1),
        how="left",
        on="phone_id",
    )

    # number of months for each phone_id
    key_target = key_target.merge(
        data.groupby("phone_id")["target"]
        .count()
        .reset_index()
        .rename({"target": "num_months"}, axis=1),
        how="left",
        on="phone_id",
    )

    # mean target 3 last months
    key_target = key_target.merge(
        data[
            (data["order_completed_at"] >= target_month_minus2)
            & (data["order_completed_at"] <= target_month)
        ]
        .groupby("phone_id")["target"]
        .mean()
        .reset_index()
        .rename({"target": "mean_target_3m"}, axis=1),
        how="left",
        on="phone_id",
    )

    # previous target
    key_target = key_target.merge(
        data[data["order_completed_at"] == target_month][
            ["phone_id", "target", "target_diff"]
        ].rename({"target": "prev_target"}, axis=1),
        how="left",
        on="phone_id",
    )

    # cut shipments_addresses
    shipments_addresses_cut = shipments_addresses[
        shipments_addresses["order_completed_at_month"] <= target_month_plus1
    ].copy()

    # calculate different features for all avaliable periods
    shipments_addresses_agg = shipments_addresses_cut.groupby("phone_id").agg(
        {
            "platform": mode,
            "os": mode,
            "dw_kind": mode,
            "retailer": mode,
            "s.city_name": mode,
            "s.store_id": mode,
            "user_id": "nunique",
            "ship_address_id": "nunique",
            "order_time_in_minutes": "mean",
            "total_cost": ["mean", "std", sum],
            "promo_total": ["mean", "std", sum],
            "total_weight": "sum",
            "order_id": "nunique",
        }
    )

    # rename columns & merge with key_target
    shipments_addresses_agg.columns = [
        f"{i}_{j}" for i, j in shipments_addresses_agg.columns
    ]
    key_target = key_target.merge(
        shipments_addresses_agg.reset_index(), how="left", on="phone_id"
    )

    # calculate mean completed orders time (last month)
    order_time_in_minutes_last_month = (
        shipments_addresses_cut[
            shipments_addresses_cut["order_completed_at_month"] == target_month_plus1
        ]
        .groupby("phone_id")["order_time_in_minutes"]
        .mean()
        .to_frame(name="order_time_in_minutes_last_month")
        .reset_index()
    )
    key_target = key_target.merge(
        order_time_in_minutes_last_month, how="left", on="phone_id"
    )

    # calculate number of orders (last month)
    order_id_last_month = (
        shipments_addresses_cut[
            shipments_addresses_cut["order_completed_at_month"] == target_month_plus1
        ]
        .groupby("phone_id")["order_id"]
        .count()
        .to_frame(name="order_id_last_month")
        .reset_index()
    )
    key_target = key_target.merge(order_id_last_month, how="left", on="phone_id")

    # calculate mean rate & mean rate (last month)
    rate_mean = (
        shipments_addresses_cut[shipments_addresses_cut["rate"] != 0]
        .groupby("phone_id")["rate"]
        .mean()
        .reset_index()
    )
    rate_mean = rate_mean.rename({"rate": "rate_mean"}, axis=1)
    key_target = key_target.merge(rate_mean, how="left", on="phone_id")
    rate_last_month_mean = (
        shipments_addresses_cut[
            (shipments_addresses_cut["rate"] != 0)
            & (
                shipments_addresses_cut["order_completed_at_month"]
                == target_month_plus1
            )
        ]
        .groupby("phone_id")["rate"]
        .mean()
        .reset_index()
    )
    rate_last_month_mean = rate_last_month_mean.rename(
        {"rate": "rate_last_month_mean"}, axis=1
    )
    key_target = key_target.merge(rate_last_month_mean, how="left", on="phone_id")

    # calculate order_state_canceled, order_state_complete, order_state_resumed, order_state_cart for each phone_id
    order_state = (
        shipments_addresses_cut.groupby("phone_id")["s.order_state"]
        .value_counts(True)
        .unstack(level=1)
        .reset_index()
        .fillna(0)
    )
    order_state.rename(
        {
            "canceled": "order_state_canceled",
            "complete": "order_state_complete",
            "resumed": "order_state_resumed",
            "cart": "order_state_cart",
        },
        axis=1,
        inplace=True,
    )
    key_target = key_target.merge(order_state, how="left", on="phone_id")

    print(has_duplicates(key_target))

    return key_target.fillna(-1)


## Calculate Train & Validation Data

In [9]:
%%time
months_drop = ['2020-06', '2020-07']
target_month = '2020-05'

key_target = data[data.order_completed_at=='2020-07'][['phone_id', 'target']].copy()
key_target = generate_features(key_target, data, shipments_addresses, months_drop, target_month)
key_target.head()

False
CPU times: user 7min 42s, sys: 7.32 s, total: 7min 49s
Wall time: 8min 53s


Unnamed: 0,phone_id,target,mean_target,num_months,mean_target_3m,prev_target,target_diff,platform_mode,os_mode,dw_kind_mode,retailer_mode,s.city_name_mode,s.store_id_mode,user_id_nunique,ship_address_id_nunique,order_time_in_minutes_mean,total_cost_mean,total_cost_std,total_cost_sum,promo_total_mean,promo_total_std,promo_total_sum,total_weight_sum,order_id_nunique,order_time_in_minutes_last_month,order_id_last_month,rate_mean,rate_last_month_mean,order_state_canceled,order_state_cart,order_state_complete,order_state_resumed
0,19843,1.0,0.2,5.0,-1.0,0.0,0.0,web,windows,courier,METRO,Московская Область,87.0,5.0,6.0,3.15162,65.333333,50.606982,392.0,-666.666667,1032.795559,-4000.0,44723.0,6.0,17.808611,1.0,-1.0,-1.0,0.833333,0.0,0.166667,0.0
1,173074,0.0,0.0,1.0,-1.0,0.0,-1.0,web,windows,courier,METRO,Казань,62.0,1.0,1.0,0.228889,98.0,-1.0,98.0,0.0,-1.0,0.0,3225.0,1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,1.0,0.0
2,101944,0.0,0.4,5.0,-1.0,0.0,-1.0,app,other,courier,METRO,Москва,10.0,1413.0,1440.0,0.135778,85.774931,34.787842,123859.0,-1.073407,28.363009,-1550.0,27852355.0,1440.0,0.430278,2.0,5.0,-1.0,0.996537,0.0,0.003463,0.0
3,615032,0.0,0.5,2.0,-1.0,0.0,-1.0,app,android,courier,METRO,Краснодар,103.0,1.0,2.0,13.113333,79.0,111.722871,158.0,-200.0,282.842712,-400.0,28527.0,2.0,-1.0,-1.0,4.0,-1.0,0.0,0.0,1.0,0.0
4,342522,0.0,0.0,5.0,-1.0,0.0,0.0,web,windows,courier,METRO,Санкт-Петербург,83.0,1.0,1.0,19.061111,158.0,-1.0,158.0,-200.0,-1.0,-200.0,7515.0,1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,1.0,0.0


In [10]:
# Divide data by train and validation
y = key_target.target
x = key_target.drop(['target', 'phone_id'], axis=1)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=SEED)

In [11]:
x_train.head()

Unnamed: 0,mean_target,num_months,mean_target_3m,prev_target,target_diff,platform_mode,os_mode,dw_kind_mode,retailer_mode,s.city_name_mode,s.store_id_mode,user_id_nunique,ship_address_id_nunique,order_time_in_minutes_mean,total_cost_mean,total_cost_std,total_cost_sum,promo_total_mean,promo_total_std,promo_total_sum,total_weight_sum,order_id_nunique,order_time_in_minutes_last_month,order_id_last_month,rate_mean,rate_last_month_mean,order_state_canceled,order_state_cart,order_state_complete,order_state_resumed
154283,0.4,5.0,-1.0,1.0,1.0,app,android,courier,METRO,Москва,68.0,2.0,5.0,14.261944,260.0,138.771395,1300.0,0.0,0.0,0.0,242690.0,5.0,10.933194,2.0,5.0,5.0,0.2,0.0,0.8,0.0
38834,0.333333,3.0,-1.0,0.0,0.0,web,windows,courier,METRO,Москва,1.0,1.0,2.0,10.6575,49.0,69.296465,98.0,0.0,0.0,0.0,49744.0,2.0,-1.0,-1.0,3.0,-1.0,0.0,0.0,1.0,0.0
65774,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
145763,1.0,1.0,-1.0,1.0,0.0,app,ios,courier,Лента,Тюмень,145.0,1.0,5.0,11.025926,168.666667,39.601347,1012.0,0.0,0.0,0.0,60876.0,5.0,14.562037,3.0,4.75,5.0,0.0,0.0,1.0,0.0
224492,-1.0,-1.0,-1.0,-1.0,-1.0,app,android,courier,METRO,Новосибирск,110.0,1.0,1.0,5.318611,98.0,-1.0,98.0,-199.0,-1.0,-199.0,1800.0,1.0,5.318611,1.0,-1.0,-1.0,0.0,0.0,1.0,0.0


## Models: Xgboost & LigthGBM

In [12]:
# XGBoost Wrapper with categorical count encoding
class XGBWrapper(object):
    def __init__(self, params, seed):
        self.params = params
        self.rounds = 500
        self.seed = seed
        self.count_encoder_dict = {}
        self.model_bags = {}
        self.cv = None

    @staticmethod
    def prepare_data(x, y=None):
        return xgb.DMatrix(x, label=y)

    def categorical_count_encoding(self, data, fit=True):
        df = data.copy()
        categorical = df.select_dtypes(include=['object']).columns
        for f in categorical:
            if fit:
                value_counts = df[f].value_counts().to_dict()
                self.count_encoder_dict[f] = value_counts
            df[f] = df[f].map(self.count_encoder_dict[f])
            df[f] = df[f].fillna(1)
        return df

    def cross_validation(self, x_train, y_train, folds=5):
        x_train = self.categorical_count_encoding(x_train)
        dtrain = self.prepare_data(x_train, y_train)
        self.cv = xgb.cv(self.params, dtrain, num_boost_round=1200, nfold=folds, stratified=True,
                         early_stopping_rounds=50, verbose_eval=100, seed=self.seed)
        self.rounds = int(self.cv.shape[0] / (1.0 - 1.0 / folds))
        return self.rounds, "{}: {}; {}: {}".format(self.cv.columns[0], round(self.cv.iloc[-1, 0], 4),
                                                    self.cv.columns[2], round(self.cv.iloc[-1, 2], 4))

    def fit(self, x_train, y_train, bags):
        x_train = self.categorical_count_encoding(x_train, False)
        dtrain = self.prepare_data(x_train, y_train)
        watchlist = [(dtrain, 'train')]
        for bag in bags:
            self.params['seed'] = bag
            self.model_bags[bag] = xgb.train(self.params, dtrain, self.rounds, watchlist, verbose_eval=False)

    def predict(self, x_test):
        x_test = self.categorical_count_encoding(x_test, fit=False)
        dtest = self.prepare_data(x_test)
        prediction = np.zeros(x_test.shape[0])
        for model in self.model_bags.values():
            prediction += model.predict(dtest)
        return prediction / len(self.model_bags)

In [13]:
# LGBM Wrapper with categorical count encoding
class LGBWrapper(object):
    def __init__(self, params, seed):
        self.params = params
        self.rounds = 1500
        self.seed = seed
        self.count_encoder_dict = {}
        self.model_bags = {}
        self.cv = None

    @staticmethod
    def prepare_data(x, y=None):
        return lgb.Dataset(x, label=y)

    def categorical_count_encoding(self, data, fit=True):
        df = data.copy()
        categorical = df.select_dtypes(include=['object']).columns
        for f in categorical:
            if fit:
                value_counts = df[f].value_counts().to_dict()
                self.count_encoder_dict[f] = value_counts
            df[f] = df[f].map(self.count_encoder_dict[f])
            df[f] = df[f].fillna(1)
        return df

    def cross_validation(self, x_train, y_train, folds=5):
        x_train = self.categorical_count_encoding(x_train)
        dtrain = self.prepare_data(x_train, y_train)
        self.cv = lgb.cv(self.params, dtrain, num_boost_round=1000, nfold=folds, stratified=True,
                         early_stopping_rounds=50, verbose_eval=100, seed=self.seed)
        return self.rounds, ''

    def fit(self, x_train, y_train, bags):
        x_train = self.categorical_count_encoding(x_train, False)
        dtrain = self.prepare_data(x_train, y_train)
        watchlist = [(dtrain, 'train')]
        for bag in bags:
            self.params['seed'] = bag
            self.model_bags[bag] = lgb.train(self.params, dtrain, self.rounds)

    def predict(self, x_test):
        x_test = self.categorical_count_encoding(x_test, fit=False)
        dtest = self.prepare_data(x_test)
        prediction = np.zeros(x_test.shape[0])
        for model in self.model_bags.values():
            prediction += model.predict(x_test)
        return prediction / len(self.model_bags)

## Train XGBoost & Get Validation Results

In [14]:
%%time

# Train Xgboost with f1 error
def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    err = 1-f1_score(y_true, np.round(y_pred))
    return 'f1_err', err

model_config = {
            "objective": "binary:logistic",
            "booster": "gbtree",
            "feval": f1_eval,
            "eta": 0.023,
            "min_child_weight": 5,
            "subsample": 0.75,
            "colsample_bytree": 0.75,
            "max_depth": 6,
            "silent": 1,
            "seed": SEED
        }

model = XGBWrapper(model_config, SEED)
rounds, _ = model.cross_validation(x_train, y_train)
model.fit(x_train, y_train, SEED_BAGGING)


## IF you want to train ligthgbm, you can use this code (with se)
# model_config = {
#             "objective": 'binary',
#             "booster": "gbtree",
#             "eval_metric": "logloss",
#             "eta": 0.01,
#             "min_child_weight": 5,
#             "subsample": 0.8,
#             "colsample_bytree": 0.8,
#             "max_depth": 8,
#             "silent": 1,
#             "seed": SEED
#         }

# model = LGBWrapper(model_config, SEED)
# rounds, _ = model.cross_validation(x_train, y_train)
#model.fit(x_train, y_train, [22, 31, 42])

[0]	train-error:0.296436+0.000733478	test-error:0.297395+0.00178958
[100]	train-error:0.287474+0.000216063	test-error:0.291525+0.00081361
[200]	train-error:0.283826+0.000159794	test-error:0.290307+0.00154895
[300]	train-error:0.28155+0.000237521	test-error:0.290166+0.00158506
CPU times: user 15min 42s, sys: 10.7 s, total: 15min 53s
Wall time: 18min 43s


In [16]:
prediction = model.predict(x_val)

In [17]:
def get_val_threshold(y_true, y_pred):
    thres = 0
    f1 = 0
    for i in range(20, 100, 1):
        f1_new = f1_score(y_true, (y_pred>i/100).astype(int))
        if f1_new > f1:
            f1 = f1_new
            thres = i/100
    return f1, thres

In [18]:
f1, val_thres = get_val_threshold(y_val, prediction)

In [19]:
f1

0.7141663645391636

In [25]:
val_thres

0.35

## Collect Test Features & Final Predict

In [21]:
months_drop = ['2020-01', '2020-02']
target_month = '2020-07'

test_target = sample_submission.copy().rename({'Id': 'phone_id'}, axis=1)
test_target = generate_features(test_target, data, shipments_addresses, months_drop, target_month)
test_target.head()

False


Unnamed: 0,phone_id,Predicted,mean_target,num_months,mean_target_3m,prev_target,target_diff,platform_mode,os_mode,dw_kind_mode,retailer_mode,s.city_name_mode,s.store_id_mode,user_id_nunique,ship_address_id_nunique,order_time_in_minutes_mean,total_cost_mean,total_cost_std,total_cost_sum,promo_total_mean,promo_total_std,promo_total_sum,total_weight_sum,order_id_nunique,order_time_in_minutes_last_month,order_id_last_month,rate_mean,rate_last_month_mean,order_state_canceled,order_state_cart,order_state_complete,order_state_resumed
0,19843,-1.0,0.2,5.0,-1.0,1.0,1.0,web,windows,courier,METRO,Московская Область,87,7,8,4.153229,67.375,44.890144,539.0,-568.75,891.60269,-4550.0,63927,8,7.158056,2.0,5.0,5.0,0.75,0.0,0.25,0.0
1,471287,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,app,android,courier,Лента,Ульяновск,408,1,1,16.418056,199.0,-1.0,199.0,0.0,-1.0,0.0,23100,1,16.418056,1.0,5.0,5.0,0.0,0.0,1.0,0.0
2,342522,-1.0,0.2,5.0,-1.0,0.0,-1.0,web,windows,courier,METRO,Санкт-Петербург,83,1,2,10.992222,178.5,28.991378,357.0,-250.0,70.710678,-500.0,22665,2,-1.0,-1.0,-1.0,-1.0,0.0,0.0,1.0,0.0
3,457799,-1.0,1.0,1.0,-1.0,1.0,1.0,app,ios,courier,METRO,Ижевск,211,1,3,8.516389,112.0,40.841156,336.0,0.0,0.0,0.0,86800,3,3.190833,2.0,5.0,5.0,0.0,0.0,1.0,0.0
4,233778,-1.0,0.666667,3.0,-1.0,1.0,1.0,web,linux,courier,Лента,Пермь,146,1,4,6.866042,149.25,99.5,597.0,0.0,0.0,0.0,35802,4,10.960972,2.0,-1.0,-1.0,0.0,0.0,1.0,0.0


In [22]:
sample_submission['Predicted'] = (model.predict(test_target[x_train.columns]) > val_thres).astype(int)

In [23]:
sample_submission['Predicted'].value_counts(True)

1    0.665061
0    0.334939
Name: Predicted, dtype: float64

In [24]:
sample_submission.to_csv('submission_final.csv', index=False)