In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from datetime import datetime
import json

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Read the tab-separated training set from the working directory.
data = pd.read_csv("train.tsv", sep='\t')

# Тут просто препроцессим данные

In [None]:
def preproc_data(data):
    """Clean a raw listings frame and put it in chronological order.

    Steps:
      * drop the "Unnamed: 0" column if it is present;
      * encode "delivery_available" and "payment_available" as 0/1 integers;
      * sort rows by "date_created" and renumber the index from 0.

    The input frame is left untouched; a new frame is returned, so the
    function can safely be re-run on its own output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least "delivery_available", "payment_available" and
        "date_created" columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned, date-sorted frame with a fresh 0..n-1 index.
    """
    def _to_flag(value):
        # Flags may arrive as real booleans, as their string spelling, or as
        # already-encoded 0/1 values. Compare by value, never with ``is``:
        # identity of equal strings is an interpreter detail.
        if isinstance(value, str):
            return 1 if value.strip().lower() in ("true", "1") else 0
        if pd.isna(value):
            return 0
        return 1 if value else 0

    # ``drop`` returns a new frame, so the column assignments below do not
    # touch the caller's object; ``errors="ignore"`` tolerates a missing column.
    data = data.drop(columns=["Unnamed: 0"], errors="ignore")

    for flag_column in ("delivery_available", "payment_available"):
        data[flag_column] = [_to_flag(value) for value in data[flag_column]]

    return data.sort_values("date_created").reset_index(drop=True)

In [None]:
# Replace the raw frame with its preprocessed version.
data = preproc_data(data)

# Тут выделяем разные типы фич для того, чтобы напрямую потом по-разному их обрабатывать и доставать новые фичи

In [None]:
# Column groups; each group is handled differently during feature extraction.

# Columns summarised with descriptive statistics and also passed through as-is.
numeric_features = [
    "img_num",
    "lat",
    "long",
    "price",
]

# 0/1 flag columns.
binary_features = [
    "delivery_available",
    "payment_available",
]

# Columns whose individual values get their own aggregated feature vector.
categorical_features = [
    "category_id",
    "city",
    "product_type",
    "region",
    "sold_mode",
    "subcategory_id",
]

# Columns for which only the number of distinct values is counted.
for_num_unique_features = [
    "category_id",
    "city",
    "date_created",
    "owner_id",
    "product_id",
    "product_type",
    "region",
    "subcategory_id",
]

In [None]:
# эта функция для доставания разных статистик по числовым колонкам
def get_all_stats(arr):
    """Summarise a numeric collection with eight descriptive statistics.

    Returns a list in this fixed order: minimum, maximum, mean, median,
    standard deviation (numpy default), 25th percentile, 75th percentile,
    number of distinct values.
    """
    smallest = np.min(arr)
    largest = np.max(arr)
    average = np.mean(arr)
    middle = np.median(arr)
    spread = np.std(arr)
    lower_quartile = np.percentile(arr, 25)
    upper_quartile = np.percentile(arr, 75)
    distinct_count = np.unique(arr).shape[0]

    return [smallest, largest, average, middle, spread,
            lower_quartile, upper_quartile, distinct_count]

# эта по бинарным
def get_stats_from_bin_column(arr):
    """Summarise a 0/1 column.

    Returns ``[mean, sum, standard deviation, len(arr) - sum]``; for a 0/1
    column the second item is the number of ones and the last the number of
    zeros.
    """
    ones_total = np.sum(arr)
    zeros_total = len(arr) - ones_total
    return [np.mean(arr), ones_total, np.std(arr), zeros_total]

# здесь достаются фичи по дате
def get_date_feautures(date):
    """Turn a "%Y-%m-%d" date string into calendar features.

    Returns ``[year, month, day, weekday, is_weekend, iso_week]`` where
    ``weekday`` is 0 for Monday .. 6 for Sunday and ``is_weekend`` is 1 for
    Saturday/Sunday, otherwise 0.
    """
    parsed = datetime.strptime(date, "%Y-%m-%d")
    day_of_week = parsed.weekday()
    is_weekend = int(day_of_week >= 5)
    iso_week = parsed.isocalendar()[1]

    return [parsed.year, parsed.month, parsed.day, day_of_week, is_weekend, iso_week]

# а тут комплексная функция, как мы обрабатываем категориальные колонки
def get_categorical_features(data_subset):
    """Build an aggregated numeric encoding for every categorical value.

    For each column in ``categorical_features`` and each value of that column
    found in ``data_subset``, a flat feature vector is assembled from the rows
    carrying that value:

      * ``get_all_stats`` of every column in ``numeric_features``;
      * ``get_stats_from_bin_column`` of every column in ``binary_features``;
      * the number of distinct values of every column in
        ``for_num_unique_features``;
      * the share of rows with this value and their absolute count;
      * days between the first and the last row with this value, and days
        between the last such row and the last row of ``data_subset``;
      * the number of distinct values of the categorical column itself.

    Dates are read positionally (first/last row), so ``data_subset`` must be
    non-empty and ordered by ascending "date_created".

    Parameters
    ----------
    data_subset : pandas.DataFrame
        Historical rows to aggregate over.

    Returns
    -------
    tuple
        ``(features, vector_length)`` where ``features[column][value]`` is the
        feature vector (a list) and ``vector_length`` is the length of those
        vectors (0 if no group was produced at all).
    """
    date_format = "%Y-%m-%d"
    total_rows = data_subset.shape[0]
    data_last_day = datetime.strptime(data_subset["date_created"].values[-1], date_format)

    features = {}
    feature_vector_length = 0

    for cat_feature in categorical_features:
        features[cat_feature] = {}
        # Same for every value of this column, so computed once per column.
        distinct_values_in_column = data_subset[cat_feature].unique().shape[0]

        for cat_value, value_rows in data_subset.groupby(cat_feature):
            stats = []

            for num_feature in numeric_features:
                stats += get_all_stats(value_rows[num_feature])

            for bin_feature in binary_features:
                stats += get_stats_from_bin_column(value_rows[bin_feature])

            for nu_feature in for_num_unique_features:
                stats += [value_rows[nu_feature].unique().shape[0]]

            # Share of this value among all rows, and its absolute frequency.
            stats += [value_rows.shape[0] / total_rows, value_rows.shape[0]]

            # groupby keeps the original row order inside each group, so the
            # first and last rows are the earliest and latest occurrences.
            group_dates = value_rows["date_created"].values
            first_date = datetime.strptime(group_dates[0], date_format)
            last_date = datetime.strptime(group_dates[-1], date_format)
            stats += [(last_date - first_date).days, (data_last_day - last_date).days]

            stats += [distinct_values_in_column]

            features[cat_feature][cat_value] = stats
            feature_vector_length = len(stats)

    return features, feature_vector_length

# эта функция чтобы достать tf-idf из текстовых колонок - названия и описания
def get_tf_idfs(data_subset):
    """Fit one TF-IDF vectorizer per text column of ``data_subset``.

    Each vectorizer is limited to 100 features and ignores decoding errors.

    Returns
    -------
    tuple
        ``(vectorizer for 'name_text', vectorizer for 'desc_text')``, both
        already fitted.
    """
    def _fit_on(column_name):
        # One independent vectorizer per column; they share only the settings.
        texts = data_subset[column_name].values
        vectorizer = TfidfVectorizer(max_features=100, decode_error='ignore')
        vectorizer.fit(texts)
        return vectorizer

    vectorizer_name_text = _fit_on('name_text')
    vectorizer_desc_text = _fit_on('desc_text')

    return vectorizer_name_text, vectorizer_desc_text

# а тут просто всё соединяем из того, что выше накатали
def get_all_features_from_row(row, calc_categorical_features, len_features,
                              vectorizer_name_text, vectorizer_desc_text):
    """Assemble the flat feature vector for a single row.

    Layout of the returned list:

      1. for every column in ``categorical_features``: the precomputed vector
         ``calc_categorical_features[column][row[column]]``, or a vector of
         ``len_features`` ones when that value has no precomputed vector;
      2. the row's own values of ``numeric_features``;
      3. the row's own values of ``binary_features``;
      4. calendar features of ``row["date_created"]``;
      5. TF-IDF vector of ``row["name_text"]``;
      6. TF-IDF vector of ``row["desc_text"]``.

    Parameters
    ----------
    row : mapping-like (e.g. pandas.Series)
        The row to encode.
    calc_categorical_features : dict
        Mapping ``column -> value -> feature vector``.
    len_features : int
        Length of the fallback vector for values without an encoding.
    vectorizer_name_text, vectorizer_desc_text
        Fitted vectorizers exposing ``transform``.

    Returns
    -------
    list
        The concatenated feature values.
    """
    row_features = []

    for cat_feature in categorical_features:
        cat_value = row[cat_feature]
        try:
            row_features += calc_categorical_features[cat_feature][cat_value]
        except (KeyError, TypeError):
            # Value (or column) without a precomputed encoding: neutral filler.
            row_features += np.ones(len_features).tolist()

    # Row-level features are appended once, after the loop: repeating them for
    # every categorical column would only add duplicated columns and repeat
    # the TF-IDF transforms.
    row_features += [row[num_feature] for num_feature in numeric_features]
    row_features += [row[bin_feature] for bin_feature in binary_features]
    row_features += get_date_feautures(row["date_created"])

    row_features += vectorizer_name_text.transform([row["name_text"]]).toarray()[0].tolist()
    row_features += vectorizer_desc_text.transform([row["desc_text"]]).toarray()[0].tolist()

    return row_features

# для таких данных хорошо проверять точность на кросс-валидации по времени, поэтому поделим на фолды по 30к элементов (цифра из головы)

In [None]:
# Split the frame into consecutive chunks of FOLD_SIZE rows by position.
# Positional slicing keeps the column dtypes intact, gives every fold except
# possibly the last exactly FOLD_SIZE rows, and never produces an empty
# trailing fold.
FOLD_SIZE = 30000

folds = [
    data.iloc[start:start + FOLD_SIZE]
    for start in range(0, len(data), FOLD_SIZE)
]

In [None]:
# Display how many folds the training data was split into.
len(folds)

## а теперь предлагается учиться на каждом префиксе по фолдам [:i], а проверять точность на i-ом фолде

In [None]:
# For every fold after the first one, encode its rows using only the folds
# that precede it, and keep the (feature matrix, target vector) pair.
fold_features_and_answers = []

for fold_number in tqdm_notebook(range(1, len(folds))):
    current_fold = folds[fold_number]
    fold_targets = current_fold["sold_fast"]
    fold_inputs = current_fold.drop(columns=["sold_fast"])

    # Everything before the current fold serves as history for the encoders.
    historical_data = pd.concat(folds[:fold_number])
    calc_categorical_features, len_features = get_categorical_features(historical_data)
    vectorizer_name_text, vectorizer_desc_text = get_tf_idfs(historical_data)

    x_features = [
        get_all_features_from_row(
            row,
            calc_categorical_features,
            len_features,
            vectorizer_name_text,
            vectorizer_desc_text,
        )
        for _, row in fold_inputs.iterrows()
    ]

    fold_features_and_answers.append(
        (np.array(x_features), np.array(fold_targets.values.tolist()))
    )

### поверх этого делаем grid-search по параметрам lgbm

In [None]:
from sklearn.model_selection import ParameterGrid
import lightgbm
from sklearn.metrics import roc_auc_score

In [None]:
# Every combination of the listed LightGBM settings, expanded into a list of
# keyword-argument dicts.
params = list(ParameterGrid({
    "boosting_type": ["dart", "gbdt"],
    "num_leaves": [31, 50, 100],
    "learning_rate": [0.05, 0.1],
    "n_estimators": [100, 500, 1000],
    "max_depth": [None, 10, 50, 100],
}))

In [None]:
# Score every parameter combination with expanding-window validation:
# train on feature folds [:k], measure ROC AUC on feature fold k.
evaluation = []

for param in tqdm_notebook(params):
    scores = []

    for holdout_idx in range(1, len(fold_features_and_answers)):
        training_folds = fold_features_and_answers[:holdout_idx]
        holdout_features, holdout_answers = fold_features_and_answers[holdout_idx]

        train_data = np.concatenate([features for features, _ in training_folds])
        train_ans = np.concatenate([answers for _, answers in training_folds])

        current_model = lightgbm.LGBMClassifier(**param)
        current_model.fit(train_data, train_ans)

        # Probability of the positive class only.
        positive_proba = current_model.predict_proba(holdout_features)[:, 1]
        scores.append(roc_auc_score(holdout_answers, positive_proba))

    # The stored model is the one fitted in the last split above.
    mean_score = np.mean(scores)
    evaluation.append((mean_score, param, current_model))
    print(mean_score, scores)

# теперь предсказываем ответ на тестовых данных

In [None]:
# Load the unlabeled test set and run it through the same preprocessing.
test_data = preproc_data(pd.read_csv("test_nolabel.tsv", sep='\t'))

In [None]:
# Use every training fold as history for the encoders. The folds are
# DataFrames, so they are joined with pd.concat to keep the column labels
# that the feature helpers index by.
historical_data = pd.concat(folds)

calc_categorical_features, len_features = get_categorical_features(historical_data)
vectorizer_name_text, vectorizer_desc_text = get_tf_idfs(historical_data)

# Encode each test row with the encoders fitted on the training history.
ans_features = np.array([
    get_all_features_from_row(
        row,
        calc_categorical_features,
        len_features,
        vectorizer_name_text,
        vectorizer_desc_text,
    )
    for _, row in tqdm_notebook(test_data.iterrows(), total=len(test_data))
])

In [None]:
# Rank the evaluated configurations by mean score (ascending), take the model
# of the last, i.e. best-scoring, entry and predict positive-class probabilities.
ranked_evaluation = sorted(evaluation, key=lambda entry: entry[0])
best_model = ranked_evaluation[-1][2]
prediction = best_model.predict_proba(ans_features)[:, 1]

In [None]:
# Build the submission as a new frame: `assign` returns a copy, so no column
# is written onto a selection of `test_data` (avoids SettingWithCopyWarning).
answer = test_data[["product_id"]].assign(score=prediction)
answer.to_csv("answer.csv", index=False)