In [1]:
import pandas as pd
import numpy as np

from datetime import datetime

In [2]:
# импорт моделей
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

In [3]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import mean_squared_error, confusion_matrix

In [4]:
from sklearn.preprocessing import scale
from sklearn import metrics
from sklearn.base import clone
from sklearn import preprocessing

#графики
import pylab as pl
import matplotlib.pyplot as plt

In [5]:
def uplift_fit_predict(model, X_train, treatment_train, target_train, X_test):
    """
    Реализация простого способа построения uplift-модели.
    
    Обучаем два бинарных классификатора, которые оценивают вероятность target для клиента:
    1. с которым была произведена коммуникация (treatment=1)
    2. с которым не было коммуникации (treatment=0)
    
    В качестве оценки uplift для нового клиента берется разница оценок вероятностей:
    Predicted Uplift = P(target|treatment=1) - P(target|treatment=0)
    """
    X_treatment, y_treatment = X_train[treatment_train == 1, :], target_train[treatment_train == 1]
    X_control, y_control = X_train[treatment_train == 0, :], target_train[treatment_train == 0]
    model_treatment = clone(model).fit(X_treatment, y_treatment)
    model_control = clone(model).fit(X_control, y_control)
    predict_treatment = model_treatment.predict_proba(X_test)[:, 1]
    predict_control = model_control.predict_proba(X_test)[:, 1]
    predict_uplift = predict_treatment - predict_control
    
    return predict_uplift

In [6]:
def uplift_score(prediction, treatment, target, rate=0.3):
    """Подсчет Uplift Score"""
    order = numpy.argsort(-prediction)
    treatment_n = int((treatment == 1).sum() * rate)
    treatment_p = target[order][treatment[order] == 1][:treatment_n].mean()
    control_n = int((treatment == 0).sum() * rate)
    control_p = target[order][treatment[order] == 0][:control_n].mean()
    score = treatment_p - control_p
    
    return score

# Чтение данных

In [9]:
df_products = pd.read_csv('../data_in/products.csv')

In [10]:
df_purchases = pd.read_csv('../data_in/purchases.csv')

In [11]:
df_clients = pd.read_csv('../data_in/clients.csv', index_col='client_id')
df_train = pd.read_csv('../data_in/uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('../data_in/uplift_test.csv', index_col='client_id')

# Извлечение признаков

извличение из таблицы клиенов

In [12]:
df_clients['first_issue_unixtime'] = pd.to_datetime(df_clients['first_issue_date']).astype(int)/10**9
df_clients['first_redeem_unixtime'] = pd.to_datetime(df_clients['first_redeem_date']).astype(int)/10**9
df_features = pd.DataFrame({
    'gender_M': (df_clients['gender'] == 'M').astype(int),
    'gender_F': (df_clients['gender'] == 'F').astype(int),
    'gender_U': (df_clients['gender'] == 'U').astype(int),
    'age': df_clients['age'],
    'first_issue_time': df_clients['first_issue_unixtime'],
    'first_redeem_time': df_clients['first_redeem_unixtime'],
    'issue_redeem_delay': df_clients['first_redeem_unixtime'] - df_clients['first_issue_unixtime'],
}).fillna(0)


In [13]:
df_features['age'] = df_features['age'].apply(lambda x: 101 if x<0 or x>90 else x)
df_features['age_group_kk'] = pandas.cut(df_features['age'], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 101]) 

NameError: name 'pandas' is not defined

In [None]:
df_age_group = pandas.get_dummies(df_features['age_group_kk'],
                                 prefix="age_group_kk",
                                 drop_first=True)

извличение из таблицы заказы

In [None]:
#количество всех покупок, максимальная сумма покупки, минимальная, средняя, 
df_ps = df_purchases.groupby('client_id')['purchase_sum'].agg(
    [('purchase_all_sum', lambda a: sum(list(a.unique()))), 
     ('purchase_all_min', 'min'),
     ('purchase_all_max', 'max'),
     ('purchase_all_mean', lambda a: a.unique().mean()),
     ('purchase_all_std', lambda a: a.unique().std()),
     ('transaction_count', lambda x: x.nunique())])


#df_purchase_a = df_purchas.groupby('client_id')['purchase_sum'].agg(lambda x: x.nunique())

#df_purchase_a = df_purchas.groupby('client_id')['purchase_sum'].agg(
#    {'purchase_counts': lambda x: x.nunique()})

#df_purchas.set_index('client_id', inplace=True)

regular_points_received

In [None]:
df_rpr = df_purchases.groupby('client_id')['regular_points_received'].agg(
    [('rpr_median', lambda a: a.median()), 
     ('rpr_min', 'min'),
     ('rpr_max', 'max'),
     ('rpr_mean', lambda a: a.unique().mean()),
     ('rpr_std', lambda a: a.unique().std())])

количество магазинов которые посетил клиент

In [None]:
df_store = df_purchases.groupby('client_id')['store_id'].agg([('store_count', lambda a: len(set(a)))])

# Объединение признаков покупок

In [None]:
df_normaliz = pd.concat([df_ps, df_rpr, df_store], axis=1)

# Нормализация количественных признаков

In [None]:
#Нормализация количественных признаков
normlz = ['purchase_all_sum', 'purchase_all_min', 'purchase_all_max', 
          'purchase_all_mean', 'purchase_all_std', 'transaction_count',
          'rpr_median', 'rpr_min', 'rpr_max', 'rpr_mean', 'rpr_std',
          'store_count']

df_normaliz = df_features[normlz]
df_normaliz = (df_normaliz - df_normaliz.mean()) / df_normaliz.std()
df_normaliz.head()


# Объединение всех фич

In [None]:
df_features = pd.concat([df_features, df_normaliz, df_age_group], axis=1)
df_features.head()

# Очистка названий колонок от не читаемых символов

In [None]:
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
df_features.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in df_features.columns.values]

In [None]:
df_features.head()

# Удаление не нужных колонок

In [None]:
df_features.drop(['age_group_kk'], axis=1, inplace=True)

# Оценка качества моделей на валидации

In [None]:
itog_val = {}
kfold = 5
random_state = 757

In [None]:
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.3, random_state=random_state)

X = df_features.loc[indices_learn, :].values
y = df_train.loc[indices_learn, 'target'].values

print(X.shape, y.shape)

In [None]:
# Модель RandomForestClassifier
model_rfc = RandomForestClassifier(random_state=random_state,
                                   max_depth=9, 
                                   min_samples_leaf=1,
                                   min_samples_split=4,
                                   n_estimators=180)

In [None]:
# Модель GradientBoostingClassifier
model_gbt = GradientBoostingClassifier(learning_rate=0.1,
                                       min_samples_leaf=6,
                                       min_samples_split=2,
                                       n_estimators=200,
                                       random_state=random_state)

In [None]:
# Модель XGBClassifier
model_xgbc = xgb.XGBClassifier(max_depth=10, 
                               min_child_weight=1,
                               n_estimators=400, 
                               n_jobs=-1,
                               verbose=1, 
                               learning_rate=0.15,
                               seed=42, 
                               random_state=random_state)

In [None]:
# модель LightGBM
model_lgb = lgb.LGBMClassifier(silent=False,
                              max_depth=4,
                              learning_rate=0.01,
                              num_leaves=60,
                              n_estimators=300,
                              random_state=random_state)

In [None]:
scores = cross_val_score(model_rfc, X, y, cv=kfold)
itog_val['RandomForestClassifier'] = scores.mean()

In [None]:
scores = cross_val_score(model_gbt, X, y, cv=kfold)
itog_val['GradientBoostingClassifier'] = scores.mean()

In [None]:
scores = cross_val_score(model_xgbc, X, y, cv=kfold)
itog_val['XGBClassifier'] = scores.mean()

In [None]:
scores = cross_val_score(model_lgb, X, y, cv=kfold)
itog_val['LightGBM'] = scores.mean()

In [None]:
df_train.from_dict(data=itog_val, orient='index').plot(kind='bar', legend=False)

# Валидация моделей

In [None]:
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(df_train.index, test_size=0.33, random_state=random_state)


valid_uplift = uplift_fit_predict(model=GradientBoostingClassifier(),
                                  X_train=df_features.loc[indices_learn, :].fillna(0).values,
                                  treatment_train=df_train.loc[indices_learn, 'treatment_flg'].values,
                                  target_train=df_train.loc[indices_learn, 'target'].values,
                                  X_test=df_features.loc[indices_valid, :].fillna(0).values,)

valid_score = uplift_score(valid_uplift,
                           treatment=df_train.loc[indices_valid, 'treatment_flg'].values,
                           target=df_train.loc[indices_valid, 'target'].values,)
print('Результат вализации:', valid_score)

In [None]:
'''
Validation score: GradientBoostingClassifier - 0.07547646591869583
Validation score: model_gbc - 0.07364784854486317

Validation score: xgb.XGBClassifier - 
Validation score: model_xgbc - 

Validation score: RandomForestClassifier - 
Validation score: model_rfc - 

Validation score: lgb.LGBMClassifier - 
Validation score: model_lgb - 
'''

# Подготовка предсказаний для тестовых клиентов

In [None]:
test_uplift = uplift_fit_predict(
    model=GradientBoostingClassifier(),
    X_train=df_features.loc[indices_train, :].fillna(0).values,
    treatment_train=df_train.loc[indices_train, 'treatment_flg'].values,
    target_train=df_train.loc[indices_train, 'target'].values,
    X_test=df_features.loc[indices_test, :].fillna(0).values,
)

# Сохранение предсказания

In [None]:
date_current = datetime.today().strftime('%d_%m_%H')
df_submission = pandas.DataFrame({'uplift': test_uplift}, index=df_test.index)
df_submission.to_csv(f'../data_out/submission_{date_current}.csv')
print('file saved!')

# Best public csore: 0,0782