In [1]:
#!pip install pylift

In [2]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import datetime
import matplotlib.pyplot as plt
import functools
import scipy as sp

from typing import List, Union, Optional, Tuple
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import seaborn as sns
import matplotlib.pyplot as plt

import pylift
import pylift_utils

#import catboost as cb
#import lightgbm as lgb

import causalml
import causalml.metrics as cmetrics
import causalml.inference.tree as ctree
import causalml.inference.meta.xlearner as xlearner
import causalml.inference.meta.rlearner as rlearner
import causalml.inference.meta.tlearner as tlearner

import warnings
warnings.filterwarnings("ignore")

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
Failed to import duecredit due to No module named 'duecredit'


## Libs

In [3]:
def compute_receipts_features(
                                receipts: pd.DataFrame,
                                date_to: int,
                                num_days: int,
                             ) -> pd.DataFrame:
    #date_to = datetime.datetime.combine(date_to, datetime.datetime.min.time())
    date_from = date_to - num_days
    date_mask = (receipts['date'] >= date_from) & (receipts['date'] < date_to)
    
    features = (
        receipts
        .loc[date_mask]
       # .assign(points_spent=lambda d: d['regular_points_spent'] + d['express_points_spent'])
       # .assign(points_spent_flag=lambda d: (d['points_spent'] < 0).astype(int))
       # .assign(express_points_spent_flag=lambda d: (d['express_points_spent'] < 0).astype(int))
        .groupby(by=['customer_id'])
        .agg({
            "date": ["count", "min", "max"],
            "purchase_amt": ["sum", "max", "min", "mean", "median"],
            "purchase_sum": ["sum", "max", "min", "mean", "median"],
            "discount": ["sum"]  #  "max", "min"
        })
        .assign(
            mean_time_interval=lambda d: (
                (d['date']['max'] - d['date']['min'])
                / (d['date']['count'] - 1)
            )#.apply(lambda delta: delta.total_seconds() / (24 * 3600))
        )
        .assign(
            time_since_last=lambda d: (
                date_to - d['date']['max']
            )#.apply(lambda delta: delta.total_seconds() / (24 * 3600))
        )
    )

    column_names = [
        f'rec__{col}__{agg}__{num_days}d' if agg else f'rec__{col}__{num_days}d'
        for col, agg in features.columns
    ]
    features.columns = column_names
    features.reset_index(inplace=True)

    return features

def proportion_diff_interval(success: pd.Series, nobs: pd.Series, conf_level: float = 0.05):
    assert len(success) == 2 and len(nobs) == 2
    p = np.array(success) / np.array(nobs)
    z = sp.stats.norm.ppf(1.0 - conf_level)
    diff = p[1] - p[0]
    std_dev = np.sqrt(p[0] * (1.0 - p[0]) / nobs[0] + p[1] * (1.0 - p[1]) / nobs[1])
    pvalue = 2 * sp.stats.norm.cdf(-np.abs(diff) / std_dev)
    return diff, std_dev, pvalue, (diff - z * std_dev, diff + z * std_dev)


def response_uplift(
    data: pd.DataFrame,
    col_feature: str,
    col_target: str,
    col_treatment: str = 'treatment',
    conf_level: float = 0.05,
    verbose: bool = True,
    figsize: Tuple[int, int] = (10, 7),
    plot_type: str = 'default'
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    tmp = data.copy()
    tmp['tmp_response'] = list(map(int, tmp[col_target] > 0))
    
    response_pivot = tmp.pivot_table(
        values='tmp_response', index=[col_feature], columns=[col_treatment],
        aggfunc='sum'
    )
    
    clients_pivot = tmp.pivot_table(
        values='tmp_response', index=[col_feature], columns=[col_treatment],
        aggfunc='count'
    )
    
    means = list()
    stds = list()
    pvalues = list()
    chi2_pvalues = list()
    
    for segment in response_pivot.index:
        m, std, pvalue, _ = proportion_diff_interval(response_pivot.loc[segment, :], clients_pivot.loc[segment, :], conf_level=conf_level)
        means.append(m)
        stds.append(std)
        pvalues.append(pvalue)
        
        try:
            chi2_pvalue = chi2_contingency([response_pivot.loc[segment, :], clients_pivot.loc[segment, :]])[1]
        except:
            chi2_pvalue = -1.0
        
        chi2_pvalues.append(chi2_pvalue)
        
    report = pd.DataFrame(
        data={
            'diff': means, 
            'std': stds, 
            'p-value': pvalues,
            'chi2_p-value': chi2_pvalues,
            'count_0': clients_pivot.loc[:, 0],
            'count_1': clients_pivot.loc[:, 1],
            'response_0': response_pivot.loc[:, 0],
            'response_1': response_pivot.loc[:, 1]
        }, 
        index=list(response_pivot.index)
    )
    
    if verbose:
        plt.figure(figsize=figsize)
        if plot_type == 'default':
            plt.errorbar(
                x=response_pivot.index,
                y=means,
                yerr=np.array(stds) * sp.stats.norm.ppf(1.0 - conf_level),
                fmt='ok'
            )
        elif plot_type == 'bin':
            plt.errorbar(
                x=range(len(response_pivot.index)),
                y=means,
                yerr=np.array(stds) * sp.stats.norm.ppf(1.0 - conf_level),
                fmt='ok'
            )
            plt.xticks(
                range(len(response_pivot.index)),
                response_pivot.index,
                rotation=20
            )
        plt.show()

    return report, response_pivot, clients_pivot


## Load data

### receipts

- customer_id - ID покупателя
- date - день покупки
- purchase_amt - количество купленного фисштеха (в граммах)
- purchase_sum - стоимость покупки (в оренах)

Стоит отметить, что покупатель не приобретает фисштех больше одного раза в день

In [4]:
receipts = pd.read_parquet('./final/data/receipts.parquet')
receipts.head()

Unnamed: 0,customer_id,date,purchase_amt,discount,purchase_sum
26,26,0,12.125273,0,970.021866
53,53,0,34.357035,0,2748.562788
56,56,0,8.695825,0,695.666022
58,58,0,28.299046,0,2263.923641
64,64,0,32.547511,0,2603.800857


### campaigns

- customer_id - ID покупателя
- date - первый день действия скидки
- n_offer_days - длительность (в днях) действия скидки
- target_group_flag - флаг целевой (1) / контрольной (0) группы

In [5]:
campaigns = pd.read_csv('./final/data/campaigns.csv').drop(columns=['Unnamed: 0'])
campaigns.head()

Unnamed: 0,customer_id,n_offer_days,date,target_group_flag
0,0,7,102,1
1,1,7,102,1
2,2,7,102,1
3,3,7,102,1
4,4,7,102,1


### customers

- customer_id - ID покупателя
- age - возраст покупателя
- location - место проживания покупателя

In [6]:
customers = pd.read_csv('./final/data/customers.csv').drop(columns=['Unnamed: 0'])
customers.head()

Unnamed: 0,customer_id,age,location
0,0,56,Oxenfurt
1,1,53,Hindarsfjall
2,2,41,Hindarsfjall
3,3,60,Kaer_Trolde
4,4,22,Spikeroog


### shapes: customers, campaigns, receipts 

In [7]:
customers.shape, campaigns.shape, receipts.shape

((2000000, 3), (500000, 4), (31741769, 5))

Прибыль = (
                (
                    (Выручка_в_ЦГ - Себестоимость_в_ЦГ) / Размер_ЦГ -
                    (Выручка_в_КГ - Себестоимость_в_КГ) / Размер_КГ
                ) * Размер_ЦГ
                    - Затраты_на_скидки - Затраты_на_ворон
            ) * ((Размер_ЦГ + Размер_КГ) / Размер_ЦГ)

## Cook train data 

### data = customer + receipts_features + campaigns

In [37]:
# Интервалы для расчета фич
num_days_list = [10, 100]

data = customers.copy()

print("Merging: customers -> receipts features")
for num_days in num_days_list:
    _part = compute_receipts_features(receipts, 102, num_days=num_days)
    print(f"{num_days=}: _part.shape={_part.shape}")
    data = data.merge(_part, on='customer_id', how='left')
    del _part

print("Merging: -> campaigns") 
data = data.merge(campaigns[['customer_id', 'target_group_flag']], on='customer_id', how='left').fillna(0)

print(f"data.shape={data.shape}")
data.head()

Merging: customers -> receipts features
num_days=10: _part.shape=(1044205, 17)
num_days=100: _part.shape=(1946485, 17)
Merging: -> campaigns
data.shape=(2000000, 36)


Unnamed: 0,customer_id,age,location,rec__date__count__10d,rec__date__min__10d,rec__date__max__10d,rec__purchase_amt__sum__10d,rec__purchase_amt__max__10d,rec__purchase_amt__min__10d,rec__purchase_amt__mean__10d,...,rec__purchase_amt__median__100d,rec__purchase_sum__sum__100d,rec__purchase_sum__max__100d,rec__purchase_sum__min__100d,rec__purchase_sum__mean__100d,rec__purchase_sum__median__100d,rec__discount__sum__100d,rec__mean_time_interval__100d,rec__time_since_last__100d,target_group_flag
0,0,56,Oxenfurt,1.0,93.0,93.0,12.028087,12.028087,12.028087,12.028087,...,11.397987,8963.796788,1104.252876,389.772041,814.890617,911.838955,0.0,9.0,9.0,1.0
1,1,53,Hindarsfjall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,12.411541,5652.043648,1141.662891,610.655207,942.007275,992.923293,0.0,13.2,20.0,1.0
2,2,41,Hindarsfjall,2.0,97.0,100.0,39.13365,34.363092,4.770558,19.566825,...,26.91846,16108.37486,2966.189969,381.644612,2013.546857,2153.476803,0.0,12.857143,2.0,1.0
3,3,60,Kaer_Trolde,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,32.523289,16666.748849,3391.417982,209.890698,2083.343606,2601.863154,0.0,10.285714,24.0,1.0
4,4,22,Spikeroog,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.580492,3292.878795,1780.260418,1512.618377,1646.439397,1646.439397,0.0,20.0,16.0,1.0


In [9]:
data.columns

Index(['customer_id', 'age', 'location', 'rec__date__count__10d',
       'rec__date__min__10d', 'rec__date__max__10d',
       'rec__purchase_amt__sum__10d', 'rec__purchase_amt__max__10d',
       'rec__purchase_amt__min__10d', 'rec__purchase_amt__mean__10d',
       'rec__purchase_amt__median__10d', 'rec__purchase_sum__sum__10d',
       'rec__purchase_sum__max__10d', 'rec__purchase_sum__min__10d',
       'rec__purchase_sum__mean__10d', 'rec__purchase_sum__median__10d',
       'rec__discount__sum__10d', 'rec__mean_time_interval__10d',
       'rec__time_since_last__10d', 'rec__date__count__100d',
       'rec__date__min__100d', 'rec__date__max__100d',
       'rec__purchase_amt__sum__100d', 'rec__purchase_amt__max__100d',
       'rec__purchase_amt__min__100d', 'rec__purchase_amt__mean__100d',
       'rec__purchase_amt__median__100d', 'rec__purchase_sum__sum__100d',
       'rec__purchase_sum__max__100d', 'rec__purchase_sum__min__100d',
       'rec__purchase_sum__mean__100d', 'rec__purchase_su

### data = data + profit from client

- Цена 1 г фисштеха равна 80 оренам. Себестоимость равна 52 оренам
- Отправка одного сообщения вороном стоит 1 орен
- У "Хитрого лиса" есть конкурент 😉 - сеть таверн "Семь котов", в которых тоже продают фисштех

In [38]:
MARGIN_RATE = (100 - 5200/80)/100
COMMUNICATION_COST = 1
num_days = 30

_columns = [
    'customer_id',
    'rec__purchase_sum__sum__30d',
#    'rec__purchase_amt__sum__30d',
    'rec__discount__sum__30d'
]

_part = compute_receipts_features(receipts, 102 + num_days, num_days=num_days)  # Интервал 30 дней отсчитываем от начала кампании
_part = _part[_columns].merge(campaigns[['customer_id', 'target_group_flag']], on='customer_id', how='left').fillna(0) 

_part = _part.assign(target_profit=lambda d: (
                          d['rec__purchase_sum__sum__30d'] * MARGIN_RATE 
                        - d['rec__discount__sum__30d']
                        - (d['target_group_flag']).astype(int) * COMMUNICATION_COST
                    ))

data = data.merge(_part[_columns + ['target_profit']], on='customer_id', how='left').fillna(0)
del _part

data[_columns + ['target_group_flag','target_profit']].head()

Unnamed: 0,customer_id,rec__purchase_sum__sum__30d,rec__discount__sum__30d,target_group_flag,target_profit
0,0,2538.766684,40.0,1.0,847.568339
1,1,2520.087704,0.0,1.0,881.030697
2,2,4199.119407,0.0,1.0,1468.691792
3,3,7573.037092,40.0,1.0,2609.562982
4,4,1033.298545,0.0,1.0,360.654491


In [39]:
data['target_profit'].describe()

count    2.000000e+06
mean     8.767312e+02
std      6.972007e+02
min     -1.140104e+01
25%      3.210385e+02
50%      7.932915e+02
75%      1.325049e+03
max      3.979691e+03
Name: target_profit, dtype: float64

## MODEL

In [49]:
col_target = 'target_profit'
col_treatment = 'target_group_flag'
col_features = [
       # 'customer_id', 
       'age',
       #'location', 
       'rec__date__count__10d',
       'rec__date__min__10d', 'rec__date__max__10d',
       'rec__purchase_amt__sum__10d', 'rec__purchase_amt__max__10d',
       'rec__purchase_amt__min__10d', 'rec__purchase_amt__mean__10d',
       'rec__purchase_amt__median__10d',
       'rec__purchase_sum__sum__10d',
       'rec__purchase_sum__max__10d', 'rec__purchase_sum__min__10d',
       'rec__purchase_sum__mean__10d', 'rec__purchase_sum__median__10d',
#       'rec__discount__sum__10d', 
       'rec__mean_time_interval__10d',
       'rec__time_since_last__10d', 
    
       'rec__date__count__100d',
       'rec__date__min__100d', 'rec__date__max__100d',
       'rec__purchase_amt__sum__100d', 'rec__purchase_amt__max__100d',
       'rec__purchase_amt__min__100d', 'rec__purchase_amt__mean__100d',
       'rec__purchase_amt__median__100d', 
        'rec__purchase_sum__sum__100d',
       'rec__purchase_sum__max__100d', 'rec__purchase_sum__min__100d',
       'rec__purchase_sum__mean__100d', 'rec__purchase_sum__median__100d',
#      'rec__discount__sum__100d', 
       'rec__mean_time_interval__100d',
       'rec__time_since_last__100d', 
    
       #'rec__purchase_sum__sum__30d', 'rec__purchase_amt__sum__30d',
       #'rec__discount__sum__30d',
    
       #'target_profit', 'target_group_flag'
]


In [41]:
fit_index, val_index = train_test_split(data.index, test_size=0.5, random_state=165)

In [42]:
#fit_index.shape, val_index.shape, 
data.index.shape

(2000000,)

In [15]:
import causalml.inference.meta.rlearner as rlearner
import lightgbm as lgb

In [50]:
model_rl = rlearner.BaseRRegressor(
    learner=lgb.LGBMRegressor(),
    control_name=0,
    n_fold=4,
    random_state=165
)

In [None]:
_p_score_const = data.loc[fit_index, col_treatment].value_counts(normalize=True).loc[1]
_p_scores = pd.Series(data=_p_score_const, index=fit_index)

model_rl.fit(
    X=data.loc[fit_index, col_features].values,
    treatment=data.loc[fit_index, col_treatment].values,
    y=data.loc[fit_index, col_target].values,
    p=_p_scores
)

In [None]:
_eval_data = data.loc[val_index, col_features + [col_treatment, col_target]]
_eval_data['score'] = model_rl.predict(_eval_data[col_features])
_eval_data = _eval_data[['score', col_treatment, col_target]]

In [None]:
_eval_data['score'].describe()

In [None]:
cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

# ДАЛЬШЕ НЕ ТРОГАЛ :)

## DATA FOR PREDICTION

In [None]:
receipts_1 = compute_receipts_features(receipts, 250, num_days=10)
receipts_2 = compute_receipts_features(receipts, 250, num_days=15)
receipts_3 = compute_receipts_features(receipts, 250, num_days=50)
receipts_4 = compute_receipts_features(receipts, 250, num_days=100)

In [None]:
receipts_1.to_csv('data/receipts_1.csv', index=False)
receipts_2.to_csv('data/receipts_2.csv', index=False)
receipts_3.to_csv('data/receipts_3.csv', index=False)
receipts_4.to_csv('data/receipts_4.csv', index=False)

In [None]:
#customers = customers.drop(columns = ['Unnamed: 0'])

data_for_pred = customers.copy()
for part in [receipts_1, receipts_2, receipts_3, receipts_4]:
    data_for_pred = data_for_pred.merge(part, on='customer_id', how='left') 
    
data_for_pred = data_for_pred.merge(campaigns[['customer_id', 'target_group_flag']], on='customer_id', how='left').fillna(0) 
data_for_pred

In [None]:
# возраст меняем на вот этот признак - качество немного увеличивается

data_for_pred['age_more_50'] = data_for_pred['age'].apply(lambda x: 1 if x>50 else 0)

In [None]:
data_for_pred.to_csv('data/data_for_pred.csv', index=False)

In [None]:
data_for_pred = pd.read_csv('data/data_for_pred.csv')
data_for_pred.shape

In [None]:
data_for_pred = data_part.merge(data_for_pred[list(set(data_for_pred.columns) - set(['age', 'location']))], 
                                               on='customer_id', how='left')

In [None]:
data_for_pred.to_csv('data/data_for_pred.csv', index = False)

In [None]:
data_for_pred = pd.read_csv('data/data_for_pred.csv')
data_for_pred

In [None]:
data_for_pred = data_for_pred.merge(receipts_camp, on='customer_id', how='left').fillna(0)
data_for_pred

In [None]:
data_for_pred.to_csv('data/data_for_pred.csv', index = False)

In [None]:
data_for_pred = pd.read_csv('data/data_for_pred.csv')
data_for_pred.shape

## MODEL

In [None]:
col_target = 'target_profit'
col_treatment = 'target_group_flag'

In [None]:
cols_features = [
       'location',
       'rec__date__count__10d',
       # 'rec__date__min__10d', 'rec__date__max__10d',
       'rec__purchase_amt__sum__10d', 'rec__purchase_amt__max__10d',
       'rec__purchase_amt__min__10d', 'rec__purchase_amt__mean__10d',
       'rec__purchase_amt__median__10d', 'rec__purchase_sum__sum__10d',
       'rec__purchase_sum__max__10d', 'rec__purchase_sum__min__10d',
       'rec__purchase_sum__mean__10d', 'rec__purchase_sum__median__10d',
       # 'rec__discount__sum__10d', 
       'rec__mean_time_interval__10d',
       'rec__time_since_last__10d', 
    
#       'rec__date__count__15d',
       # 'rec__date__min__15d', 'rec__date__max__15d',
#       'rec__purchase_amt__sum__15d', 'rec__purchase_amt__max__15d',
#       'rec__purchase_amt__min__15d', 'rec__purchase_amt__mean__15d',
#       'rec__purchase_amt__median__15d', 'rec__purchase_sum__sum__15d',
#       'rec__purchase_sum__max__15d', 'rec__purchase_sum__min__15d',
#       'rec__purchase_sum__mean__15d', 'rec__purchase_sum__median__15d',
       # 'rec__discount__sum__15d', 
#       'rec__mean_time_interval__15d',
#       'rec__time_since_last__15d', 
    
#       'rec__date__count__50d',
       # 'rec__date__min__50d', 'rec__date__max__50d',
#       'rec__purchase_amt__sum__50d', 'rec__purchase_amt__max__50d',
#       'rec__purchase_amt__min__50d', 'rec__purchase_amt__mean__50d',
#       'rec__purchase_amt__median__50d', 'rec__purchase_sum__sum__50d',
#       'rec__purchase_sum__max__50d', 'rec__purchase_sum__min__50d',
#       'rec__purchase_sum__mean__50d', 'rec__purchase_sum__median__50d',
       # 'rec__discount__sum__50d', 
#       'rec__mean_time_interval__50d',
#       'rec__time_since_last__50d', 
    
       'rec__date__count__100d',
       # 'rec__date__min__100d', 'rec__date__max__100d',
       'rec__purchase_amt__sum__100d', 'rec__purchase_amt__max__100d',
       'rec__purchase_amt__min__100d', 'rec__purchase_amt__mean__100d',
       'rec__purchase_amt__median__100d', 'rec__purchase_sum__sum__100d',
       'rec__purchase_sum__max__100d', 'rec__purchase_sum__min__100d',
       'rec__purchase_sum__mean__100d', 'rec__purchase_sum__median__100d',
       # 'rec__discount__sum__100d', 
       'rec__mean_time_interval__100d',
       'rec__time_since_last__100d',
    
    
 #      'rec__date__count__7d',
       # 'rec__date__min__100d', 'rec__date__max__100d',
 #      'rec__purchase_amt__sum__7d', 'rec__purchase_amt__max__7d',
 #      'rec__purchase_amt__min__7d', 'rec__purchase_amt__mean__7d',
 #      'rec__purchase_amt__median__7d', 'rec__purchase_sum__sum__7d',
 #      'rec__purchase_sum__max__7d', 'rec__purchase_sum__min__7d',
 #      'rec__purchase_sum__mean__7d', 'rec__purchase_sum__median__7d',
       # 'rec__discount__sum__100d', 
 #      'rec__mean_time_interval__7d',
 #      'rec__time_since_last__7d',
    
    
#       'age_more_50'
    
]

# 'age', 

## T-learner - можно не смотреть!

In [None]:
import causalml.inference.meta.tlearner as tlearner

In [None]:
from xgboost import XGBRegressor

In [None]:
model_n11 = tlearner.BaseTRegressor(
    learner=lgb.LGBMRegressor(),    # XGBRegressor()
    control_name=0
)

In [None]:
model_n11.fit(
    X=data.loc[fit_index, cols_features].values,
    treatment=data.loc[fit_index, col_treatment].values,
    y=data.loc[fit_index, col_target].values,
)

In [None]:
_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_n11.predict(_eval_data[cols_features])
_eval_data = _eval_data[['score', col_treatment, col_target]]

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(_eval_data['score'].head(1000), bins=100)
plt.show()

In [None]:
cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.auuc_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_all_data = data.loc[:, cols_features + [col_treatment, col_target]]
_all_data['score'] = model_n11.predict(_all_data[cols_features])
_all_data = _all_data[['score', col_treatment, col_target]]

In [None]:
cmetrics.plot_gain(
    _all_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _all_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

#### дальше все расчеты - на таргете в 7 дней (с разными фичами) - качество хуже

In [None]:
model_n11.fit(
    X=data.loc[:, cols_features].values,
    treatment=data.loc[:, col_treatment].values,
    y=data.loc[:, col_target].values,
)

In [None]:
_train_data = data.loc[:, cols_features + [col_treatment, col_target]]
_train_data['score'] = model_n11.predict(_train_data[cols_features])
_train_data = _train_data[['score', col_treatment, col_target]]

cmetrics.plot_gain(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
s = _train_data.sort_values(by = ['score'], ascending=False).reset_index().rename(columns={'index': 'customer_id'}).iloc[:1000000]
s

In [None]:
s = _train_data.sort_values(by = ['score'], ascending=False).reset_index().rename(columns={'index': 'customer_id'}).iloc[:1000000]
s['customer_id'].to_csv('data_3.csv', index=False)

In [None]:
_train_data = data.loc[:, cols_features + [col_treatment, col_target]]
_train_data['score'] = model_n11.predict(_train_data[cols_features])
_train_data = _train_data[['score', col_treatment, col_target]]

cmetrics.plot_gain(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
0.12 * 1000000

In [None]:
cmetrics.auuc_score(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_train_data

In [None]:
s = _train_data.sort_values(by = ['score'], ascending=False).reset_index().rename(columns={'index': 'customer_id'}).iloc[:120000]
s

In [None]:
s['customer_id'].to_csv('data_3.csv', index=False)

In [None]:
_train_data[_train_data.score > 0].target_profit.hist()

In [None]:
_train_data[_train_data.score < 0].target_profit.hist()

In [None]:
_train_data.score.hist(bins=30)

In [None]:
_train_data[_train_data.score > 0].score.hist()

In [None]:
_train_data[_train_data.score < 0].score.hist()

In [None]:
_train_data[(_train_data.score > 0)&(_train_data.target_profit > 0)]

In [None]:
_train_data = data.loc[fit_index, cols_features + [col_treatment, col_target]]
_train_data['score'] = model_n11.predict(_train_data[cols_features])
_train_data = _train_data[['score', col_treatment, col_target]]

cmetrics.plot_gain(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_train_data.shape

In [None]:
_train_data

In [None]:
_train_data.iloc[610000:900000]

In [None]:
_train_data[_train_data.score > -0.9].target_profit.hist()#.target_profit.sum()

In [None]:
_train_data[_train_data.score <= -0.9].target_profit.hist()

In [None]:
_train_data[_train_data.target_profit >= -5].score.hist()

In [None]:
_train_data[_train_data.target_profit < -5].score.hist()

In [None]:
_train_data.target_group_flag.value_counts()

In [None]:
df = _train_data[_train_data.score > 0].reset_index()
df['customer_id'] = df['index']
df['customer_id']

In [None]:
df['customer_id'].to_csv('project_3.csv', index = False)

## Xlearner - можно не смотреть!

In [None]:
import causalml.inference.meta.xlearner as xlearner

In [None]:
model_n4 = xlearner.BaseXRegressor(
    learner=lgb.LGBMRegressor(),
    control_name=0
)

In [None]:
data.loc[fit_index, col_treatment].value_counts(normalize=True)#.loc[1]

In [None]:
_p_score_const = data.loc[fit_index, col_treatment].value_counts(normalize=True).loc[1]
_p_scores = pd.Series(data=_p_score_const, index=fit_index)
_p_scores

In [None]:
model_n4.fit(
    X=data.loc[fit_index, cols_features].values,
    treatment=data.loc[fit_index, col_treatment].values, # .apply(lambda t: 'treatment' if t == 1 else 'control').values,
    y=data.loc[fit_index, col_target].values,
    p=_p_scores
)

In [None]:
_p_scores = pd.Series(data=_p_score_const, index=val_index)

_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_n4.predict(_eval_data[cols_features], p=_p_scores)
_eval_data = _eval_data[['score', col_treatment, col_target]]

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(_eval_data['score'].head(1000), bins=100)
plt.show()

In [None]:
cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.auuc_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target
)

In [None]:
cmetrics.qini_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

## Rlearner

In [None]:
fit_index, val_index = train_test_split(data.index, test_size=0.2, random_state=165, stratifydata.target_group_flag) #0.5

In [None]:
###############
fit_index, val_index = train_test_split(data.index, test_size=0.5, random_state=165)

In [None]:
#fit_index.shape, val_index.shape, 
data.index.shape

In [None]:
data.columns

In [None]:
fit_index, val_index = train_test_split(data_train.index, test_size=0.5, random_state=165)

In [None]:
import causalml.inference.meta.rlearner as rlearner
import lightgbm as lgb

In [None]:
model_rl = rlearner.BaseRRegressor(
    learner=lgb.LGBMRegressor(),
    control_name=0,
    n_fold=4,
    random_state=165
)

In [None]:
_p_score_const = data.loc[fit_index, col_treatment].value_counts(normalize=True).loc[1]
_p_scores = pd.Series(data=_p_score_const, index=fit_index)

model_rl.fit(
    X=data.loc[fit_index, cols_features].values,
    treatment=data.loc[fit_index, col_treatment].values,
    y=data.loc[fit_index, col_target].values,
    p=_p_scores
)

In [None]:
_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_rl.predict(_eval_data[cols_features])
_eval_data = _eval_data[['score', col_treatment, col_target]]

_eval_data

In [None]:
cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_eval_data

In [None]:
_eval_data['score'].describe()

In [None]:
count    1000000.000000
mean          -2.600219
std           30.532790
min         -819.318235
25%          -16.991519
50%           -3.586985
75%           10.014556
max         1046.801232

In [None]:
del data

In [None]:
_all_data = data_for_pred.loc[:, cols_features]
_all_data['score'] = model_rl.predict(_all_data[cols_features])
_all_data = _all_data[['score']]

_all_data

In [None]:
s = _all_data.sort_values(by = ['score'], ascending=False).reset_index().rename(columns={'index': 'customer_id'}).iloc[:135000]
s['customer_id'].to_csv('result.csv', index=False)

In [None]:
s

In [None]:
s[s.score > 0]['customer_id'].to_csv('result.csv', index=False)

# Дальше можно не смотреть!

In [None]:
cmetrics.plot_gain(
    _all_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _all_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
700000

In [None]:
s = _all_data.sort_values(by = ['score'], ascending=False).reset_index().rename(columns={'index': 'customer_id'}).iloc[:450000]
s['customer_id'].to_csv('data.csv', index=False)

In [None]:
cols_features

In [None]:
_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_rl.predict(_eval_data[cols_features])
_eval_data = _eval_data[['score', col_treatment, col_target]]

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(_eval_data['score'].head(1000), bins=100)
plt.show()

In [None]:
#  это с добавлением признаков за 30 дней после начала акции - качество хуже

cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.auuc_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_all_data = data.loc[:, cols_features + [col_treatment, col_target]]
_all_data['score'] = model_rl.predict(_all_data[cols_features])
_all_data = _all_data[['score', col_treatment, col_target]]

In [None]:
cmetrics.plot_gain(
    _all_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
s = _all_data.sort_values(by = ['score'], ascending=False).reset_index().rename(columns={'index': 'customer_id'}).iloc[:700000]
s['customer_id'].to_csv('data.csv', index=False)

In [None]:
0.7 * 1000000

## TransformedOutcome

In [None]:
data['rec__discount__sum__7d'].unique()

In [None]:
 # не помогает

data['if_use_discount'] = data['rec__discount__sum__7d'].apply(lambda x: 1 if x>0 else 0) 

In [None]:
data['if_use_discount'].unique()

In [None]:
_ = response_uplift(
    data=data,            # .loc[fit_index]
    col_feature='rec__date__count__7d',
    col_target=col_target,
    col_treatment=col_treatment
)

In [None]:
cols_features = [
       'location', 'rec__date__count__30d',
       # 'rec__date__min__60d', 'rec__date__max__60d',
       'rec__purchase_amt__sum__30d', 'rec__purchase_amt__max__30d',
       'rec__purchase_amt__min__30d', 'rec__purchase_amt__mean__30d',
       'rec__purchase_amt__median__30d', 'rec__purchase_sum__sum__30d',
       'rec__purchase_sum__max__30d', 'rec__purchase_sum__min__30d',
       'rec__purchase_sum__mean__30d', 'rec__purchase_sum__median__30d',
       #'rec__discount__sum__60d', 
       'rec__mean_time_interval__30d',
       'rec__time_since_last__30d', 
    
       'rec__date__count__15d',
       # 'rec__date__min__15d', 'rec__date__max__15d',
       'rec__purchase_amt__sum__15d', 'rec__purchase_amt__max__15d',
       'rec__purchase_amt__min__15d', 'rec__purchase_amt__mean__15d',
       'rec__purchase_amt__median__15d', 'rec__purchase_sum__sum__15d',
       'rec__purchase_sum__max__15d', 'rec__purchase_sum__min__15d',
       'rec__purchase_sum__mean__15d', 'rec__purchase_sum__median__15d',
       #'rec__discount__sum__15d', 
       'rec__mean_time_interval__15d',
       'rec__time_since_last__15d', 
    
       # 'rec__date__count__7d',
       # 'rec__date__min__7d', 'rec__date__max__7d',
       'rec__purchase_amt__sum__7d', 'rec__purchase_amt__max__7d',
       'rec__purchase_amt__min__7d', 'rec__purchase_amt__mean__7d',
       'rec__purchase_amt__median__7d', 'rec__purchase_sum__sum__7d',
       'rec__purchase_sum__max__7d', 'rec__purchase_sum__min__7d',
       'rec__purchase_sum__mean__7d', 'rec__purchase_sum__median__7d',
       # 'rec__discount__sum__7d', 
       # 'rec__mean_time_interval__7d',
       # 'rec__time_since_last__7d'
    
       'age_more_50',
       'if_use_discount'
]

# 'age', 

In [None]:
model_n3 = pylift.TransformedOutcome(
    data.loc[fit_index, cols_features + [col_treatment, col_target]],
    col_treatment=col_treatment, col_outcome=col_target,
    test_size=2
)

In [None]:
model_n3.fit()

In [None]:
# когда таргет за 30 дней - модель не работает

model_n3.NIV(n_bins=10)

In [None]:
# все, что ниже - это с таргетом за 7 дней. С таким таргетом лучше качество показывала именно эта модель (но грейдер ничего не примимает)

In [None]:
model_n3.NIV(n_bins=10)

In [None]:
model_n3 = pylift.TransformedOutcome(
    data.loc[:, cols_features + [col_treatment, col_target]],
    col_treatment=col_treatment, col_outcome=col_target,
    test_size=2
)

model_n3.fit()

In [None]:
model_n3.NIV(n_bins=10)

In [None]:
model_n3 = pylift.TransformedOutcome(
    data.loc[:, cols_features + [col_treatment, col_target]],
    col_treatment=col_treatment, col_outcome=col_target,
    test_size=2
)

In [None]:
model_n3.fit()

In [None]:
model_n3.NIV(n_bins=10)

In [None]:
data

In [None]:
_train_data = data.loc[:, cols_features + [col_treatment, col_target]]
_train_data['score'] = model_n3.model.predict(_train_data[cols_features])
_train_data = _train_data[['score', col_treatment, col_target]]

In [None]:
cmetrics.plot_gain(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
s = _train_data.sort_values(by = ['score'], ascending=False).reset_index().rename(columns={'index': 'customer_id'}).iloc[:880000]
s['customer_id'].to_csv('data_3.csv', index=False)

In [None]:
s['customer_id']

In [None]:
_train_data = data.loc[fit_index, cols_features + [col_treatment, col_target]]
_train_data['score'] = model_n3.model.predict(_train_data[cols_features])
_train_data = _train_data[['score', col_treatment, col_target]]

In [None]:
_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_n3.model.predict(_eval_data[cols_features])
_eval_data = _eval_data[['score', col_treatment, col_target]]

In [None]:
data.loc[val_index, cols_features + [col_treatment, col_target]].target_group_flag.value_counts()

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(_eval_data['score'].head(1000), bins=100)
plt.show()

In [None]:
40000000/300000

In [None]:
cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_eval_data[_eval_data.score >= 0]

In [None]:
_eval_data[_eval_data.score >= 0].sort_values(by=['score'], ascending = False)

In [None]:
_eval_data[_eval_data.score >= 0].target_group_flag.value_counts()

In [None]:
_eval_data[_eval_data.score >= 0].target_profit.hist(bins=30)

In [None]:
_eval_data[_eval_data.score < 0].target_group_flag.value_counts()

In [None]:
cmetrics.plot_gain(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.plot_gain(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_train_data = data.loc[:, cols_features + [col_treatment, col_target]]
_train_data['score'] = model_n3.model.predict(_train_data[cols_features])
_train_data = _train_data[['score', col_treatment, col_target]]

In [None]:
s = _train_data.sort_values(by = ['score'], ascending=False).reset_index().rename(columns={'index': 'customer_id'}).iloc[:580000]
s['customer_id'].to_csv('data_3.csv', index=False)

In [None]:
s

In [None]:
_train_data[_train_data.score>0].reset_index().rename(columns={'index': 'customer_id'})#['customer_id'].to_csv('data_3.csv', index=False)

In [None]:
_train_data[_train_data.score>0].target_profit.hist(bins=30)

In [None]:
_train_data[_train_data.score<=0].target_profit.hist(bins=30)

In [None]:
_train_data.score.hist(bins=50)

In [None]:
_train_data

In [None]:
cmetrics.auuc_score(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.plot_gain(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
0.8*1000000

In [None]:
cmetrics.auuc_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.auuc_score(
    _train_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_eval_data[_eval_data.score >= 0].target_profit.hist(bins = 30)

In [None]:
_eval_data[_eval_data.score < 0].target_profit.hist(bins = 30)

In [None]:
df = _eval_data[_eval_data.score >= 0].reset_index()
df['customer_id'] = df['index']
df['customer_id']

In [None]:
_train_data.sort_values(by = ['score'], ascending = False)

In [None]:
df = _train_data.sort_values(by = ['score'], ascending = False).reset_index()
df['customer_id'] = df['index']
df['customer_id'].iloc[:800000]

In [None]:
df['customer_id'].iloc[:800000].to_csv('project_3.csv', index = False)

In [None]:
84 * 0.7

In [None]:
df = _train_data[_train_data.score >= 0].reset_index()
df['customer_id'] = df['index']
df['customer_id']

In [None]:
df['customer_id'].to_csv('project_3.csv', index = False)

In [None]:
data

## Tree KL

In [None]:
_sample_treated_size = 100_000
_sample_control_size = 100_000
fit_sample_index = data.loc[fit_index, [col_treatment]].query(f'{col_treatment} == 1').sample(_sample_treated_size).index
fit_sample_index = fit_sample_index.union(
    data.loc[fit_index, [col_treatment]].query(f'{col_treatment} == 0').sample(_sample_control_size).index
)

In [None]:
_sample_treated_size = 100_00
_sample_control_size = 100_00
val_sample_index = data.loc[val_index, [col_treatment]].query(f'{col_treatment} == 1').sample(_sample_treated_size).index
val_sample_index = val_sample_index.union(
    data.loc[val_index, [col_treatment]].query(f'{col_treatment} == 0').sample(_sample_control_size).index
)

In [None]:
val_index

In [None]:
import causalml.inference.tree as ctree

In [None]:
model_n9 = ctree.UpliftRandomForestClassifier(
    control_name='control',
    evaluationFunction='KL',
    n_estimators=20,
    max_depth=3,
    min_samples_leaf=500
)

In [None]:
model_n9.fit(
    X=data.loc[fit_sample_index, cols_features].values,
    treatment=data.loc[fit_sample_index, col_treatment].apply(lambda t: 'treatment' if t == 1 else 'control').values,
    y=data.loc[fit_sample_index, col_target].values
)

In [None]:
_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_n9.predict(_eval_data[cols_features].values)
_eval_data = _eval_data[['score', col_treatment, col_target]]

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(_eval_data['score'], bins=100)
plt.show()

In [None]:
# Это если таргет 30 дней 

cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
# это уже с таргетом за 7 дней

cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.auuc_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
model_n10 = ctree.UpliftRandomForestClassifier(
    control_name='control',
    evaluationFunction='KL',
    n_estimators=30,
    max_depth=6,
    min_samples_leaf=500
)

In [None]:
model_n10.fit(
    X=data.loc[fit_sample_index, cols_features].values,
    treatment=data.loc[fit_sample_index, col_treatment].apply(lambda t: 'treatment' if t == 1 else 'control').values,
    y=data.loc[fit_sample_index, col_target].values
)

In [None]:
_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_n10.predict(_eval_data[cols_features].values)
_eval_data = _eval_data[['score', col_treatment, col_target]]

In [None]:
cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
model_n10 = ctree.UpliftRandomForestClassifier(
    control_name='control',
    evaluationFunction='DeltaDeltaP',
    n_estimators=20,
    max_depth=3,
    min_samples_leaf=500
)

In [None]:
model_n10.fit(
    X=data.loc[fit_sample_index, cols_features].values,
    treatment=data.loc[fit_sample_index, col_treatment].apply(lambda t: 'treatment' if t == 1 else 'control').values,
    y=data.loc[fit_sample_index, col_target].values
)

In [None]:
_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_n10.predict(_eval_data[cols_features].values)
_eval_data = _eval_data[['score', col_treatment, col_target]]

In [None]:
# Это если таргет 30 дней 


cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_all_data = data.loc[:, cols_features + [col_treatment, col_target]]
_all_data['score'] = model_n10.predict(_all_data[cols_features].values)
_all_data = _all_data[['score', col_treatment, col_target]]

In [None]:
cmetrics.plot_gain(
    _all_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
s = _all_data.sort_values(by = ['score'], ascending=False).reset_index().rename(columns={'index': 'customer_id'}).iloc[:400000]
s['customer_id'].to_csv('data.csv', index=False)

In [None]:
# таргет за 7 дней

cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
cmetrics.qini_score(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

# LESSON 3 - НЕ модельные подходы, но кураторы сказали, что должна быть именно модель (нехитрая)

In [None]:
model_n1 = rlearner.BaseRRegressor(
    learner=lgb.LGBMRegressor(),
    control_name=0,
    n_fold=4,
    random_state=165
)

In [None]:
data.loc[fit_index, col_treatment].value_counts(normalize=True)#.loc[1]

In [None]:
# _p_score_const = data.loc[:, col_treatment].value_counts(normalize=True).loc[1]
# _p_scores = pd.Series(data=_p_score_const, index=fit_index)
# model_n1.fit(
#     X=data.loc[fit_index, cols_features].values,
#     treatment=data.loc[fit_index, col_treatment].values,
#     y=data.loc[fit_index, col_target].values,
#     p=_p_scores
# )

In [None]:
_eval_data = data.loc[:, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_n1.predict(_eval_data[cols_features])
_eval_data = _eval_data[['score', col_treatment, col_target]]
_eval_data

In [None]:
_eval_data.score.value_counts()

In [None]:
_eval_data['customer_id'] = data['customer_id']

In [None]:
ev_sort = _eval_data.sort_values(by=['score'], ascending = False)
ev_sort

In [None]:
ev_sort[ev_sort.score > 0].customer_id.to_csv('project.csv', index = False)

In [None]:
ev_sort[ev_sort.score > 0].score.sum()

In [None]:
_p_score_const = data.loc[fit_index, col_treatment].value_counts(normalize=True).loc[1]
_p_scores = pd.Series(data=_p_score_const, index=fit_index)
model_n1.fit(
    X=data.loc[fit_index, cols_features].values,
    treatment=data.loc[fit_index, col_treatment].values,
    y=data.loc[fit_index, col_target].values,
    p=_p_scores
)

In [None]:
_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]
_eval_data['score'] = model_n1.predict(_eval_data[cols_features])
_eval_data = _eval_data[['score', col_treatment, col_target]]
_eval_data

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(_eval_data['score'], bins=20)
plt.show()

In [None]:
print(np.percentile(_eval_data['score'], 5), np.percentile(_eval_data['score'], 95))

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(_eval_data['score'], range=(-2.0, 0.4), bins=16)
plt.show()

In [None]:
_eval_data['score'].describe()

In [None]:
cmetrics.plot_gain(
    _eval_data,
    treatment_col=col_treatment,
    outcome_col=col_target,
)

### анализ составляющих

In [None]:
_target_columns = [
    'target_profit',
    'rec__purchase_sum__sum__7d',
    'rec__purchase_amt__sum__7d',
    'rec__discount__sum__7d'
]

In [None]:
_stats = data.loc[fit_index].groupby(by=['target_group_flag'])[_target_columns].mean()
_stats

In [None]:
print(
    'Up[Y] =',
    _stats.loc[1, 'target_profit'] - _stats.loc[0, 'target_profit']
)

In [None]:
print(
    'Up[Z] =',
    MARGIN_RATE * (_stats.loc[1, 'rec__purchase_sum__sum__7d'] - _stats.loc[0, 'rec__purchase_sum__sum__7d'])
)

In [None]:
print(
    'P(R = 1|T = 1)b =',
    1  * (
        _stats.loc[1, 'rec__discount__sum__7d'] - _stats.loc[0, 'rec__discount__sum__7d']
    )
)

### build redemption probability model

In [None]:
treated_fit_index = data.loc[fit_index, ['target_group_flag']].query('target_group_flag == 1').index

In [None]:
red_p_model = lgb.LGBMRegressor()

In [None]:
red_p_model.fit(
    X=data.loc[treated_fit_index, cols_features],
    y=(data.loc[treated_fit_index, 'rec__discount__sum__7d'] > 0).astype(int)
)

In [None]:
data.loc[treated_fit_index, 'rec__discount__sum__7d'] > 0

In [None]:
treated_val_index = data.loc[val_index, ['target_group_flag']].query('target_group_flag == 1').index

In [None]:
red_p_predictions = red_p_model.predict(data.loc[treated_val_index, cols_features])

In [None]:
pd.Series(red_p_predictions).describe()

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(red_p_predictions, range=(0.0, 1.0), bins=20)
plt.show()

In [None]:
roc_auc_score(
    (data.loc[treated_val_index, 'rec__discount__sum__7d'] > 0).astype(int),
    red_p_predictions
)

### redemption probabilty (scores) VS profit uplift scores

In [None]:
# cols_features = cols_features_n1
# col_target = 'target_profit'
# col_treatment = 'treatment_flg'

In [None]:
len(cols_features)

In [None]:
_eval_data = data.loc[val_index, cols_features + [col_treatment, col_target]]                 # , 'rec__discount__sum__7d'
_eval_data['redemption_probabilty_score'] = -red_p_model.predict(_eval_data[cols_features])

_eval_data['profit_uplift_score'] = model_n1.predict(_eval_data[cols_features])
_eval_data['rec__discount__sum__7d_flg'] = (_eval_data['rec__discount__sum__7d'] > 0).astype(int)
_eval_data = _eval_data.drop(columns=cols_features)

In [None]:
_eval_data = data.loc[:, cols_features + [col_treatment, col_target]]                 # , 'rec__discount__sum__7d'
_eval_data['redemption_probabilty_score'] = -red_p_model.predict(_eval_data[cols_features])

_eval_data['profit_uplift_score'] = model_n1.predict(_eval_data[cols_features])
_eval_data['rec__discount__sum__7d_flg'] = (_eval_data['rec__discount__sum__7d'] > 0).astype(int)
_eval_data = _eval_data.drop(columns=cols_features)

In [None]:
_eval_data['profit_uplift_score_bin'] = pd.qcut(_eval_data['profit_uplift_score'], q=8, duplicates='drop')
_eval_data

In [None]:
# вероятность списать баллы в разрезе прогнозов uplift-модели для прибыли.

In [None]:
_pivot = _eval_data.loc[treated_val_index].groupby(by=['profit_uplift_score_bin']).agg(
    {'rec__discount__sum__7d_flg': 'mean', 'profit_uplift_score_bin': 'count'}
)
_pivot

In [None]:
# plt.plot(_pivot['rec__discount__sum__7d_flg'].reset_index())

In [None]:
_pivot['rec__discount__sum__7d_flg']

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(list(_pivot['rec__discount__sum__7d_flg']))
plt.xticks(range(len(_pivot.index)), _pivot.index, rotation=20)
plt.xlabel('profit uplift score')
plt.ylabel('empirical P(R = 1|T = 1)')
plt.show()

In [None]:
cmetrics.plot_gain(
    _eval_data[[col_treatment, col_target, 'profit_uplift_score', 'redemption_probabilty_score']],
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
0.15 * 10000000

In [None]:
cmetrics.plot_gain(
    _eval_data[[col_treatment, col_target, 'profit_uplift_score', 'redemption_probabilty_score']],
    treatment_col=col_treatment,
    outcome_col=col_target,
)

In [None]:
_eval_data[[col_treatment, col_target, 'profit_uplift_score', 'redemption_probabilty_score']]

In [None]:
_eval_data['redemption_probabilty_score']

In [None]:
_eval_data['redemption_probabilty_score_2'] = -1 * _eval_data['redemption_probabilty_score']

In [None]:
_eval_data['redemption_probabilty_score_2'].sort_values(ascending = False)

In [None]:
_eval_data['customer_id'] = _eval_data.index
_eval_data

In [None]:
_eval_data.sort_values(by=['redemption_probabilty_score_2'], ascending = False).customer_id.to_csv('project_2.csv', index=False)

In [None]:
sss = _eval_data.sort_values(by=['redemption_probabilty_score_2'], ascending = False)
sss[sss.redemption_probabilty_score_2>=0.95] #.target_profit.sum() 

In [None]:
sss[sss.redemption_probabilty_score_2>=0.95].customer_id.to_csv('project_2.csv', index=False)

In [None]:
_eval_data.sort_values(by=['redemption_probabilty_score_2'], ascending = False).tail(20)