In [36]:
import pandas as pd
import numpy as np

In [None]:
# Defining the outcome weighting. 
def conditions(x):
    """Return the weight for a statement's ordinal position.

    Position 1 is the oldest of the retained statements and 3 the newest;
    the newest statement carries the most weight.

    Parameters
    ----------
    x : int or float
        Ordinal statement position. Integral floats such as ``3.0`` (what
        ``Series.rank`` produces) are accepted.

    Returns
    -------
    float
        0.1 for position 1, 0.15 for position 2, 0.75 for position 3.

    Raises
    ------
    ValueError
        If ``x`` is not 1, 2 or 3 (including NaN). Failing loudly avoids
        leaking a non-numeric sentinel into a numeric weights column.
    """
    # (position, weight) pairs, oldest -> newest.
    position_weights = ((1, 0.1), (2, 0.15), (3, 0.75))
    for position, weight in position_weights:
        if x == position:
            return weight
    raise ValueError(f"no weight defined for statement position {x!r}; expected 1, 2 or 3")


In [63]:
# Prepare data
def data_weighting(path, nrows=None, weights=(0.1, 0.15, 0.75)):
    """Collapse per-statement probabilities into one weighted score per customer.

    For every customer the most recent ``len(weights)`` statements (ordered by
    ``s_2``) are kept and combined as a weighted average of ``proba``.

    Parameters
    ----------
    path : str, path object or file-like
        Location of the results CSV; passed straight to ``pd.read_csv``. The
        file must contain the columns ``customer_id``, ``proba`` and ``s_2``.
    nrows : int, optional
        Number of rows to read from the file (``None`` reads everything).
    weights : sequence of float, optional
        Statement weights ordered oldest -> newest. Its length is the number
        of statements kept per customer. The default matches the values in
        the ``conditions`` helper (0.1, 0.15, 0.75).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_id`` with a single ``weighted_proba`` column.

    Raises
    ------
    ValueError
        If ``weights`` is empty or not one-dimensional.

    Notes
    -----
    * Weights are assigned from the newest statement backwards and the sum is
      divided by the total weight actually used. A customer with a full
      window gets the plain weighted sum when the weights add up to 1; a
      customer with fewer statements is no longer scaled down (a single
      statement yields its own ``proba``).
    * Rows are ordered by sorting ``s_2`` as read by ``pd.read_csv``; this
      assumes the values sort chronologically (e.g. ISO dates) - TODO confirm.
    * A missing ``proba`` contributes 0 to the numerator, because the
      group-wise sum skips NaN.
    """
    weights = np.asarray(weights, dtype=float)
    if weights.ndim != 1 or weights.size == 0:
        raise ValueError("weights must be a non-empty 1-D sequence of numbers")
    # Index 0 = weight of the newest statement, 1 = the one before it, ...
    newest_first = weights[::-1]

    # Read the results .csv and keep only the columns we need.
    scores = pd.read_csv(path, nrows=nrows)
    scores = scores[['customer_id', 'proba', 's_2']]
    # Rows without a customer cannot be attributed to any group.
    scores = scores.dropna(subset=['customer_id'])

    # Sort by s_2 (stable, so ties keep file order) and keep the last
    # len(weights) statements of each customer.
    recent = (
        scores.sort_values('s_2', kind='mergesort')
        .groupby('customer_id')
        .tail(weights.size)
    )

    # Distance from the newest statement within each customer:
    # 0 = newest, 1 = second newest, ...
    recency = recent.groupby('customer_id').cumcount(ascending=False).to_numpy()
    row_weights = newest_first[recency]

    # Build a fresh frame rather than assigning columns onto a filtered slice.
    weighted = pd.DataFrame({
        'customer_id': recent['customer_id'].to_numpy(),
        'weighted_proba': recent['proba'].to_numpy() * row_weights,
        'weight': row_weights,
    })

    # One row per customer: weighted sum divided by the weight actually applied.
    totals = weighted.groupby('customer_id').sum()
    result = (totals['weighted_proba'] / totals['weight']).to_frame('weighted_proba')

    return result
    

# Example below:

In [64]:
# Run the weighting on the first 1000 rows of the results file and preview
# the resulting per-customer scores.
mdf = data_weighting(path='./ignore/XBG_target.csv', nrows=1000)
mdf.head()

Unnamed: 0_level_0,weighted_proba
customer_id,Unnamed: 1_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.028074
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.000724
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.020628
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.567482
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.739097
