In [83]:
import pandas as pd
import numpy as np

In [84]:
# Defining the outcome weighting.

def conditions(x):
    # Customer has 3 statements:
    if   x == 3:   return 0.1
    elif x == 6:   return 0.15
    elif x == 9:   return 0.75
    
    # Customer has 2 statements:
    elif x == 2:   return 0.2
    elif x == 4:   return 0.8
    
    # Customer has 1 statement:
    elif x == 1:   return 1.0 
    else:          return "error"

In [85]:
# Prepare data
def data_weighting(path, nrows=None):
    # Read in the results .csv
    mdf = pd.read_csv(path, nrows=nrows)
    
    # Filter the result set down to the columns we need
    mdf = mdf[['customer_id', 'proba', 's_2']]
    
    # Sort by s_2 and grab last 3 statments for each customer. Change the value in tail() if you want to include more or less statements.
    # mdf['mean_proba'] = mdf.groupby('customer_id')['proba'].transform('mean')
    mdf = mdf.sort_values('s_2').groupby('customer_id').tail(3)
    
    # Create ordinal statement counts, with 1 being the oldest and 3 being the newest.
    mdf["statement_num"] = mdf.groupby("customer_id")["s_2"].rank(method="first", ascending=True)
    mdf['statement_count'] = mdf.groupby('customer_id')['statement_num'].transform('max')
    
    # Create a number so we can handnle the case where a customer had only 1 or 2 statements. 
    # Multiplied to give me a unique value for each case. See conditions() above.
    mdf['statement_checksum'] = (mdf['statement_count']) * mdf['statement_num']
    
    # Displaying checksum counts for n statements   
    display(mdf.groupby(['statement_count','statement_num', 'statement_checksum']).size())
    
    # Creating the weights per statement number in numpy
    func = np.vectorize(conditions)
    weights = func(mdf["statement_count"])
    # Bringing numpy array back to the pandas df
    mdf['weights'] = weights
    
    
    # Calculating the weighted sum
    mdf ['prediction'] = mdf['proba'] * mdf['weights']
    mdf = mdf[['customer_id', 'prediction']]
    
    # Grouping those weighted sums by customer_id to give granularity of 1 proba per customer
    mdf = mdf.groupby('customer_id').sum()
    
    return mdf
    

# XGB Target:

In [89]:
mdf = data_weighting(path='./ignore/XGB_target.csv')
# print(mdf['statement_key'].value_counts())
mdf.head()

statement_count  statement_num  statement_checksum
1.0              1.0            1.0                     5827
2.0              1.0            2.0                     8174
                 2.0            4.0                     8174
3.0              1.0            3.0                   910620
                 2.0            6.0                   910620
                 3.0            9.0                   910620
dtype: int64

Unnamed: 0_level_0,prediction
customer_id,Unnamed: 1_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.011509
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.00022
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.004919
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.169137
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.206078


In [90]:
# Change me to set the path name for submission
save_path = './ignore/XGB_target_weighted.csv'
mdf.to_csv(save_path)

# XGB Last Statement Target: 

In [None]:
mdf = data_weighting(path='./ignore/XGB_last_statement_target.csv')
mdf.to_csv('./ignore/XGB_last_statement_target_weighted.csv')

statement_count  statement_num  statement_checksum
1.0              1.0            1.0                     5827
2.0              1.0            2.0                     8174
                 2.0            4.0                     8174
3.0              1.0            3.0                   910620
                 2.0            6.0                   910620
                 3.0            9.0                   910620
dtype: int64

# XGB only last statements:

In [88]:
mdf = pd.read_csv('./ignore/XGB_only_last_statements.csv')
mdf = mdf[mdf.groupby('customer_id').s_2.transform('max') == mdf.s_2]
mdf = mdf[['customer_id', 'proba']]
mdf.columns = ['customer_id', 'prediction']
mdf.head()

Unnamed: 0,customer_id,prediction
8,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.016107
21,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.010916
34,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.414932
47,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.562851
60,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.997745
