In [1]:
import pandas as pd
import numpy as np
import time

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer


# Fixed seed for any randomised step.
# NOTE(review): none of the cells visible here reads rand_state -- confirm it is still needed.
rand_state = 1337

#### prep_df function, modified to discard categorical features and impute missing values before the delta calculations.

In [3]:
##
# Pipelines used by prep_df.

# Categorical features: fill gaps with the most frequent value.
# One-hot encoding (OneHotEncoder(handle_unknown="ignore", sparse=False)) is
# intentionally left out for now: the dummy columns don't line up between
# test and train. Retry when a full train model is built; values can be imputed
# on test_data.csv if necessary.
categorical_pipeline = Pipeline([("impute", SimpleImputer(strategy="most_frequent"))])

# Numerical features: fill gaps with the column mean.
# No scaler (StandardScaler / MinMaxScaler) is attached: don't scale prior to
# feature engineering.
numeric_pipeline = Pipeline([("impute", SimpleImputer(strategy="mean"))])

# def_prep_df: Preparing the TRAINING data for creating and testing the model.
def prep_df(df, target, target_to_drop, missing_threshold=0.4):
    """Impute the numeric predictors of the training frame and split off the target.

    Steps, in order:
      1. Set aside ``statement_age`` and ``oldest_statement`` so they are
         neither imputed nor subject to the missing-value filter.
      2. Drop ``s_2``, those two columns and ``target_to_drop``.
      3. Drop every remaining predictor whose share of missing values is
         ``>= missing_threshold``.
      4. Discard the hard-coded categorical columns and run the remaining
         columns through the module-level ``numeric_pipeline`` (which is
         re-fitted on every call).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``s_2``, ``statement_age``, ``oldest_statement``,
        ``target`` and ``target_to_drop``. The caller's frame is not modified.
    target : str
        Name of the label column returned as ``y_processed``.
    target_to_drop : str
        Name of an alternative label column that must not leak into ``X``.
    missing_threshold : float, default 0.4
        Columns with a NaN proportion at or above this value are removed.

    Returns
    -------
    X_processed : pandas.DataFrame
        Imputed features followed by ``statement_age`` and ``oldest_statement``.
        The imputed feature columns carry positional integer labels (they are
        rebuilt from the transformer output); use ``cols_list`` for their names.
    y_processed : pandas.DataFrame
        Single-column frame holding ``target``, indexed like ``df``.
    cols_list : list of str
        Lower-cased names of the imputed feature columns, in column order.
    """
    # save indices
    df_index = df.index
    # save statement_age & oldest_statement columns
    statement_age_s = df['statement_age']
    oldest_statement_s = df['oldest_statement']

    # Drop columns that shouldn't be scaled or imputed
    df = df.drop(columns=["s_2", 'statement_age', 'oldest_statement', target_to_drop])

    # Missing values handling: remove sparse predictors. The target is exempt so
    # that a sparse label cannot disappear here and fail obscurely further down.
    missing_props = df.isna().mean(axis=0)
    over_threshold = missing_props[missing_props >= missing_threshold]
    sparse_cols = [col for col in over_threshold.index if col != target]
    df = df.drop(columns=sparse_cols)

    # Split into predictors and target
    X = df.drop(columns=[target])
    y = df[target]

    # Normalise the names *before* selecting by name, otherwise a mixed-case
    # column would be looked up under a label that does not exist yet.
    X.columns = X.columns.str.lower()

    # Split categorical and numerical columns; categorical ones are discarded.
    cat_cols_all = {'b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126',
                    'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87'}
    num_cols = [col for col in X.columns if col not in cat_cols_all]

    X = X[num_cols]
    cols_list = X.columns.tolist()

    full_processor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_pipeline, num_cols)
        ]
    )

    # Apply preprocessing (impute)
    X_processed = full_processor.fit_transform(X)
    # Re-attach the original index and the two bookkeeping columns.
    X_processed = pd.concat([pd.DataFrame(X_processed, index=df_index), statement_age_s, oldest_statement_s], axis=1)
    print(X_processed.shape)

    y_processed = pd.DataFrame(y, index=df_index)
    print(y_processed.shape)

    return X_processed, y_processed, cols_list

### Create initial df to be further processed

In [4]:
# Feature rows: lower-case the column names, order each customer's rows by s_2
# (treated as the statement date) and index by customer_id. The later
# diff()-based deltas depend on this ordering.
df_train_x = pd.read_parquet('./../ignore/train.parquet')
df_train_x.columns = df_train_x.columns.str.lower()
df_train_x = df_train_x.sort_values(['customer_id', 's_2'])
df_train_x = df_train_x.set_index('customer_id')

# Labels, indexed by customer_id as well.
df_train_y = pd.read_csv('./../ignore/train_labels.csv')
df_train_y.columns = df_train_y.columns.str.lower()
df_train_y = df_train_y.set_index('customer_id')

# Attach the label to every row of the matching customer.
df_train = pd.merge(df_train_x, df_train_y, left_index=True, right_on='customer_id', how='left')

# statement_age: 1 for a customer's most recent s_2, increasing for older ones.
df_train['statement_age'] = (df_train.groupby(df_train.index)['s_2']
                      .rank(method='dense', ascending=False)
                      .astype(int))

# oldest_statement: the largest statement_age of the customer, repeated on each row.
oldest_statement = df_train.groupby(df_train.index)['statement_age'].max().rename('oldest_statement')
df_train = df_train.join(oldest_statement, how='left')

# Target kept only on the most recent statement (0 elsewhere). Vectorised
# comparison instead of a per-row Python lambda; the values are identical.
is_last_statement = (df_train['statement_age'] == 1).astype(int)
df_train['last_statement_target'] = df_train['target'] * is_last_statement

In [5]:
# Row counts per value of oldest_statement. These are rows, not customers:
# oldest_statement is repeated on every row of a customer.
df_train['oldest_statement'].value_counts()

13    5018442
12     127476
10      67210
11      65571
9       57699
8       48880
7       36386
6       33090
5       23355
4       18692
3       17334
2       12196
1        5120
Name: oldest_statement, dtype: int64

### Impute values before calculating deltas

In [6]:
# Prep the dataframe
# Note that the last two columns, 'statement_age' and 'oldest_statement', are left in
# X_processed for scoring/bookkeeping, not for predicting!

# Impute numerical and drop categorical values; 'last_statement_target' is removed so it cannot leak into X
X_processed, y_processed, cols_list = prep_df(df_train, target='target', target_to_drop='last_statement_target')

(5531451, 159)
(5531451, 1)


### Create deltas

In [7]:
# Delta between each customer's most recent statement (statement_age == 1) and
# their oldest one (statement_age == oldest_statement).
# Customers with only one statement are removed: there is nothing to diff against.
has_history = X_processed['oldest_statement'] >= 2
is_latest = X_processed['statement_age'] == 1
is_oldest = X_processed['statement_age'] == X_processed['oldest_statement']
first_and_last = X_processed[has_history & (is_latest | is_oldest)]

# diff(1) subtracts the previous row from the current one. This relies on the
# rows still being ordered by customer and then s_2 ascending (the sort applied
# when df_train was built): the oldest row of a customer then sits directly
# above its latest row, so a *negative* statement_age difference marks exactly
# the "latest minus oldest" rows. Rows with a positive difference straddle two
# customers and are discarded.
delta1 = first_and_last.diff(periods=1)
# .assign returns a new frame, avoiding a write into a filtered slice.
delta1 = delta1[delta1['statement_age'] < 0].assign(statement_delta=0)

# Disabled experiments (previously kept here as commented-out code): the same
# diff-and-filter recipe applied to consecutive statement pairs -- ages (1, 2),
# (2, 3) and (3, 4), labelled statement_delta = 1, 2, 3 -- concatenated with
# delta1 and sorted by ['customer_id', 'statement_delta'].

delta_df = delta1.drop(columns=['oldest_statement', 'statement_age'])

# Build the suffixed names in a NEW list. The previous in-place loop aliased
# cols_list, so re-running the cell appended '_dfl' a second time.
cols_list_mod = [col + '_dfl' for col in cols_list]  # delta first last
cols_list_mod = cols_list_mod + ['statement_delta']
delta_df.columns = cols_list_mod

delta_df.head()

Unnamed: 0_level_0,p_2_dfl,d_39_dfl,b_1_dfl,b_2_dfl,r_1_dfl,s_3_dfl,d_41_dfl,b_3_dfl,d_43_dfl,d_44_dfl,...,d_136_dfl,d_137_dfl,d_138_dfl,d_139_dfl,d_140_dfl,d_141_dfl,d_143_dfl,d_144_dfl,d_145_dfl,statement_delta
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,-0.003724,0.0,0.000658,0.000809,-0.003124,0.010986,0.0,0.002465,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00236,0.0,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,-0.048603,-7.0,0.008903,0.00138,0.001396,0.07571,0.0,-0.000762,-0.094038,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003142,0.0,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.004259,0.0,0.002812,0.001853,0.000495,0.0,0.0,0.00237,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.001904,0.0,0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.054334,-9.0,-0.057747,0.19413,0.003601,-0.119655,0.0,-0.00099,0.039471,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003687,0.0,0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,-0.064942,0.0,0.004246,-0.002945,-0.005995,0.059655,0.0,-0.000399,-0.017292,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003077,0.0,0


In [8]:
# Write the first-vs-last delta features (delta_df) to a parquet file.
delta_df.to_parquet('./../ignore/train_dfl.parquet')