In [1]:
import pandas as pd
import numpy as np

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer


# Fixed seed value (presumably intended for reproducible splits/models — TODO confirm);
# none of the cells visible in this notebook reads it.
rand_state = 1337

## train_df

#### `prep_df` discards the categorical features and mean-imputes the numeric ones before the deltas are calculated.

In [3]:
# --- Imputation pipelines ---------------------------------------------------

# Categorical features: fill gaps with the most frequent value.
# A one-hot encoding step (OneHotEncoder(handle_unknown="ignore", sparse=False))
# is intentionally left out: the dummy columns did not line up between the
# train and test sets. Revisit when a full training model is built; values can
# be imputed on test_data.csv if necessary.
categorical_pipeline = Pipeline([("impute", SimpleImputer(strategy="most_frequent"))])

# Numeric features: fill gaps with the column mean.
# No scaler (StandardScaler / MinMaxScaler) is attached on purpose -- scaling
# must not happen before feature engineering.
numeric_pipeline = Pipeline([("impute", SimpleImputer(strategy="mean"))])

# prep_df: prepare the TRAINING data (numeric features only) for the delta calculations.
def prep_df(df, target, target_to_drop, missing_threshold=0.4):
    """Mean-impute the numeric predictors of a statement-level training frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``s_2``, ``statement_age``,
        ``oldest_statement``, ``target`` and ``target_to_drop``.
        The caller's frame is not modified.
    target : str
        Name of the label column returned as ``y_processed``.
    target_to_drop : str
        Name of an additional column that is removed from the predictors.
    missing_threshold : float, default 0.4
        Predictors whose share of missing values is greater than or equal to
        this value are discarded before imputation.

    Returns
    -------
    X_processed : pandas.DataFrame
        The imputed numeric predictors (integer column labels ``0..n-1``, in
        the order given by ``cols_list``) followed by the untouched
        ``statement_age`` and ``oldest_statement`` columns. Same index as ``df``.
    y_processed : pandas.DataFrame
        Single-column frame holding ``df[target]``.
    cols_list : list of str
        Lower-cased names of the imputed predictors, in output order.

    Notes
    -----
    The shapes of ``X_processed`` and ``y_processed`` are printed.
    """
    # Keep the index and the bookkeeping columns: they must not be imputed,
    # but they are re-attached to the output for the delta step.
    df_index = df.index
    statement_age_s = df['statement_age']
    oldest_statement_s = df['oldest_statement']

    # Take the label out first so the missing-value filter can never drop it.
    y = df[target]
    X = df.drop(columns=["s_2", 'statement_age', 'oldest_statement', target_to_drop, target])

    # Lower-case the names BEFORE any name-based selection; selecting with
    # lower-cased names on a frame that still has mixed-case names would fail.
    X.columns = X.columns.str.lower()

    # Discard predictors that are missing too often to be imputed sensibly.
    missing_props = X.isna().mean(axis=0)
    over_threshold = missing_props[missing_props >= missing_threshold]
    X = X.drop(columns=over_threshold.index)

    # Categorical features are excluded entirely (no dummies are created here).
    cat_cols_all = ['b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126', 'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87']
    num_cols = [col for col in X.columns if col not in cat_cols_all]
    X = X[num_cols]
    cols_list = list(num_cols)

    full_processor = ColumnTransformer(
        transformers=[("numeric", numeric_pipeline, num_cols)]
    )

    # Apply preprocessing (impute); the transformer returns a plain array, so
    # the feature names travel separately in cols_list.
    X_imputed = full_processor.fit_transform(X)
    X_processed = pd.concat(
        [pd.DataFrame(X_imputed, index=df_index), statement_age_s, oldest_statement_s],
        axis=1,
    )
    print(X_processed.shape)

    y_processed = pd.DataFrame(y, index=df_index)
    print(y_processed.shape)

    return X_processed, y_processed, cols_list





def prep_df_test(df, target=None, missing_threshold=0.4):
    """Mean-impute the numeric predictors of a statement-level frame without labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``s_2``, ``statement_age`` and
        ``oldest_statement``. The caller's frame is not modified.
    target : str or None, default None
        Optional label column. When given, it is removed from the predictors
        and returned as ``y_processed``; when ``None`` every remaining column
        is treated as a predictor.
    missing_threshold : float, default 0.4
        Predictors whose share of missing values is greater than or equal to
        this value are discarded before imputation.

    Returns
    -------
    X_processed : pandas.DataFrame
        The imputed numeric predictors (integer column labels ``0..n-1``, in
        the order given by ``cols_list``) followed by the untouched
        ``statement_age`` and ``oldest_statement`` columns. Same index as ``df``.
    y_processed : pandas.DataFrame or None
        Single-column frame holding ``df[target]``, or ``None`` when no
        ``target`` was requested.
    cols_list : list of str
        Lower-cased names of the imputed predictors, in output order.

    Notes
    -----
    The missing-value filter and the imputation means are computed from the
    frame passed in, so the retained columns can differ from the ones
    ``prep_df`` keeps on the training data; compare ``cols_list`` if the two
    outputs must line up. The output shapes are printed.
    """
    # Keep the index and the bookkeeping columns: they must not be imputed,
    # but they are re-attached to the output for the delta step.
    df_index = df.index
    statement_age_s = df['statement_age']
    oldest_statement_s = df['oldest_statement']

    # Drop columns that shouldn't be scaled or imputed
    X = df.drop(columns=["s_2", 'statement_age', 'oldest_statement'])

    # The original body read an undefined name `target`; the label is now
    # optional because test data normally carries none.
    y = None
    if target is not None:
        y = X[target]
        X = X.drop(columns=[target])

    # Lower-case the names BEFORE any name-based selection.
    X.columns = X.columns.str.lower()

    # Discard predictors that are missing too often to be imputed sensibly.
    missing_props = X.isna().mean(axis=0)
    over_threshold = missing_props[missing_props >= missing_threshold]
    X = X.drop(columns=over_threshold.index)

    # Categorical features are excluded entirely (no dummies are created here).
    cat_cols_all = ['b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126', 'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87']
    num_cols = [col for col in X.columns if col not in cat_cols_all]
    X = X[num_cols]
    cols_list = list(num_cols)

    full_processor = ColumnTransformer(
        transformers=[("numeric", numeric_pipeline, num_cols)]
    )

    # Apply preprocessing (impute); the transformer returns a plain array, so
    # the feature names travel separately in cols_list.
    X_imputed = full_processor.fit_transform(X)
    X_processed = pd.concat(
        [pd.DataFrame(X_imputed, index=df_index), statement_age_s, oldest_statement_s],
        axis=1,
    )
    print(X_processed.shape)

    y_processed = None
    if y is not None:
        y_processed = pd.DataFrame(y, index=df_index)
        print(y_processed.shape)

    return X_processed, y_processed, cols_list

#### Create initial train_df to be further processed

In [2]:
# Statement-level features, indexed by customer and sorted by customer and s_2
# (ascending), so a customer's rows are contiguous and ordered oldest -> newest.
df_train_x = pd.read_parquet('./../ignore/train.parquet')
df_train_x.columns = df_train_x.columns.str.lower()
df_train_x = df_train_x.sort_values(['customer_id', 's_2'])
df_train_x = df_train_x.set_index('customer_id')

# Labels, indexed by customer.
df_train_y = pd.read_csv('./../ignore/train_labels.csv')
df_train_y.columns = df_train_y.columns.str.lower()
df_train_y = df_train_y.set_index('customer_id')

# Both frames are indexed by customer_id, so join index-on-index. (The previous
# call mixed left_index=True with right_on naming an index level, which makes
# the origin of the result index hard to reason about.)
df_train = pd.merge(df_train_x, df_train_y, left_index=True, right_index=True, how='left')

# statement_age: 1 = most recent statement of the customer, 2 = the one before, ...
df_train['statement_age'] = (df_train.groupby(df_train.index)['s_2']
                      .rank(method='dense', ascending=False)
                      .astype(int))

# oldest_statement: the largest statement_age of the customer, repeated on every row.
oldest_statement = df_train.groupby(df_train.index)['statement_age'].max().rename('oldest_statement')
df_train = df_train.join(oldest_statement, how='left')

# The target on the most recent statement row, 0 on every older row
# (vectorised replacement for a row-wise apply).
df_train['last_statement_target'] = df_train['target'] * (df_train['statement_age'] == 1).astype(int)

In [None]:
# Frequency of each 'oldest_statement' value. The column is repeated on every
# statement row, so this counts rows, not distinct customers.
df_train['oldest_statement'].value_counts()

#### Impute train_df values before calculating deltas

In [6]:
# Run the training preprocessing: categorical features are dropped and the
# numeric ones are mean-imputed.
# 'statement_age' (and 'oldest_statement') stay in X_processed as trailing
# columns for scoring / delta bookkeeping only -- they are not predictors!
X_processed, y_processed, cols_list = prep_df(
    df_train,
    target='target',
    target_to_drop='last_statement_target',
)

(5531451, 159)
(5531451, 1)


#### Create deltas (train)

In [None]:
# Delta between the most recent statement (statement_age == 1) and the oldest
# one (statement_age == oldest_statement).
# Customers with only one statement are removed (oldest_statement >= 2).
has_history = X_processed['oldest_statement'] >= 2
is_latest = X_processed['statement_age'] == 1
is_oldest = X_processed['statement_age'] == X_processed['oldest_statement']

delta1 = X_processed[has_history & (is_latest | is_oldest)]

# The rows were sorted by customer and s_2 when the data was loaded, so each
# customer contributes its oldest row followed by its latest row. A row-wise
# diff therefore yields "latest - oldest" on the latest row, where the
# statement_age difference is negative. The oldest row of the next customer
# gets a positive difference and is filtered out.
delta1 = delta1.diff(periods=1)
delta1 = delta1[delta1['statement_age'] < 0].copy()  # copy: a column is added next
delta1['statement_delta'] = 0

# NOTE: deltas between consecutive statements (1 vs 2, 2 vs 3, 3 vs 4, labelled
# statement_delta = 1, 2, 3) were prototyped here and are currently disabled;
# only the first/last delta (statement_delta = 0) is produced.

delta_df = delta1.drop(columns=['oldest_statement', 'statement_age'])

# Build the output names without touching cols_list, so re-running the cell
# does not append the suffix a second time. dfl = delta first/last.
cols_list_mod = [col + '_dfl' for col in cols_list] + ['statement_delta']
delta_df.columns = cols_list_mod

delta_df.head()

In [None]:
# Persist the train first/last deltas (delta_df as built by the cell above).
delta_df.to_parquet('./../ignore/train_dfl.parquet')

## test_df

In [2]:
# Statement-level test features, indexed by customer and sorted by customer and
# s_2 (ascending), so a customer's rows are contiguous and ordered oldest -> newest.
df_test = pd.read_parquet('./../ignore/test.parquet')
df_test.columns = df_test.columns.str.lower()
df_test = df_test.sort_values(['customer_id', 's_2'])
df_test = df_test.set_index('customer_id')

# statement_age: 1 = most recent statement of the customer, 2 = the one before, ...
df_test['statement_age'] = (df_test.groupby(df_test.index)['s_2']
                      .rank(method='dense', ascending=False)
                      .astype(int))

# oldest_statement: the largest statement_age of the customer, repeated on every row.
oldest_statement = df_test.groupby(df_test.index)['statement_age'].max().rename('oldest_statement')
df_test = df_test.join(oldest_statement, how='left')

# save indices
df_index = df_test.index
# save statement_age & oldest_statement columns (re-attached after imputation)
statement_age_s = df_test['statement_age']
oldest_statement_s = df_test['oldest_statement']

# Drop columns that shouldn't be scaled or imputed.
# Plain reassignment instead of inplace=True: the frame is re-derived by column
# selection below, and an in-place drop on such a selection raises
# SettingWithCopyWarning.
df_test = df_test.drop(columns=["s_2", 'statement_age', 'oldest_statement'])

# Filter out categorical columns
cat_cols_to_remove = ['b_30', 'b_38', 'd_114', 'd_116', 'd_117', 'd_120', 'd_126', 'd_63', 'd_64', 'd_66', 'd_68', 'b_31', 'd_87']
cols_list = [col for col in df_test.columns if col not in cat_cols_to_remove]
df_test = df_test[cols_list]

# Missing values handling: discard columns with at least this share of NaNs.
# NOTE(review): the share is measured on the test data itself, so the retained
# columns are not guaranteed to match the training features -- confirm.
missing_threshold = 0.4
missing_props = df_test.isna().mean(axis=0)
over_threshold = missing_props[missing_props >= missing_threshold]
df_test = df_test.drop(columns=over_threshold.index)

cols_list = list(df_test.columns)

In [4]:
# Impute 20 columns at a time, as not enough memory can be allocated to impute
# every column at once.
sublist_size = 20
sublists_cols = [cols_list[x:x+sublist_size] for x in range(0, len(cols_list), sublist_size)]

def impute_numerical(df):
    """Return a new frame in which every NaN of ``df`` is replaced by its column mean.

    The means are computed from ``df`` itself. Column names and index of the
    input are carried over to the result; ``df`` is left untouched.
    """
    imputer = SimpleImputer(strategy="mean")
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

# Impute batch by batch, then stitch the batches together with ONE concat
# rather than re-copying the growing frame on every iteration.
X = pd.concat([impute_numerical(df_test[sublist]) for sublist in sublists_cols], axis=1)

# Re-attach the bookkeeping columns that were set aside before imputation.
X_processed = pd.concat([X, statement_age_s, oldest_statement_s], axis=1)

In [7]:
# Split the customers by history length: a first/last delta only exists for
# customers with more than one statement.
is_single_stmt = (X_processed['oldest_statement'] == 1).to_numpy()
customers = list(X_processed.index[~is_single_stmt].unique())
customers_1stmt = list(X_processed.index[is_single_stmt].unique())

# Batches of customers for the chunked delta computation.
sublist_size = 300000
sublists_cust = [
    customers[start:start + sublist_size]
    for start in range(0, len(customers), sublist_size)
]
print('Number of customers with 2 or more statements:', len(customers))
print('Number of customers with 1 statement:', len(customers_1stmt))

Number of customers with 2 or more statements: 918794
Number of customers with 1 statement: 5827


In [8]:
def calc_lag(df,stmnt2,stmnt1): # delta = stmnt2 - stmnt1
    """Per-customer difference between two statements of the same customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame indexed by customer, containing ``statement_age`` and
        ``oldest_statement`` columns. Rows of one customer must be contiguous
        and ordered from oldest to newest statement (descending
        ``statement_age``); the function relies on that order.
    stmnt2 : int
        ``statement_age`` of the first statement of the pair (1 = most recent).
    stmnt1 : int or 'oldest'
        ``statement_age`` of the second statement of the pair, or the string
        ``'oldest'`` to use each customer's oldest statement.

    Returns
    -------
    pandas.DataFrame
        One row per customer that has both statements: the newer row minus the
        older row for every column of ``df`` (including ``statement_age`` and
        ``oldest_statement``), plus a ``statement_delta`` label column that is
        ``0`` for the ``'oldest'`` mode and ``stmnt2`` otherwise. Customers
        with only one of the two statements produce no row.
    """
    if stmnt1 == 'oldest':
        is_reference = df['statement_age'] == df['oldest_statement']
        delta_label = 0
    else:
        is_reference = df['statement_age'] == stmnt1
        delta_label = stmnt2

    pair = df[(df['statement_age'] == stmnt2) | is_reference]
    delta = pair.diff(periods=1)

    # A difference is only meaningful when the previous row belongs to the same
    # customer. Without this guard, a customer with a single selected row would
    # be diffed against the preceding customer's row.
    ids = pair.index.to_numpy()
    same_customer = np.zeros(len(pair), dtype=bool)
    same_customer[1:] = ids[1:] == ids[:-1]

    # Newer-minus-older rows have a strictly negative statement_age difference.
    keep = same_customer & (delta['statement_age'] < 0).to_numpy()
    delta = delta.loc[keep].copy()  # copy: a column is added next
    delta['statement_delta'] = delta_label
    return delta


stmnt1 = 'oldest'
stmnt2 = 1 # 1=most recent

# Compute the first/last delta one customer batch at a time, then stitch the
# batches together with ONE concat rather than re-copying the growing frame on
# every iteration.
delta_df = pd.concat(
    [calc_lag(X_processed[X_processed.index.isin(sublist)], stmnt2, stmnt1)
     for sublist in sublists_cust],
    axis=0,
)

# Drop columns
delta_df = delta_df.drop(columns=['statement_age', 'oldest_statement'])

# Add rows with all nulls for the customers with only 1 statement.
# The placeholder is created as float NaN (not an object-dtype empty frame), so
# the concat keeps numeric dtypes and does not hit pandas' all-NA dtype warning.
# NOTE(review): the placeholder index is unnamed, so the concatenated index
# likely loses its 'customer_id' name -- confirm downstream readers do not rely on it.
if customers_1stmt:
    placeholder = pd.DataFrame(np.nan, index=customers_1stmt, columns=delta_df.columns)
    delta_df = pd.concat([delta_df, placeholder], axis=0)

# Impute values: the placeholder rows receive the column means of the computed deltas.
imputer=SimpleImputer(strategy="mean")
out = imputer.fit_transform(delta_df)

cols_list_mod = [val+'_dfl' for val in cols_list] + ['statement_delta']
delta_df = pd.DataFrame(out, index=delta_df.index, columns=cols_list_mod)

In [9]:
# Persist the test first/last deltas (delta_df as built by the cell above).
delta_df.to_parquet('./../ignore/test_dfl.parquet')