# Exploring how to flag the target for the data

In [1]:
# Dependencies:
# Standard library:
import datetime
import glob
import os

# Data Wrangling:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# Modeling Packages:
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import auc, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Fixed seed; passed as random_state to train_test_split further down so the split is repeatable.
rand_state = 1337

In [56]:
# PostgreSQL cloud server connection.
# The full connection URL (user, password, host, database) is read from the
# AMEX_DB_URL environment variable so that no credentials live in the notebook, e.g.
#   AMEX_DB_URL=postgresql://<user>:<password>@<host>/postgres
# The query below reads from TRAIN_DATA_random and train_labels_random.
db_url = os.environ.get("AMEX_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the AMEX_DB_URL environment variable to the PostgreSQL connection URL "
        "(postgresql://<user>:<password>@<host>/postgres) before running this cell."
    )
engine = create_engine(db_url)

# BASE numbers each customer's statements by s_2 in both directions:
#   - the first, un-aliased ROW_NUMBER() comes back as the column `row_number`
#   - last_statement_flag_drop is 1 on the customer's most recent statement
# The outer SELECT joins the labels and derives:
#   - last_statement_flag:   1 only on the most recent statement
#   - last_statement_target: 1 only on the most recent statement of a customer with target = 1
# Because both sides of the join are selected with *, customer_id appears twice in the result.
sql_df = pd.read_sql("""
                 WITH BASE AS (
                    SELECT *
                    ,ROW_NUMBER() OVER      (
                                            PARTITION BY customer_id 
                                            ORDER BY s_2
                                            )
                    ,ROW_NUMBER() OVER      (
                                            PARTITION BY customer_id
                                            ORDER BY s_2 DESC
                                            ) last_statement_flag_drop
                    FROM TRAIN_DATA_random
                    )


                    SELECT *
                    ,CASE WHEN last_statement_flag_drop = 1 then 1 else 0 end as last_statement_flag
                    ,CASE WHEN (target = 1 AND last_statement_flag_drop = 1) then 1 else 0 end as last_statement_target
                    FROM BASE B
                    LEFT JOIN train_labels_random L
                    ON B.customer_id = L.customer_id
                 """, engine)


# Testing positive flag on all statements, or only last statement

In [88]:
# Working copy of the query result without the d_63 and d_64 columns.
# NOTE(review): the original comment here was cut off ("because the dummy variables");
# presumably these two columns are removed to avoid dummy-encoding them - confirm.
df = sql_df.drop(columns=['d_63', 'd_64'])
df.head()

Unnamed: 0,customer_id,s_2,p_2,d_39,b_1,b_2,r_1,s_3,d_41,b_3,...,d_142,d_143,d_144,d_145,row_number,last_statement_flag_drop,customer_id.1,target,last_statement_flag,last_statement_target
0,000548e99fa24cef8377e68e602e4bd70d30500a007999...,2017-03-22,0.871053,0.059789,0.123999,1.000394,0.009311,-0.080754,0.004721,0.009264,...,,0.008065,0.003143,0.009226,1,13,000548e99fa24cef8377e68e602e4bd70d30500a007999...,0,0,0
1,000548e99fa24cef8377e68e602e4bd70d30500a007999...,2017-04-04,0.874837,0.44261,0.142999,1.008372,0.009232,-0.065012,0.008361,0.004687,...,,0.002657,0.0084,0.006783,2,12,000548e99fa24cef8377e68e602e4bd70d30500a007999...,0,0,0
2,000548e99fa24cef8377e68e602e4bd70d30500a007999...,2017-05-15,0.846435,0.008253,0.012431,1.000745,0.001761,-0.072598,0.003944,0.000801,...,,0.009037,0.002422,0.005595,3,11,000548e99fa24cef8377e68e602e4bd70d30500a007999...,0,0,0
3,000548e99fa24cef8377e68e602e4bd70d30500a007999...,2017-06-09,0.819525,0.624685,0.034141,1.000321,0.00191,-0.069551,0.001054,0.009942,...,,0.001113,0.00291,0.002041,4,10,000548e99fa24cef8377e68e602e4bd70d30500a007999...,0,0,0
4,000548e99fa24cef8377e68e602e4bd70d30500a007999...,2017-07-27,0.876368,0.21407,0.020723,0.964219,0.006177,-0.078409,0.008469,0.00025,...,,0.006028,0.000716,0.000856,5,9,000548e99fa24cef8377e68e602e4bd70d30500a007999...,0,0,0


Define the imputation and scaling steps for the categorical and numerical columns.

In [4]:
# Preprocessing for non-numeric columns: fill missing values with the column's most frequent value.
# One-hot encoding stays disabled for now:
# ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)), #Commented out because the categorical variables won't play nice with dummies between test/train. Retry when we do a full train model. Can impute values on test_data.csv if necessary.
categorical_pipeline = Pipeline(
    [("impute", SimpleImputer(strategy="most_frequent"))]
)

In [5]:
# Preprocessing for numeric columns: mean-impute missing values, then standardise.
numeric_pipeline = Pipeline(
    [
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
)

# Combined preprocessor: numeric columns go through numeric_pipeline, every other
# column goes through categorical_pipeline.
# Columns are selected by dtype at fit time via make_column_selector. The previous
# version referenced num_cols / cat_cols, which in the cells shown here only exist as
# local variables inside the prep functions, so this cell failed with NameError on a
# fresh kernel.
# make_column_selector requires the input passed to fit/transform to be a pandas DataFrame.
full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, make_column_selector(dtype_include="number")),
        ("categorical", categorical_pipeline, make_column_selector(dtype_exclude="number")),
    ]
)

## Define functions for data prep

In [75]:
# Preparing the TRAINING data for creating the model.
def prep_df(df, target, target_to_drop, missing_threshold=0.4):
    """Turn the joined statement/label frame into model-ready arrays.

    Steps, in order: keep the first of any repeated column names, drop the
    identifier/helper columns, drop columns whose share of missing values is at
    least ``missing_threshold``, dummy-encode the predictors, then run them
    through ``numeric_pipeline`` / ``categorical_pipeline``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id``, ``row_number``, ``last_statement_flag_drop``,
        ``last_statement_flag``, ``s_2``, ``target`` and ``target_to_drop``
        columns (a missing one raises ``KeyError``). The caller's frame is not
        modified.
    target : str
        Name of the column used as the label.
    target_to_drop : str
        Name of the alternative label column to discard.
    missing_threshold : float, default 0.4
        Columns with a fraction of missing values >= this value are dropped.

    Returns
    -------
    X_processed
        Output of ``ColumnTransformer.fit_transform`` on the predictors
        (numeric block first, then the non-numeric block).
    y_processed : numpy.ndarray
        Label column after most-frequent imputation, shape ``(n_rows, 1)``.
    cols_list : list of str
        Lower-cased predictor column names after dummy encoding, in the order
        of the frame that was fed to the transformer.

    Side effects
    ------------
    Prints the shape of ``X_processed``.
    """
    # The SQL join returns some column names twice (e.g. customer_id); keep the first of each.
    features = df.loc[:, ~df.columns.duplicated()]

    # Drop identifier / helper columns and the label that is not being modelled.
    features = features.drop(
        columns=[
            "customer_id",
            "row_number",
            "last_statement_flag_drop",
            "last_statement_flag",
            "s_2",
            target_to_drop,
        ]
    )

    # Drop columns that are too sparse to be useful.
    missing_props = features.isna().mean(axis=0)
    too_sparse = missing_props[missing_props >= missing_threshold].index
    features = features.drop(columns=too_sparse)

    # Split into predictors and target
    X = features.drop(columns=[target])
    y = features[target]

    X = pd.get_dummies(X, drop_first=True)
    X.columns = X.columns.str.lower()
    cols_list = X.columns.tolist()

    # Split categorical and numerical columns
    cat_cols = X.select_dtypes(exclude="number").columns
    num_cols = X.select_dtypes(include="number").columns

    full_processor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_pipeline, num_cols),
            ("categorical", categorical_pipeline, cat_cols),
        ]
    )

    # Apply preprocessing
    X_processed = full_processor.fit_transform(X)
    print(X_processed.shape)
    # SimpleImputer needs 2-D input, hence the reshape; the result stays 2-D.
    y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
        y.values.reshape(-1, 1)
    )
    return X_processed, y_processed, cols_list
        
    

In [30]:
# Preparing the test_data.csv so its values can be fed into the built model.
def prep_test_df(df, keep_cols):
    """Select the model's columns from ``df`` and preprocess them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``keep_cols``.
    keep_cols : list of str
        Columns to keep, in the order they should be fed to the transformer.

    Returns
    -------
    Output of ``ColumnTransformer.fit_transform`` on the selected,
    dummy-encoded columns.

    NOTE(review): the imputers and scaler are fitted on the frame passed in
    here, not on the training data - confirm this is intended.
    """
    X = pd.get_dummies(df[keep_cols], drop_first=True)

    # Route columns to the numeric or the categorical pipeline by dtype.
    num_cols = X.select_dtypes(include="number").columns
    cat_cols = X.select_dtypes(exclude="number").columns

    processor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_pipeline, num_cols),
            ("categorical", categorical_pipeline, cat_cols),
        ]
    )

    return processor.fit_transform(X)

In [31]:
# Deprecated due to iterating over pre-split files.
# Now we just read in chunks from the entire test_data.csv file.

%%script false
def score_split_files(directory, model, keep_cols, test_data_col_names):
    mdf = pd.DataFrame(columns=['customer_id', 's_2', 'pred', 'proba'])
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
        if os.path.isfile(f):
            print("Working on " + f)
            df_pred = pd.read_csv(f)
            df_pred.columns = test_data_col_names
            X_processed_test = prep_test_df(df_pred, keep_cols=keep_cols)
            preds = model.predict(X_processed_test)
            proba = model.predict_proba(X_processed_test)
            df_c = df_pred[['customer_id', 's_2']]
            df_c = pd.concat([df_c, pd.DataFrame(preds, columns=['pred']), pd.DataFrame(proba, columns=['proba_inv', 'proba'])], axis=1)
            mdf = pd.concat([mdf, df_c])
            del [[df_c,df_pred]]
    return mdf

UsageError: Line magic function `%%script` not found.


In [32]:
# Feeding the test_data into the model, tabulating results, and building a df.
# The caller is responsible for saving the returned df to a .csv file.
def score_split_files(path, model, keep_cols, split_num_lines=500000):
    """Score a large CSV in chunks and return one frame of predictions.

    The file is streamed once with ``pd.read_csv(..., chunksize=...)`` rather
    than re-opened with a growing ``skiprows`` for each chunk, and the per-chunk
    results are concatenated a single time at the end.

    Parameters
    ----------
    path : str or path-like
        CSV file with a header row. It must contain ``customer_id``, ``s_2``
        and every column in ``keep_cols`` (compared after lower-casing the header).
    model
        Fitted binary classifier exposing ``predict`` and ``predict_proba``.
    keep_cols : list of str
        Columns passed to ``prep_test_df`` for each chunk.
    split_num_lines : int, default 500000
        Number of data rows per chunk.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``s_2``, ``pred``, ``proba``, ``proba_inv``.
        The index restarts at 0 for every chunk. If the file holds no data
        rows, an empty frame with the first four columns is returned.
    """
    empty_columns = ['customer_id', 's_2', 'pred', 'proba']
    scored_chunks = []

    try:
        reader = pd.read_csv(path, chunksize=split_num_lines)
    except pd.errors.EmptyDataError:
        # Completely empty file: nothing to score.
        return pd.DataFrame(columns=empty_columns)

    try:
        for df_chunk in reader:
            if df_chunk.empty:
                # Header-only input; fitting the preprocessors on zero rows would fail.
                continue

            # Restart the index at 0 so the prediction frames built below line up row by row.
            df_chunk = df_chunk.reset_index(drop=True)
            df_chunk.columns = df_chunk.columns.str.lower()

            # Each chunk is preprocessed independently by prep_test_df.
            X_processed_test = prep_test_df(df_chunk, keep_cols=keep_cols)
            preds = model.predict(X_processed_test)
            proba = model.predict_proba(X_processed_test)

            # Identifier columns from the raw chunk, followed by the model outputs.
            scored = pd.concat(
                [
                    df_chunk[['customer_id', 's_2']],
                    pd.DataFrame(preds, columns=['pred']),
                    pd.DataFrame(proba, columns=['proba_inv', 'proba']),
                ],
                axis=1,
            )
            scored_chunks.append(scored)
    finally:
        reader.close()

    if not scored_chunks:
        return pd.DataFrame(columns=empty_columns)

    mdf = pd.concat(scored_chunks)
    # Same column order the incremental concat used to produce.
    return mdf[['customer_id', 's_2', 'pred', 'proba', 'proba_inv']]

## Build the model with training data

In [33]:
# Build the training arrays with `target` as the label (every statement of a
# defaulting customer is labelled 1); the last-statement label is discarded.
X_processed, y_processed, cols_list = prep_df(
    df,
    target='target',
    target_to_drop='last_statement_target',
)

(120488, 155)


In [24]:
# Hold-out split, stratified on the label and seeded with rand_state.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y_processed,
    stratify=y_processed,
    random_state=rand_state,
)

In [25]:

# Baseline: XGBoost classifier with default hyper-parameters, fitted on the training split.
# NOTE(review): the split above is row-level and customer_id is dropped in prep_df, so
# statements of one customer can fall on both sides of the split - the score below may
# be optimistic; confirm.
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)

# Hard labels and class probabilities for the held-out rows.
preds = xgb_cl.predict(X_test)
proba = xgb_cl.predict_proba(X_test)

# Hold-out accuracy, shown as the cell output.
accuracy_score(y_test, preds)

0.9218511387026094

# Predict test data, every statement flagged as target:

In [78]:
# Rebuild the training arrays with `target` as the label for the full-data fit.
X_processed, y_processed, cols_list = prep_df(
    df,
    target='target',
    target_to_drop='last_statement_target',
)

(10000, 163)


In [35]:
# Default-parameter XGBoost classifier, fitted on all prepared rows (no hold-out).
xgb_cla = xgb.XGBClassifier()
xgb_cla.fit(X_processed, y_processed)

In [36]:
# Location of test_data.csv. Set the AMEX_TEST_DATA_PATH environment variable to
# point at a different copy; otherwise the original local path is used.
path = os.environ.get(
    "AMEX_TEST_DATA_PATH",
    r'C:\Users\joebu\programming_directory\DSBA_6156_SERJ\ignore\test_data.csv',
)

# Score the test file with the model trained on `target` and save the predictions.
results_df = score_split_files(path=path, model=xgb_cla, keep_cols=cols_list)
results_df.to_csv('./ignore/XGB_target.csv')

# Predict test data, last_statement_target as outcome:
Here we will only flag the last statement as default for each customer that did default.

In [81]:
# Same rows as before, but the label is now last_statement_target; `target` is discarded.
X_processed, y_processed, cols_list = prep_df(
    df,
    target='last_statement_target',
    target_to_drop='target',
)

(10000, 163)


In [39]:
# Re-create the classifier and fit it against the last_statement_target label.
xgb_cla = xgb.XGBClassifier()
xgb_cla.fit(X_processed, y_processed)

In [40]:
# Score the test file with the last_statement_target model and save the predictions.
results_df = score_split_files(
    path=path,
    model=xgb_cla,
    keep_cols=cols_list,
)
results_df.to_csv('./ignore/XGB_last_statement_target.csv')

# Only training on last statements:
Here we will only consider the last statements when training the model, and drop all the other statements.

In [89]:
# Keep only each customer's most recent statement.
ls_df = df.loc[df['last_statement_flag'].eq(1)]

# Prepare that subset with last_statement_target as the label.
X_processed, y_processed, cols_list = prep_df(
    ls_df,
    target='last_statement_target',
    target_to_drop='target',
)

(10000, 156)


In [90]:
# Fresh default-parameter classifier, fitted on the last-statement rows only.
xgb_cla = xgb.XGBClassifier()
xgb_cla.fit(X_processed, y_processed)

In [91]:
# Score the test file with the last-statements-only model and save the predictions.
results_df = score_split_files(
    path=path,
    model=xgb_cla,
    keep_cols=cols_list,
)
results_df.to_csv('./ignore/XGB_only_last_statements.csv')