# Exploring how to flag the target for the data

In [1]:
# Dependencies:
# Data Wrangling:
import pandas as pd
from sqlalchemy import create_engine

# Modeling Packages:
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer

import xgboost as xgb
import datetime

import os
import glob
rand_state = 1337

In [56]:
# PostgreSQL cloud server credentials:
# server ip: 34.75.124.150
# username: user
# password: DeEJNEAhy
# Data is in materialized views train_data and train_labels
engine = create_engine('postgresql://user:DeEJNEAhy@34.75.124.150/postgres')
sql_df = pd.read_sql("""
                 WITH BASE AS (
                    SELECT *
                    ,ROW_NUMBER() OVER      (
                                            PARTITION BY customer_id 
                                            ORDER BY s_2
                                            )
                    ,ROW_NUMBER() OVER      (
                                            PARTITION BY customer_id
                                            ORDER BY s_2 DESC
                                            ) last_statement_flag_drop
                    FROM TRAIN_DATA_random
                    )


                    SELECT *
                    ,CASE WHEN last_statement_flag_drop = 1 then 1 else 0 end as last_statement_flag
                    ,CASE WHEN (target = 1 AND last_statement_flag_drop = 1) then 1 else 0 end as last_statement_target
                    FROM BASE B
                    LEFT JOIN train_labels_random L
                    ON B.customer_id = L.customer_id
                 """, engine) 


# Testing positive flag on all statements, or only last statement

In [29]:
df = sql_df
df.head(5)
# Dropping columns because the dummy variables 
df = df.drop(labels=['d_63', 'd_64'], axis=1)
df.head()
df.shape[0]

NameError: name 'sql_df' is not defined

In [2]:
df = pd.read_feather(r'C:\Users\joebu\programming_directory\DSBA_6156_SERJ\ignore\train_data.feather')

df.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938477,0.001734,0.008728,1.006836,0.009224,0.124023,0.008774,0.004707,...,,,0.002426,0.003706,0.003819,,0.000569,0.00061,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936523,0.005775,0.004925,1.000977,0.006153,0.126709,0.000798,0.002714,...,,,0.003956,0.003166,0.005032,,0.009575,0.005493,0.009216,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.954102,0.091492,0.021652,1.009766,0.006817,0.123962,0.007599,0.009422,...,,,0.003269,0.007328,0.000427,,0.003429,0.006985,0.002604,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960449,0.002455,0.013687,1.00293,0.001372,0.117188,0.000685,0.005531,...,,,0.006119,0.004517,0.003201,,0.008423,0.006527,0.009598,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947266,0.002483,0.01519,1.000977,0.007607,0.11731,0.004654,0.009308,...,,,0.003672,0.004944,0.008888,,0.00167,0.008125,0.009827,0


In [3]:
df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(int)
df['last_statement_target'] = np.where((df['statement_num'] == 1) & (df['target'] == 1), 1, 0)
df.columns= df.columns.str.lower()
df = df.drop(labels=['d_63', 'd_64', 'd_68', 'b_30', 'd_117', 'd_120', 'd_126', 'b_38', 'd_114', 'd_116', 'd_42'], axis=1)

Do all imputation and categorical/numerical 

In [4]:
# Defining the categorical imputation and one-hot encoder for categorical variables.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        #  ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)), #Commented out because the categorical variables won't play nice with dummies between test/train. Retry when we do a full train model. Can impute values on test_data.csv if necessary.
    ]
)

In [5]:
# defining the numerical imputation and standard scaler for numerical variables.
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), 
           ("scale", StandardScaler())]
)

# Executing the full_processor in one-step for numerical and categorical pipelines.

full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
)

## Define functions for data prep

In [6]:
# Preparing the TRAINING data for creating the model.
def prep_df(df, target, target_to_drop):
    # Set index
    df = df.loc[:,~df.columns.duplicated()]

    # Drop unecessary columns
    df = df.drop(columns=["customer_id", "statement_num", "s_2", target_to_drop])

    # Missing values handling
    missing_props = df.isna().mean(axis=0)
    

    over_threshold = missing_props[missing_props >= 0.4]
    over_threshold


    df.drop(over_threshold.index, 
            axis=1, 
            inplace=True)

    
    
    # Split into predictors and target
    X = df.drop(columns=[target])
    y = df[target]
    
    X = pd.get_dummies(X, drop_first=True)
    X.columns = X.columns.str.lower()
    cols_list = X.columns.tolist()
    
    # Split categorical and numerical columns
    cat_cols = X.select_dtypes(exclude="number").columns
    num_cols = X.select_dtypes(include="number").columns
    
    full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
    )   
    
    # Apply preprocessing
    X_processed = full_processor.fit_transform(X)
    print(X_processed.shape)
    y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
            y.values.reshape(-1, 1)
            )
    return X_processed, y_processed, cols_list
        
    

In [7]:
# Preparing the test_data.csv so it's values can be fed into the built model.
def prep_test_df(df, keep_cols):
    
    # Handling case-sensitivity
    keep_cols = keep_cols
    # # Drop columns not used in model training
    
    # df = df[keep_cols]
    
    
    X = df
    
    # Split categorical and numerical columns
    cat_cols = X.select_dtypes(exclude="number").columns
    num_cols = X.select_dtypes(include="number").columns
    
    full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
    )
    X = X[keep_cols]
    # Apply preprocessing
    X_processed_test = full_processor.fit_transform(X)
    return X_processed_test

In [12]:
# Deprecated due to iterating over pre-split files.
# Now we just read in chunks from the entire test_data.csv file.

%%script false
def score_split_files(directory, model, keep_cols, test_data_col_names):
    mdf = pd.DataFrame(columns=['customer_id', 's_2', 'pred', 'proba'])
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        # checking if it is a file
        if os.path.isfile(f):
            print("Working on " + f)
            df_pred = pd.read_csv(f)
            df_pred.columns = test_data_col_names
            X_processed_test = prep_test_df(df_pred, keep_cols=keep_cols)
            preds = model.predict(X_processed_test)
            proba = model.predict_proba(X_processed_test)
            df_c = df_pred[['customer_id', 's_2']]
            df_c = pd.concat([df_c, pd.DataFrame(preds, columns=['pred']), pd.DataFrame(proba, columns=['proba_inv', 'proba'])], axis=1)
            mdf = pd.concat([mdf, df_c])
            del [[df_c,df_pred]]
    return mdf

UsageError: Line magic function `%%script` not found.


In [13]:
# Feeding the test_data into the model, tabulating results, and building a df. Then saving the df to a .csv file.
def score_split_files(path, model, keep_cols, split_num_lines=3500000):
    current_position = 0 #defines starting position and keeps track of where in file to read
    df_columns = None #object to hold the col names collected from the first df chunk
    
    # Define the result mdf
    mdf = pd.DataFrame(columns=['customer_id', 's_2', 'pred', 'proba'])
    
    # Get chunks from the test_data.csv and send them to the model
    while True:
        try:
            df_chunk = pd.read_csv(path, skiprows=current_position, nrows=split_num_lines)
            df_chunk.columns = df_chunk.columns.str.lower()
            if current_position == 0:
                df_columns = df_chunk.columns
            else:
                df_chunk.columns = df_columns

            # Function to prep the test_data
            X_processed_test = prep_test_df(df_chunk, keep_cols=keep_cols)
            # Predicting outcomes from test_data
            preds = model.predict(X_processed_test)
            #Predicting probabilities from test_data
            proba = model.predict_proba(X_processed_test)
            # Creating df to concat later. Getting date and customer_id from original df read in from .csv
            df_c = df_chunk[['customer_id', 's_2']]
            # Concating the np arrays to df_c
            df_c = pd.concat([df_c, pd.DataFrame(preds, columns=['pred']), pd.DataFrame(proba, columns=['proba_inv', 'proba'])], axis=1)
            mdf = pd.concat([mdf, df_c])
            # Deleting the temp dfs to free up memory.
            del [[df_c,df_chunk]]

            current_position += split_num_lines #increments position by chunk size for the next loop
        except pd.errors.EmptyDataError:
            break
        
    
    
    
    return mdf

In [8]:
# Feeding the test_data into the model, tabulating results, and building a df. Then saving the df to a .csv file.
def score_feather_file(file, model, keep_cols, split_num_lines=3500000):
    df_columns = None #object to hold the col names collected from the first df chunk
    
    # Define the result mdf
    mdf = pd.DataFrame(columns=['customer_id', 's_2', 'pred', 'proba'])
    
    df_s = pd.read_feather(file)
    df_s.columns = df_s.columns.str.lower()
    # Function to prep the test_data
    X_processed_test = prep_test_df(df_s, keep_cols=keep_cols)
    # Predicting outcomes from test_data
    preds = model.predict(X_processed_test)
    #Predicting probabilities from test_data
    proba = model.predict_proba(X_processed_test)
    mdf = pd.concat([mdf, pd.DataFrame(preds, columns=['pred']), pd.DataFrame(proba, columns=['proba_inv', 'proba'])], axis=1)     
    
    return mdf

## Build the model with training data

In [33]:
# Prep the dataframe
X_processed, y_processed, cols_list = prep_df(df, target='target', target_to_drop='last_statement_target')

(120488, 155)


In [24]:
# Prepare the test/train split
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, stratify=y_processed, random_state=rand_state
)

In [25]:

# Run the model

# Init classifier
xgb_cl = xgb.XGBClassifier()

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)
proba = xgb_cl.predict_proba(X_test)

# Score
accuracy_score(y_test, preds)

0.9218511387026094

# Predict test data, every statement flagged as target:

In [58]:
# Prep the dataframe
X_processed, y_processed, cols_list = prep_df(df, target='target', target_to_drop='last_statement_target')

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


(5531451, 13)


In [97]:
# Init classifier
xgb_cla = xgb.XGBClassifier()

# Fit
xgb_cla.fit(X_processed, y_processed)

In [59]:
path = r'C:\Users\joebu\programming_directory\DSBA_6156_SERJ\ignore\test_data.csv'
# test_data_col_names = pd.read_csv(r'C:\Users\joebu\programming_directory\large_data_files\amex-default-prediction\test_data.csv', nrows=0, index_col=False).columns.str.lower()


results_df = score_feather_file(path=path, model=xgb_cla, keep_cols=cols_list)
results_df.to_csv('./ignore/XGB_target.csv')

TypeError: score_feather_file() got an unexpected keyword argument 'path'

In [53]:
test_data = pd.read_feather(r'C:\Users\joebu\programming_directory\DSBA_6156_SERJ\ignore\test_data.feather')
display(test_data.info(verbose=True))
train_data = pd.read_feather(r'C:\Users\joebu\programming_directory\DSBA_6156_SERJ\ignore\train_data.feather')
print(train_data.info(verbose=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Data columns (total 190 columns):
 #    Column       Dtype         
---   ------       -----         
 0    customer_ID  object        
 1    S_2          datetime64[ns]
 2    P_2          float16       
 3    D_39         float16       
 4    B_1          float16       
 5    B_2          float16       
 6    R_1          float16       
 7    S_3          float16       
 8    D_41         float16       
 9    B_3          float16       
 10   D_42         float16       
 11   D_43         float16       
 12   D_44         float16       
 13   B_4          float16       
 14   D_45         float16       
 15   B_5          float16       
 16   R_2          float16       
 17   D_46         float16       
 18   D_47         float16       
 19   D_48         float16       
 20   D_49         float16       
 21   B_6          float16       
 22   B_7          float16       
 23   B_8          float16       


In [54]:
display(test_data.info(verbose=True))
display(train_data.info(verbose=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Data columns (total 190 columns):
 #    Column       Dtype         
---   ------       -----         
 0    customer_ID  object        
 1    S_2          datetime64[ns]
 2    P_2          float16       
 3    D_39         float16       
 4    B_1          float16       
 5    B_2          float16       
 6    R_1          float16       
 7    S_3          float16       
 8    D_41         float16       
 9    B_3          float16       
 10   D_42         float16       
 11   D_43         float16       
 12   D_44         float16       
 13   B_4          float16       
 14   D_45         float16       
 15   B_5          float16       
 16   R_2          float16       
 17   D_46         float16       
 18   D_47         float16       
 19   D_48         float16       
 20   D_49         float16       
 21   B_6          float16       
 22   B_7          float16       
 23   B_8          float16       


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Data columns (total 191 columns):
 #    Column       Dtype         
---   ------       -----         
 0    customer_ID  object        
 1    S_2          datetime64[ns]
 2    P_2          float16       
 3    D_39         float16       
 4    B_1          float16       
 5    B_2          float16       
 6    R_1          float16       
 7    S_3          float16       
 8    D_41         float16       
 9    B_3          float16       
 10   D_42         float16       
 11   D_43         float16       
 12   D_44         float16       
 13   B_4          float16       
 14   D_45         float16       
 15   B_5          float16       
 16   R_2          float16       
 17   D_46         float16       
 18   D_47         float16       
 19   D_48         float16       
 20   D_49         float16       
 21   B_6          float16       
 22   B_7          float16       
 23   B_8          float16       
 2

None

# Predict test data, last_staement_target as outcome:
Here will will only flag the last statement as default for each customer that did default.

In [9]:
# Prep the data with last_statement_target as outcome
print(df.info())
X_processed, y_processed, cols_list = prep_df(df, target='last_statement_target', target_to_drop='target')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Columns: 182 entries, customer_id to last_statement_target
dtypes: category(2), datetime64[ns](1), float16(175), int32(2), int8(1), object(1)
memory usage: 2.0+ GB
None


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


(5531451, 12)


In [11]:
# Train the model with new outcome
# Init classifier
xgb_cla = xgb.XGBClassifier()

# Fit
xgb_cla.fit(X_processed, y_processed)

import pickle
file_name = "xgb_full.pkl"

# save
pickle.dump(xgb_cla, open(file_name, "wb"))

In [12]:
path = r'C:\Users\joebu\programming_directory\DSBA_6156_SERJ\ignore\test_data.feather'
# test_data_col_names = pd.read_csv(r'C:\Users\joebu\programming_directory\large_data_files\amex-default-prediction\test_data.csv', nrows=0, index_col=False).columns.str.lower()

# Read the model in from the .pkl file
import pickle
file_name = "xgb_full.pkl"
xgb_cla = pickle.load(open(file_name, "rb"))

results_df = score_feather_file(file=path, model=xgb_cla, keep_cols=cols_list)
results_df.to_csv('./ignore/XGB_target.csv')

ValueError: A given column is not a column of the dataframe

# Only training on last statements:
Here we will only consider the last statements when training the model, and drop all the other statements.

In [None]:
# Filter on only the lastest statement per customer
ls_df = df[df['last_statement_flag'] == 1]
ls_df.shape[0]

10000

In [None]:
# Prep the data with last_statement_target as outcome
X_processed, y_processed, cols_list = prep_df(ls_df, target='last_statement_target', target_to_drop='target')

(10000, 156)


In [None]:
# Init classifier
xgb_cla = xgb.XGBClassifier()

# Fit the model
xgb_cla.fit(X_processed, y_processed)

In [None]:
results_df = score_split_files(path=path, model=xgb_cla, keep_cols=cols_list)
results_df.to_csv('./ignore/XGB_only_last_statements.csv')