# Exploring how to flag the target for the data

In [211]:
# Dependencies:
# Data Wrangling:
import pandas as pd
from sqlalchemy import create_engine

# Modeling Packages:
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer

import xgboost as xgb
import datetime

import os
import glob
# Seed passed as `random_state` to train_test_split further down, so the
# train/test partition is the same on every run.
rand_state = 1337

In [365]:
# Load every statement row joined to its customer-level label.
#
# The connection URL is read from the AMEX_DB_URL environment variable instead
# of being written into the notebook. Expected form:
#   postgresql://<user>:<password>@<host>/postgres
# NOTE(review): the password that used to be committed in this cell should be
# treated as leaked and rotated.
db_url = os.environ.get("AMEX_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the AMEX_DB_URL environment variable to the PostgreSQL "
        "connection URL (postgresql://<user>:<password>@<host>/postgres)."
    )

engine = create_engine(db_url)

# The query reads TRAIN_DATA_random and train_labels_random and adds, per
# customer_id:
#   * an unaliased ROW_NUMBER ordered by s_2 ascending (it reaches pandas as
#     "row_number", the name prep_df drops it under),
#   * last_statement_flag_drop: ROW_NUMBER ordered by s_2 descending,
#   * last_statement_flag: 1 where last_statement_flag_drop = 1, else 0,
#   * last_statement_target: 1 only where target = 1 on that same row, else 0.
# SELECT * over the join returns customer_id from both sides.
mdf = pd.read_sql("""
                 WITH BASE AS (
                    SELECT *
                    ,ROW_NUMBER() OVER      (
                                            PARTITION BY customer_id 
                                            ORDER BY s_2
                                            )
                    ,ROW_NUMBER() OVER      (
                                            PARTITION BY customer_id
                                            ORDER BY s_2 DESC
                                            ) last_statement_flag_drop
                    FROM TRAIN_DATA_random
                    )


                    SELECT *
                    ,CASE WHEN last_statement_flag_drop = 1 then 1 else 0 end as last_statement_flag
                    ,CASE WHEN (target = 1 AND last_statement_flag_drop = 1) then 1 else 0 end as last_statement_target
                    FROM BASE B
                    LEFT JOIN train_labels_random L
                    ON B.customer_id = L.customer_id
                 """, engine) 


# Testing positive flag on all statements, or only last statement

In [401]:
# Working frame: the query result without the d_63 and d_64 columns.
# drop() returns a new DataFrame, so `mdf` keeps every original column.
df = mdf.drop(columns=['d_63', 'd_64'])

Define the imputation and scaling pipelines for the categorical and numerical columns

In [402]:

# Non-numeric columns: fill missing entries with the column's most frequent
# value. No encoder step is active here; prep_df and prep_test_df call
# pd.get_dummies before this pipeline runs.
categorical_pipeline = Pipeline(
    [("impute", SimpleImputer(strategy="most_frequent"))]
)

In [403]:

# Numeric columns: replace missing entries with the column mean, then pass
# the result through StandardScaler.
numeric_pipeline = Pipeline(
    [
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
)

In [404]:

# num_cols / cat_cols only exist as locals inside prep_df and prep_test_df, so
# referencing them at notebook level raised NameError on a fresh kernel.
# dtype-based selectors choose the same two groups (numeric vs. everything
# else) from whatever DataFrame the transformer is fitted on.
from sklearn.compose import make_column_selector

full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, make_column_selector(dtype_include="number")),
        ("categorical", categorical_pipeline, make_column_selector(dtype_exclude="number")),
    ]
)

## Define functions for data prep

In [405]:
def prep_df(df, target, target_to_drop, missing_threshold=0.4):
    """Turn the joined statement frame into arrays ready for model fitting.

    Steps, in order:
      1. drop columns whose values exactly repeat an earlier column,
      2. drop the identifier / helper columns and the unused label variant,
      3. drop columns whose share of missing values is at least
         ``missing_threshold``,
      4. one-hot encode with ``pd.get_dummies(drop_first=True)``,
      5. impute and scale through ``numeric_pipeline`` and
         ``categorical_pipeline``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id``, ``row_number``,
        ``last_statement_flag_drop``, ``last_statement_flag``, ``s_2``,
        ``target`` and ``target_to_drop``. The caller's frame is not modified.
    target : str
        Name of the label column to predict.
    target_to_drop : str
        Name of the alternative label column to discard.
    missing_threshold : float, default 0.4
        Columns with a missing-value share greater than or equal to this are
        removed.

    Returns
    -------
    X_processed
        Output of ``ColumnTransformer.fit_transform`` on the encoded features.
    y_processed : numpy.ndarray
        Label column as an ``(n_rows, 1)`` array; missing labels are replaced
        by the most frequent label.
    cols_list : list of str
        Feature column names after ``pd.get_dummies``.

    Notes
    -----
    The imputers and scaler are fitted on every row passed in. When the caller
    splits into train/test afterwards, those statistics include the held-out
    rows.
    """
    # Keep only the first of any group of columns holding identical values.
    is_repeat = df.T.duplicated(keep='first')
    features = df.loc[:, ~is_repeat]

    # Identifiers, row-number helpers, the statement date and the label
    # variant that is not being modelled.
    helper_cols = [
        "customer_id",
        "row_number",
        "last_statement_flag_drop",
        "last_statement_flag",
        "s_2",
        target_to_drop,
    ]
    features = features.drop(columns=helper_cols)

    # Remove columns that are too sparse to be useful.
    missing_share = features.isna().mean(axis=0)
    too_sparse = missing_share[missing_share >= missing_threshold].index
    features = features.drop(columns=too_sparse)

    # Split into predictors and label, then one-hot encode the predictors.
    y = features[target]
    X = pd.get_dummies(features.drop(columns=[target]), drop_first=True)
    cols_list = X.columns.tolist()

    # Route columns to the matching pipeline by dtype.
    cat_cols = X.select_dtypes(exclude="number").columns
    num_cols = X.select_dtypes(include="number").columns
    full_processor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_pipeline, num_cols),
            ("categorical", categorical_pipeline, cat_cols),
        ]
    )

    X_processed = full_processor.fit_transform(X)
    print(X_processed.shape)

    # SimpleImputer needs 2-D input, hence the single-column reshape.
    y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
        y.values.reshape(-1, 1)
    )
    return X_processed, y_processed, cols_list
        
    

In [406]:
def prep_test_df(df, keep_cols):
    """Preprocess a frame of rows to be scored.

    Keeps only ``keep_cols``, applies ``pd.get_dummies(drop_first=True)``,
    then runs numeric columns through ``numeric_pipeline`` and all other
    columns through ``categorical_pipeline``. Returns the transformed array.

    NOTE(review): a new ColumnTransformer is fitted on ``df`` itself, so the
    imputation values and scaling statistics come from this frame rather than
    from the data the model was trained on -- confirm this is intended.
    """
    # Restrict to the model's feature columns and one-hot encode.
    selected = pd.get_dummies(df[keep_cols], drop_first=True)

    # Route columns to the matching pipeline by dtype.
    non_numeric = selected.select_dtypes(exclude="number").columns
    numeric = selected.select_dtypes(include="number").columns

    processor = ColumnTransformer(
        transformers=[
            ("numeric", numeric_pipeline, numeric),
            ("categorical", categorical_pipeline, non_numeric),
        ]
    )
    return processor.fit_transform(selected)

## Run the model against a test/train split

In [407]:
# Build the model inputs: `target` is the label to predict and
# `last_statement_target` is the alternative label column to discard.
X_processed, y_processed, cols_list = prep_df(
    df,
    target='target',
    target_to_drop='last_statement_target',
)

(120488, 155)


In [408]:
# Stratified train/test partition (default split size) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y_processed,
    random_state=rand_state,
    stratify=y_processed,
)

In [409]:

# Baseline: XGBoost with default hyper-parameters, fitted on the training
# partition only.
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)

# Per-class probabilities and hard labels for the held-out partition.
proba = xgb_cl.predict_proba(X_test)
preds = xgb_cl.predict(X_test)

# Cell output: share of held-out rows whose predicted label is correct.
accuracy_score(y_true=y_test, y_pred=preds)

0.9218511387026094

# Predict test data:

In [410]:
# Fresh default XGBoost classifier fitted on every labelled row; this is the
# model passed to score_split_files further down.
xgb_cla = xgb.XGBClassifier()
xgb_cla.fit(X=X_processed, y=y_processed)

In [458]:
def score_split_files(directory, model, keep_cols, test_data_col_names):
    """Score every file in ``directory`` and return the stacked predictions.

    Each regular file is read with ``pd.read_csv``, its columns are renamed to
    ``test_data_col_names``, it is preprocessed with ``prep_test_df`` and then
    scored with ``model.predict`` and ``model.predict_proba``.

    Parameters
    ----------
    directory : str
        Folder holding the CSV chunks; sub-directories are skipped.
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``; the
        latter must return two columns.
    keep_cols : list of str
        Feature columns handed to ``prep_test_df``.
    test_data_col_names
        Column names assigned to every chunk after reading.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``s_2``, ``pred``, ``proba``, ``proba_inv``.
        Each chunk keeps its own row index. If the directory contains no
        files, an empty frame with the first four columns is returned.

    NOTE(review): ``pd.read_csv`` treats the first line of each file as a
    header before the names are overwritten -- confirm the split files carry
    a header row, otherwise the first record of every chunk is lost.
    """
    result_cols = ['customer_id', 's_2', 'pred', 'proba', 'proba_inv']
    scored_chunks = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order, and so the output row order, reproducible.
    for filename in sorted(os.listdir(directory)):
        f = os.path.join(directory, filename)
        if not os.path.isfile(f):
            continue

        print("Working on " + f)
        df_pred = pd.read_csv(f)
        df_pred.columns = test_data_col_names
        display(df_pred.head(5))
        print(df_pred.shape)

        X_processed_test = prep_test_df(df_pred, keep_cols=keep_cols)
        print(X_processed_test.shape)

        preds = model.predict(X_processed_test)
        proba = model.predict_proba(X_processed_test)

        # Identifier columns next to the hard label and both class
        # probabilities for the same rows.
        df_c = pd.concat(
            [
                df_pred[['customer_id', 's_2']],
                pd.DataFrame(preds, columns=['pred']),
                pd.DataFrame(proba, columns=['proba_inv', 'proba']),
            ],
            axis=1,
        )
        display(df_c.head(20))

        # Collect the chunk and concatenate once after the loop instead of
        # re-copying the growing result on every iteration.
        scored_chunks.append(df_c[result_cols])

        # Release the raw chunk before the next file is read.
        del df_pred

    if not scored_chunks:
        return pd.DataFrame(columns=['customer_id', 's_2', 'pred', 'proba'])
    return pd.concat(scored_chunks)

In [460]:
# Folder holding the competition files. Override it with the AMEX_DATA_DIR
# environment variable; the default is the original author's local folder.
data_dir = os.environ.get(
    "AMEX_DATA_DIR",
    r'C:\Users\joebu\programming_directory\large_data_files\amex-default-prediction',
)
directory = os.path.join(data_dir, 'split_files')

# Only the header row of the full test file is read (nrows=0); its column
# names are lower-cased and applied to every split file.
test_data_col_names = pd.read_csv(
    os.path.join(data_dir, 'test_data.csv'), nrows=0, index_col=False
).columns.str.lower()


results_df = score_split_files(directory, model=xgb_cla, keep_cols=cols_list, test_data_col_names=test_data_col_names)
results_df.to_csv('testytest.csv')

Working on C:\Users\joebu\programming_directory\large_data_files\amex-default-prediction\split_files\test_data_0.csv
