In [11]:
# Dependencies:
# Data Wrangling:
import pandas as pd
from sqlalchemy import create_engine

# Modeling Packages:
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer

import xgboost as xgb
import datetime

import os
import glob
rand_state = 1337

In [None]:
# PostgreSQL cloud server credentials:
# server ip: 34.75.124.150
# username: user
# password: DeEJNEAhy
# Data is in materialized views train_data and train_labels
engine = create_engine('postgresql://user:DeEJNEAhy@34.75.124.150/postgres')
sql_df = pd.read_sql("""
                 WITH BASE AS (
                    SELECT *
                    ,ROW_NUMBER() OVER      (
                                            PARTITION BY customer_id 
                                            ORDER BY s_2
                                            )
                    ,ROW_NUMBER() OVER      (
                                            PARTITION BY customer_id
                                            ORDER BY s_2 DESC
                                            ) last_statement_flag_drop
                    FROM TRAIN_DATA_random
                    )


                    SELECT *
                    ,CASE WHEN last_statement_flag_drop = 1 then 1 else 0 end as last_statement_flag
                    ,CASE WHEN (target = 1 AND last_statement_flag_drop = 1) then 1 else 0 end as last_statement_target
                    FROM BASE B
                    LEFT JOIN train_labels_random L
                    ON B.customer_id = L.customer_id
                 """, engine) 


In [3]:
# df = sql_df
df = pd.read_csv('df_savepoint.csv')
df.head(5)
# Dropping columns because the dummy variables 
# df = df.drop(labels=['d_63', 'd_64'], axis=1)
df.head()
df.shape[0]

2652099

In [15]:
# Defining the categorical imputation and one-hot encoder for categorical variables.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent"))
        # ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)), #Commented out because the categorical variables won't play nice with dummies between test/train. Retry when we do a full train model. Can impute values on test_data.csv if necessary.
    ]
)

In [16]:
# defining the numerical imputation and standard scaler for numerical variables.
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), 
           ("scale", StandardScaler())]
)

## Define functions for data prep

In [17]:
# Preparing the TRAINING data for creating the model.
def prep_df(df, target, target_to_drop):
    # Set index
    df = df.loc[:,~df.columns.duplicated()]

    # Drop unecessary columns
#     df = df.drop(columns=["customer_id", "row_number","last_statement_flag_drop","last_statement_flag", "s_2", target_to_drop])

    # Missing values handling
    missing_props = df.isna().mean(axis=0)
    

    over_threshold = missing_props[missing_props >= 0.4]
    over_threshold


    df.drop(over_threshold.index, 
            axis=1, 
            inplace=True)

    
    
    # Split into predictors and target
    X = df.drop(columns=[target])
    y = df[target]
    
    X = pd.get_dummies(X, drop_first=True)
    X.columns = X.columns.str.lower()
    cols_list = X.columns.tolist()
    
    # Split categorical and numerical columns
    cat_cols = X.select_dtypes(exclude="number").columns
    num_cols = X.select_dtypes(include="number").columns
    
    full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
    )   
    
    # Apply preprocessing
    X_processed = full_processor.fit_transform(X)
    print(X_processed.shape)
    y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
            y.values.reshape(-1, 1)
            )
    return X_processed, y_processed, cols_list
        
    

In [18]:
# Preparing the test_data.csv so it's values can be fed into the built model.
def prep_test_df(df, keep_cols):
    
    # Handling case-sensitivity
    keep_cols = keep_cols
    # # Drop columns not used in model training
    df = df[keep_cols]
    df = pd.get_dummies(df, drop_first=True)
    
    X = df
    
    # Split categorical and numerical columns
    cat_cols = X.select_dtypes(exclude="number").columns
    num_cols = X.select_dtypes(include="number").columns
    
    full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
    )
    
    # Apply preprocessing
    X_processed_test = full_processor.fit_transform(X)
    return X_processed_test

In [19]:
# Feeding the test_data into the model, tabulating results, and building a df. Then saving the df to a .csv file.
def score_split_files(path, model, keep_cols, split_num_lines=3500000):
    current_position = 0 #defines starting position and keeps track of where in file to read
    df_columns = None #object to hold the col names collected from the first df chunk
    
    # Define the result mdf
    mdf = pd.DataFrame(columns=['customer_id', 's_2', 'pred', 'proba'])
    
    # Get chunks from the test_data.csv and send them to the model
    while True:
        try:
            df_chunk = pd.read_csv(path, skiprows=current_position, nrows=split_num_lines)
            df_chunk.columns = df_chunk.columns.str.lower()
            if current_position == 0:
                df_columns = df_chunk.columns
            else:
                df_chunk.columns = df_columns

            # Function to prep the test_data
            # keep_cols_new = keep_cols.remove['last_statement_target']
            X_processed_test = prep_test_df(df_chunk, keep_cols=keep_cols)
            # Predicting outcomes from test_data
            preds = model.predict(X_processed_test)
            #Predicting probabilities from test_data
            proba = model.predict_proba(X_processed_test)
            # Creating df to concat later. Getting date and customer_id from original df read in from .csv
            df_c = df_chunk[['customer_id', 's_2']]
            # Concating the np arrays to df_c
            df_c = pd.concat([df_c, pd.DataFrame(preds, columns=['pred']), pd.DataFrame(proba, columns=['proba_inv', 'proba'])], axis=1)
            mdf = pd.concat([mdf, df_c])
            # Deleting the temp dfs to free up memory.
            del [[df_c,df_chunk]]

            current_position += split_num_lines #increments position by chunk size for the next loop
        except pd.errors.EmptyDataError:
            break
        
    
    
    
    return mdf

In [9]:
# Read in the column names we get from feature reduction
import pickle
with open("C:/Users/joebu/programming_directory/DSBA_6156_SERJ/data/names", "rb") as fp:   # Unpickling
    names = pickle.load(fp)

print(names)

names = names.tolist()
names.append('last_statement_target')
print(names)
remove_list = ['d_66_1', 'b_38_4', 'd_64_0', 'd_63_3', 'b_38_2', 'd_117_6', 'd_120_0', 'd_117_3']
remove_list_2 = ['last_statement_target']
names_pre = [i for i in names if i not in remove_list]
names_pre_no_t = [i for i in names_pre if i not in remove_list_2]

# df = df[names_pre]

print(names_pre_no_t)

0        p_2
2        b_1
31       r_4
9       d_44
21      d_51
       ...  
60      s_15
46      d_65
153    d_141
44      s_11
89      s_18
Name: name, Length: 68, dtype: object
['p_2', 'b_1', 'r_4', 'd_44', 'd_51', 'b_2', 'r_1', 'b_9', 'd_66_1', 'd_41', 'r_2', 'd_39', 'd_129', 's_3', 'r_3', 'd_112', 'b_38_4', 'd_64_0', 'd_111', 'b_4', 'd_43', 'r_27', 'd_49', 's_23', 'b_8', 'd_45', 'b_3', 'd_46', 'd_48', 'd_54', 'd_131', 'd_63_3', 'b_11', 'd_140', 'b_38_2', 'b_5', 'r_11', 'b_7', 'r_7', 'd_117_6', 'd_138', 'p_3', 'b_10', 'b_20', 'r_26', 'b_16', 'r_5', 'd_120_0', 'd_62', 's_7', 'r_12', 'd_72', 'd_117_3', 's_24', 'd_75', 'd_82', 'b_18', 's_5', 'd_52', 'd_86', 's_8', 'd_47', 's_26', 's_15', 'd_65', 'd_141', 's_11', 's_18', 'last_statement_target']
['p_2', 'b_1', 'r_4', 'd_44', 'd_51', 'b_2', 'r_1', 'b_9', 'd_41', 'r_2', 'd_39', 'd_129', 's_3', 'r_3', 'd_112', 'd_111', 'b_4', 'd_43', 'r_27', 'd_49', 's_23', 'b_8', 'd_45', 'b_3', 'd_46', 'd_48', 'd_54', 'd_131', 'b_11', 'd_140', 'b_5', 'r

In [10]:
df.to_csv('df_savepoint.csv')

In [11]:
# Prep the data with last_statement_target as outcome
X_processed, y_processed, cols_list = prep_df(df, target='last_statement_target', target_to_drop='target')

(2652099, 56)


In [13]:
# Train the model with new outcome
# Init classifier
xgb_cla = xgb.XGBClassifier()

# Fit
xgb_cla.fit(X_processed, y_processed)

import pickle
file_name = "xgb_reg.pkl"

# save
pickle.dump(xgb_cla, open(file_name, "wb"))

In [3]:
# Read the model in from the .pkl file
import pickle
file_name = "xgb_reg.pkl"
xgb_cla = pickle.load(open(file_name, "rb"))

In [20]:
path = r'C:\Users\joebu\programming_directory\DSBA_6156_SERJ\ignore\test_data.csv'
results_df = score_split_files(path=path, model=xgb_cla, keep_cols=names_pre_no_t)
results_df.to_csv('./ignore/XGB_last_statement_target.csv')

ValueError: Feature shape mismatch, expected: 56, got 60