In [103]:
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import datetime
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np


In [104]:
# class ImputeAsDataFrame(BaseEstimator, TransformerMixin):
#   def fit(self, X, y=None):
#     return self
#   def transform(self, df):
#     cat_cols = df.select_dtypes(exclude="number").columns
#     num_cols = df.select_dtypes(include="number").columns

#     for col in cat_cols:
#       df[col] = SimpleImputer(missing_values=None, strategy="most_frequent").fit_transform(df[[col]])
#     for col in num_cols:
#       df[col] = SimpleImputer(strategy="mean").fit_transform(df[[col]])

#     if df.isna().sum().sum() > 0:
#       print(f"WARNING: {df.isna().sum().sum()} nulls still exist after imputing.")
#     return df

In [105]:
#constants
rand_state = 1337
df_chunk_num_lines = 3500000 #number of lines you want in each df chunk - make this lower if you get a memory error

cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
for i in range(0,len(cat_cols)):
    cat_cols[i] = cat_cols[i].lower()


In [106]:
engine = create_engine('postgresql://user:DeEJNEAhy@34.75.124.150/postgres')

query = '''select train_data_random.*, train_labels_random.target
            from train_data_random
            inner join train_labels_random
            on train_data_random.customer_id = train_labels_random.customer_id;'''

df = pd.read_sql(query, engine)

for col in df[cat_cols]:
    print(df[col].unique())

[ 0.  2.  1. nan]
[ 3.  1.  2.  5.  4.  7.  6. nan]
[ 1. nan  0.]
[ 0. nan  1.]
[-1.  6.  4. nan  3.  5.  1.  2.]
[ 0. nan  1.]
[ 1.  0. nan -1.]
['CR' 'CO' 'CL' 'XZ' 'XM' 'XL']
['O' 'R' None 'U' '-1']
[nan  1.  0.]
[ 3.  4.  5. nan  6.  1.  2.  0.]


In [107]:
# class BackToDataFrame(BaseEstimator, TransformerMixin):
#   def __init__(self, cols=[]):
#     self.cols = cols
#   def fit(self, X, y=None):
#     return self
#   def fit_transform(self, X):
#     df = pd.DataFrame(X)
#     return df

In [108]:
#build list of columns with 50 percent missing values
percent_null = df.isnull().sum() / len(df) 
half_missing_cols = percent_null[percent_null > 0.5].index

In [109]:
def preprocess_data(df, label_cols=[], forced_cat_cols=[], drop_cols=[]):

  df = df.fillna(np.nan) #because SimpleImputer requires specification of the type of nan value, we use this generic to change all nan types to np.nan types
  df.columns= df.columns.str.lower()

  df[forced_cat_cols] = df[forced_cat_cols].astype(object)

  df = df.drop(columns=drop_cols)

  df_labels = df[label_cols]
  df = df.drop(label_cols,axis=1)

  cat_cols = df.select_dtypes(exclude="number").columns
  num_cols = df.select_dtypes(include="number").columns

  #impute num + cat
  for col in cat_cols:
    df[col] = SimpleImputer(strategy="most_frequent").fit_transform(df[[col]])
  for col in num_cols:
    df[col] = SimpleImputer(strategy="mean").fit_transform(df[[col]])
  if df.isna().sum().sum() > 0:
    print(f"WARNING: {df.isna().sum().sum()} nulls still exist after imputing.")

  #scale num
  for col in num_cols:
    df[col] = StandardScaler().fit_transform(df[[col]])

  df = pd.get_dummies(df)

  if len(label_cols)>0:
    return df, df_labels
  else:
    return df


In [110]:
#build model

y = df['target']
x = df.drop(columns=["target"])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=rand_state)

x_train, x_train_labels = preprocess_data(x_train, forced_cat_cols=cat_cols, label_cols=["customer_id","s_2"], drop_cols=half_missing_cols)


rf_all = RandomForestClassifier(random_state=rand_state)
rf_all.fit(x_train, y_train)

x_test, x_test_labels = preprocess_data(x_test, forced_cat_cols=cat_cols, label_cols=["customer_id","s_2"], drop_cols=half_missing_cols)

rf_all.score(x_test, y_test)

  uniques = Index(uniques)
  uniques = Index(uniques)


0.9141837496887708

In [111]:
rf_all_output = pd.DataFrame()

current_position = 0 #defines starting position and keeps track of where in file to read
df_columns = None #object to hold the col names collected from the first df chunk
while True:
    try:
        df_chunk = pd.read_csv(r'..\amex-default-prediction\test_data_last_statement.csv', skiprows=current_position, nrows=df_chunk_num_lines)
        df_chunk.columns = df_chunk.columns.str.lower() #convert all the column names to lowercase so that they match the column names from the database
        if current_position == 0:
            df_columns = df_chunk.columns
        else:
            df_chunk.columns = df_columns



        print(current_position)
        #you get DF chunks that you can do whatever with here

        df_chunk, df_chunk_labels = preprocess_data(df_chunk, forced_cat_cols=cat_cols, label_cols=["customer_id","s_2"], drop_cols=half_missing_cols)

        rf_all_prediction_chunk = rf_all.predict_proba(df_chunk)
        rf_all_prediction_chunk_output = pd.concat([df_chunk_labels,pd.DataFrame(rf_all_prediction_chunk,columns=["pred-inverse","prediction"])], axis=1)
        rf_all_prediction_chunk_output = rf_all_prediction_chunk_output.drop(columns=["pred-inverse"])
        
        rf_all_output = pd.concat([rf_all_output, rf_all_prediction_chunk_output],axis=1)

        current_position += df_chunk_num_lines #increments position by chunk size for the next loop
    except pd.errors.EmptyDataError:
        break

rf_all_output.to_csv(r"..\amex-default-prediction\rf_all_output.csv",index=False)

0


TypeError: preprocess_data() got an unexpected keyword argument 'dropcols'