In [143]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

In [144]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

def set_col_types(df):
    if "target" in df.columns:
        categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68', 'target']
    else:
        categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    df['customer_ID'] = df['customer_ID'].astype("string")
    
    for col in categorical_cols:
        df[col] = df[col].astype("string")
    df["S_2"] = pd.to_datetime(df['S_2'], format=r'%Y-%m-%d').astype('datetime64[ns]')
    df["B_31"] = df["B_31"].astype(np.int8)
    return df

def sync_cols(train_df, pred_df):
    for col in train_df.columns:
      if col not in pred_df.columns:
        print(col, "not in pred_df so adding - should always be categorical!")
        pred_df[col] = 0
    for col in pred_df.columns:
      if col not in train_df.columns:
        print(col, "not in train_df so dropping")
        pred_df = pred_df.drop(col, axis=1)
    return pred_df

In [145]:
df = pd.read_parquet(r'../../../amex-default-prediction/train_data.parquet')
df = set_col_types(df)

#reduce df for development !!!!! comment out line below for final model
#df = df[:100000]

df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8) #statement_num - 1 is last statement
df['statement_num_reverse'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=True).astype(np.int8) #reverse  - 1 is first statement

In [146]:
#ok so 91 customers only have 1 statement. lets isolate them
cust_only_one = (df['customer_ID'].value_counts() == 1) #creates series of customers with T/F value if they only have 1 statement
cust_only_one = cust_only_one[cust_only_one == True].index #filters series to only the True and leaving behind only the customer_ID index


df_cust_one_statement = df[df['customer_ID'].isin(cust_only_one)] #creates df of customers with only 1 statement
df = df[~df['customer_ID'].isin(cust_only_one)] #creates df of customers with more than 1 statement

del cust_only_one

In [147]:
#lets export customers with one statement so we don't have to worry about them anymore - we will run them though the last statement RF model
df_cust_one_statement.to_parquet(r'cust_one_statement_train.parquet')
del df_cust_one_statement

#now all we have in memory is "df" with all customers with multiple statements time to reduce them to first + last

In [148]:
df_last = df[df['statement_num'] == 1]
df_first = df[df['statement_num_reverse'] == 1]

df_last.reset_index(inplace=True, drop=True) #reset indexes so the DFs line up with each other
df_first.reset_index(inplace=True, drop=True) #reset indexes so the DFs line up with each other

#test to make sure the indexes match up
count = 0
if len(df_last) == len(df_first):
    for x in range(0, len(df_last)):
        if df_last['customer_ID'].iloc[x] != df_first['customer_ID'].iloc[x]:
            count += 1
        if df_last['target'].iloc[x] != df_first['target'].iloc[x]: #also going to make sure target matches up for good measure
            count += 1
else:
    print("length mismatch")
print(count, "mismatches")

if count == 0: #extract customer_ID label and target if they match
    labels = df_last['customer_ID']
    targets = df_last['target']

0 mismatches


In [149]:
#lets make a simple function for some clean up steps before we combine them
def indenpendent_clean_up(df):
    #make columns for date components
    df["Month"] = df["S_2"].dt.month.astype(np.int8)
    df["Day"] = df["S_2"].dt.day.astype(np.int8)
    df["Year"] = df["S_2"].dt.year.astype(np.int16) #year should fit in int8, but it doens't just changing to int16
    df = df.drop(["S_2"], axis=1)

    cat_columns = df.select_dtypes(include=["string"]).columns
    num_columns = df.select_dtypes(include="number").columns

    # I'm gonna impute nans in categorical variables with the angzarr character because we know nulls have meaning and because the angzarr is amazing
    for col in cat_columns:
        df[col].fillna("⍼", inplace=True)

    #impute using mean for numeric
    for col in num_columns:
        df[col].fillna(df[col].mean(), inplace=True)

    df.drop(inplace=True, columns=['customer_ID','statement_num_reverse','target'])

    return df

#the dfs match up so lets recombine them
df_first = indenpendent_clean_up(df_first)
df_last = indenpendent_clean_up(df_last)

#add suffix to columns in their respective frames
df_first = df_first.add_suffix('_first')
df_last = df_last.add_suffix('_last')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Month"] = df["S_2"].dt.month.astype(np.int8)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Day"] = df["S_2"].dt.day.astype(np.int8)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"] = df["S_2"].dt.year.astype(np.int16) #year should fit in int8, but it doens't just changing to int16

In [150]:
df_comb = pd.concat([df_first, df_last], axis=1) #combine dfs
df_comb = df_comb.sort_index(axis=1) # sorting by col name so I can visually see the expected lag scenarios

In [151]:
df_comb.info(verbose=True) #just making sure our column types are still correct

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453793 entries, 0 to 453792
Data columns (total 384 columns):
 #    Column               Dtype  
---   ------               -----  
 0    B_10_first           float32
 1    B_10_last            float32
 2    B_11_first           float32
 3    B_11_last            float32
 4    B_12_first           float32
 5    B_12_last            float32
 6    B_13_first           float32
 7    B_13_last            float32
 8    B_14_first           float32
 9    B_14_last            float32
 10   B_15_first           float32
 11   B_15_last            float32
 12   B_16_first           float32
 13   B_16_last            float32
 14   B_17_first           float32
 15   B_17_last            float32
 16   B_18_first           float32
 17   B_18_last            float32
 18   B_19_first           float32
 19   B_19_last            float32
 20   B_1_first            float32
 21   B_1_last             float32
 22   B_20_first           float32
 23   B_20_la

In [152]:
#I think we have enough data to actually dummy code every possible transition of our categorical data
# - for example of course we don't want to treat categorical values as numeric even if they look that way
#   if statement goes from "1.0" to "2.0", I'm going to make a lag value of "1.0-2.0"
#   then we will dummy code all of these transitions
#   this will give the tree model a way to branch on that exact statement transition for the categories

cat_columns = df_comb.select_dtypes(include=["string"]).columns.to_list()
num_columns = df_comb.select_dtypes(include="number").columns.to_list()

#at this point the feature cols for the 2 frames should be identical so we're going to reduce our selection list to these features without the _last or _first

def get_base_cols(col_list):
    final_list = []
    for col in col_list:
        if "_last" in col:
            final_list.append(col.replace("_last", ""))
    return final_list

cat_base_columns = get_base_cols(cat_columns)
num_base_columns = get_base_cols(num_columns)

#ok. now we should have a list of base columns that exist for both first and last statement that we can use for selection by simply adding either _first or _last in our column querys

In [153]:
#time for the fun stuff - engineering our lag features

for col in cat_base_columns:
    df_comb[col + "_cat_change"] = df_comb[col + "_first"] + "to" + df_comb[col + "_last"]

for col in num_base_columns:
    df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
    df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]

df_comb = df_comb.sort_index(axis=1) # sorting again, so I can spot check for issues

  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "

In [154]:
#output seems correct so lets get dummies and fit a model
df_comb = pd.get_dummies(df_comb)


In [155]:
train_cols = df_comb.columns #get train columns to use later. going to overwrite all train data with test data so I want this for sync cols
#the sync cols function I made uses DFs so I'm just gonna make this into an empty DF
train_cols = pd.DataFrame(columns=train_cols)

In [156]:

#not going to do train / test because the amex submission is score should tell me what I want to know
model = LGBMClassifier()
model.fit(df_comb, targets)

In [157]:
#no train test, but still curious to see how we actually fit though
y_pred = model.predict(df_comb)
print(classification_report(targets, y_pred, digits=5)) #unsurprisingly a perfect fit with so many dimensions - hopefully not overfit through - submission to kaggle will tell us

              precision    recall  f1-score   support

           0    0.93680   0.93597   0.93638    336684
           1    0.81637   0.81845   0.81741    117109

    accuracy                        0.90564    453793
   macro avg    0.87658   0.87721   0.87690    453793
weighted avg    0.90572   0.90564   0.90568    453793



In [158]:
#what about with amex metric - just checking to make sure my high degree of fit checks out
amex_test = targets.to_frame()
amex_test['target'] = amex_test['target'].astype('int')

amex_pred = model.predict_proba(df_comb)
amex_pred = pd.DataFrame(amex_pred,columns=["proba-inv","proba"])
amex_pred = amex_pred.drop(columns=['proba-inv'])
amex_pred = amex_pred.rename(columns={"proba":"prediction"})

amex_metric(amex_test, amex_pred)

0.7992227928244657

In [159]:
df = pd.read_parquet(r'../../../amex-default-prediction/test_data.parquet')
df = set_col_types(df)

#reduce df for development !!!!! comment out line below for final model
#df = df[:100000]

df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8) #statement_num - 1 is last statement
df['statement_num_reverse'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=True).astype(np.int8) #reverse  - 1 is first statement

In [160]:
#ok so 91 customers only have 1 statement. lets isolate them
cust_only_one = (df['customer_ID'].value_counts() == 1) #creates series of customers with T/F value if they only have 1 statement
cust_only_one = cust_only_one[cust_only_one == True].index #filters series to only the True and leaving behind only the customer_ID index


df_cust_one_statement = df[df['customer_ID'].isin(cust_only_one)] #creates df of customers with only 1 statement
df = df[~df['customer_ID'].isin(cust_only_one)] #creates df of customers with more than 1 statement

del cust_only_one

In [161]:
#lets export customers with one statement so we don't have to worry about them anymore - we will run them though the last statement RF model
df_cust_one_statement.to_parquet(r'cust_one_statement_test.parquet')
del df_cust_one_statement

#now all we have in memory is "df" with all customers with multiple statements time to reduce them to first + last

In [162]:
df_last = df[df['statement_num'] == 1]
df_first = df[df['statement_num_reverse'] == 1]

df_last.reset_index(inplace=True, drop=True) #reset indexes so the DFs line up with each other
df_first.reset_index(inplace=True, drop=True) #reset indexes so the DFs line up with each other

#test to make sure the indexes match up
count = 0
if len(df_last) == len(df_first):
    for x in range(0, len(df_last)):
        if df_last['customer_ID'].iloc[x] != df_first['customer_ID'].iloc[x]:
            count += 1
else:
    print("length mismatch")
print(count, "mismatches")

if count == 0: #extract customer_ID label and target if they match
    labels = df_last['customer_ID']

0 mismatches


In [163]:
#lets make a simple function for some clean up steps before we combine them
def indenpendent_clean_up(df):
    #make columns for date components
    df["Month"] = df["S_2"].dt.month.astype(np.int8)
    df["Day"] = df["S_2"].dt.day.astype(np.int8)
    df["Year"] = df["S_2"].dt.year.astype(np.int16) #year should fit in int8, but it doens't just changing to int16
    df = df.drop(["S_2"], axis=1)

    cat_columns = df.select_dtypes(include=["string"]).columns
    num_columns = df.select_dtypes(include="number").columns

    # I'm gonna impute nans in categorical variables with the angzarr character because we know nulls have meaning and because the angzarr is amazing
    for col in cat_columns:
        df[col].fillna("⍼", inplace=True)

    #impute using mean for numeric
    for col in num_columns:
        df[col].fillna(df[col].mean(), inplace=True)

    df.drop(inplace=True, columns=['customer_ID','statement_num_reverse'])

    return df

#the dfs match up so lets recombine them
df_first = indenpendent_clean_up(df_first)
df_last = indenpendent_clean_up(df_last)

#add suffix to columns in their respective frames
df_first = df_first.add_suffix('_first')
df_last = df_last.add_suffix('_last')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Month"] = df["S_2"].dt.month.astype(np.int8)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Day"] = df["S_2"].dt.day.astype(np.int8)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"] = df["S_2"].dt.year.astype(np.int16) #year should fit in int8, but it doens't just changing to int16

In [164]:
df_comb = pd.concat([df_first, df_last], axis=1) #combine dfs
df_comb = df_comb.sort_index(axis=1) # sorting by col name so I can visually see the expected lag scenarios

In [165]:
#I think we have enough data to actually dummy code every possible transition of our categorical data
# - for example of course we don't want to treat categorical values as numeric even if they look that way
#   if statement goes from "1.0" to "2.0", I'm going to make a lag value of "1.0-2.0"
#   then we will dummy code all of these transitions
#   this will give the tree model a way to branch on that exact statement transition for the categories

cat_columns = df_comb.select_dtypes(include=["string"]).columns.to_list()
num_columns = df_comb.select_dtypes(include="number").columns.to_list()

#at this point the feature cols for the 2 frames should be identical so we're going to reduce our selection list to these features without the _last or _first

cat_base_columns = get_base_cols(cat_columns)
num_base_columns = get_base_cols(num_columns)

#ok. now we should have a list of base columns that exist for both first and last statement that we can use for selection by simply adding either _first or _last in our column querys

In [166]:
#time for the fun stuff - engineering our lag features

for col in cat_base_columns:
    df_comb[col + "_cat_change"] = df_comb[col + "_first"] + "to" + df_comb[col + "_last"]

for col in num_base_columns:
    df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
    df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]

df_comb = df_comb.sort_index(axis=1) # sorting again, so I can spot check for issues

  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "_first"] - df_comb[col + "_last"]
  df_comb[col + "_num_div"] = df_comb[col + "_first"] / df_comb[col + "_last"]
  df_comb[col + "_num_diff"] = df_comb[col + "

In [167]:
#output seems correct so lets get dummies and fit a model
df_comb = pd.get_dummies(df_comb)

In [168]:
df_comb = sync_cols(train_cols, df_comb)

# Make predictions on the test data
submission_pred = model.predict(df_comb)
submission_proba = model.predict_proba(df_comb)

D_114_cat_change_1.0to⍼ not in pred_df so adding - should always be categorical!
D_117_cat_change_1.0to⍼ not in pred_df so adding - should always be categorical!
D_117_cat_change_2.0to⍼ not in pred_df so adding - should always be categorical!
D_117_cat_change_3.0to⍼ not in pred_df so adding - should always be categorical!
D_117_cat_change_4.0to⍼ not in pred_df so adding - should always be categorical!
D_117_cat_change_5.0to⍼ not in pred_df so adding - should always be categorical!
D_117_cat_change_6.0to⍼ not in pred_df so adding - should always be categorical!
D_120_cat_change_1.0to⍼ not in pred_df so adding - should always be categorical!
D_64_cat_change_-1toO not in pred_df so adding - should always be categorical!
D_64_cat_change_-1toR not in pred_df so adding - should always be categorical!
D_64_cat_change_-1toU not in pred_df so adding - should always be categorical!
D_64_cat_change_-1to⍼ not in pred_df so adding - should always be categorical!
D_64_first_-1 not in pred_df so addi

In [169]:
# Add the labels to the predictions
prediction_output = pd.concat([labels,pd.DataFrame(submission_pred,columns=["pred"]),pd.DataFrame(submission_proba,columns=["proba-inv","proba"])], axis=1)

prediction_output = prediction_output.drop(columns=['proba-inv','pred'])
prediction_output = prediction_output.rename(columns={"proba":"prediction"})

prediction_output.to_csv("lgbm-lag-plusone-statement-output.csv", index=False)

In [None]:
#now lets handle the customers with only 1 statement
#first we're going to train on the full dataset for last statement
#then we'll predict on only the customer last statements

In [170]:
# Read data from parquet file
df = pd.read_parquet(r"../../../amex-default-prediction/train_data.parquet")
#reduce df for development !!!!! comment out line below for final model
#df = df[:100000]

# Set the data types for the columns
df = set_col_types(df)

#engineer statement num
df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8) 
df = df[df["statement_num"] == 1]
df.reset_index(inplace=True)
#engineer date cols
df["Month"] = df["S_2"].dt.month
df["Day"] = df["S_2"].dt.day
df["Year"] = df["S_2"].dt.year
df = df.drop(["S_2"], axis=1)

# Separate target variable and feature columns
target = df["target"]
labels = df['customer_ID']
features = df.drop(["customer_ID", "target"], axis=1)

# Impute missing values using mode for categorical columns and median for numerical columns
cat_columns = features.select_dtypes(include=["string"]).columns
num_columns = features.select_dtypes(include="number").columns

# Replace missing values in the categorical columns with the most frequent value
# for col in cat_columns:
#     features[col].fillna("NA", inplace=True)

# Replace missing values in the numerical columns with the median value
for col in num_columns:
    features[col].fillna(features[col].mean(), inplace=True)

features = pd.get_dummies(features, dummy_na=True)

# Train a LightGBM model on the training data
model = LGBMClassifier()
model.set_params(**{'bagging_fraction': 0.9212862922324698, 'bagging_freq': 6, 'feature_fraction': 0.5071870813127417, 'learning_rate': 0.09393773349524241, 'num_leaves': 82, 'reg_alpha': 0.8316783754122955, 'reg_lambda': 0.1695940899747675})
model.fit(features, target)



In [174]:
# Read test data from parquet file
df = pd.read_parquet(r"cust_one_statement_test.parquet")
#df = df[:100000]

# Set the data types for the columns
df = set_col_types(df)

# Engineer statement num
df['statement_num'] = df.groupby("customer_ID")['S_2'].rank(method='first', ascending=False).astype(np.int8)
df = df[df["statement_num"] == 1]
df.reset_index(inplace=True)
# Engineer date cols
df["Month"] = df["S_2"].dt.month
df["Day"] = df["S_2"].dt.day
df["Year"] = df["S_2"].dt.year
df = df.drop(["S_2"], axis=1)

# Separate labels and feature columns
labels = df['customer_ID']
features_test = df.drop(["customer_ID"], axis=1)

# Impute missing values using mode for categorical columns and median for numerical columns
cat_columns = features_test.select_dtypes(include=["string"]).columns
num_columns = features_test.select_dtypes(include="number").columns

# Replace missing values in the categorical columns with the most frequent value
# for col in cat_columns:
#     features_test[col].fillna("NA", inplace=True)

# Replace missing values in the numerical columns with the median value
for col in num_columns:
    features_test[col].fillna(features_test[col].mean(), inplace=True)

features_test = pd.get_dummies(features_test, dummy_na=True)

features_test = sync_cols(features, features_test)

# Make predictions on the test data
submission_pred_one_statement_pred = model.predict(features_test)
submission_pred_one_statement_proba = model.predict_proba(features_test)

statement_num_reverse not in train_df so dropping


In [177]:
# Add the labels to the predictions
prediction_output_one_statement = pd.concat([labels,pd.DataFrame(submission_pred_one_statement_pred,columns=["pred"]),pd.DataFrame(submission_pred_one_statement_proba,columns=["proba-inv","proba"])], axis=1)

prediction_output_one_statement = prediction_output_one_statement.drop(columns=['proba-inv','pred'])
prediction_output_one_statement = prediction_output_one_statement.rename(columns={"proba":"prediction"})

lightgbm_lag_final_output = pd.concat([prediction_output, prediction_output_one_statement], axis=0)

lightgbm_lag_final_output.to_csv("lgbm-lag-final_output.csv", index=False)