In [1]:
from mlc.cashflow import ScorableModelTemplate, compute_score
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import os

# Standard library
import warnings
import ast
from pathlib import Path
from datetime import timedelta
from abc import ABC, abstractmethod
from collections import Counter

# Third-party libraries
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, skew, mannwhitneyu, kurtosis, entropy
from joblib import Parallel, delayed
import xgboost as xgb

# scikit-learn
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    RepeatedStratifiedKFold,
    GridSearchCV,
    cross_val_score
)
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    make_scorer
)

from Final_workflow.c04_script import model_4
from Final_workflow.c02_script import model_2
from Final_workflow.c03 import features_c03, c03_predictions
from Final_workflow.c01_script import model_1

# Silence every warning category for the whole process. Note that this also
# hides pandas chained-assignment warnings and deprecation notices.
warnings.filterwarnings("ignore")


import sys
# Working directory at the moment this cell runs; not referenced again in the
# cells visible here.
init_dir = os.getcwd()

class ScorableModel(ScorableModelTemplate):
    """Scoring entry point that routes each consumer to a per-category model.

    The category is the third character of ``masked_consumer_id``. Categories
    '1', '2' and '4' are scored by ``model_1`` / ``model_2`` / ``model_4``;
    category '3' is scored by ``features_c03`` followed by ``c03_predictions``.
    """

    def predict(self, raw_consumer_file: str, raw_transactions_file: str):
        """Predict one score per row of the consumer file.

        :param raw_consumer_file: path to consumer_data.parquet
        :param raw_transactions_file: path to transactions.parquet
        :return: numpy array of ``y_pred`` values in the row order of the
            consumer file; consumers for which no model produced a value get
            the neutral score 0.5
        """
        # Fallback used whenever a consumer has no model output.
        neutral_score = 0.5

        loader = self.process_inputs(raw_consumer_file, raw_transactions_file)
        consumer_df = pd.read_parquet(raw_consumer_file)
        placeholder = consumer_df[['masked_consumer_id']]

        # Derive the category key once instead of recomputing it in every branch.
        consumer_category = consumer_df['masked_consumer_id'].str[2]
        transaction_category = loader['masked_consumer_id'].str[2]

        # Transaction-driven models, normalised to one (transactions, consumers)
        # call shape; model_4 does not take the consumer frame.
        transaction_models = {
            '1': model_1,
            '2': model_2,
            '4': lambda transactions, _consumers: model_4(transactions),
        }

        predictions = []
        for category in ('1', '2', '3', '4'):
            consumers = consumer_df[consumer_category == category]
            if consumers.empty:
                continue

            if category == '3':
                # features_c03 receives the full processed frame and the raw
                # consumer path, exactly as before.
                features = features_c03(loader, raw_consumer_file)
                # .copy() so that adding y_pred writes to an independent frame.
                c03_results = features[['masked_consumer_id']].copy()
                c03_results['y_pred'] = c03_predictions(features.drop(columns=['masked_consumer_id']))
                predictions.append(c03_results)
                continue

            # The model result is read through .index / .values, i.e. it is
            # expected to be indexed by masked_consumer_id.
            pred = transaction_models[category](loader[transaction_category == category], consumers)
            pred_df = pd.DataFrame({'masked_consumer_id': pred.index, 'y_pred': pred.values})
            merged = consumers[['masked_consumer_id']].merge(pred_df, on='masked_consumer_id', how='left')
            # Only the prediction column needs the fallback value.
            merged['y_pred'] = merged['y_pred'].fillna(neutral_score)
            predictions.append(merged)

        if not predictions:
            # pd.concat raises on an empty list; no consumer belongs to a
            # modelled category, so everyone gets the neutral score.
            return np.full(len(placeholder), neutral_score)

        all_pred = pd.concat(predictions, ignore_index=True)
        end_result = placeholder.merge(all_pred, on='masked_consumer_id', how='left')
        # Consumers missing from every per-category result would otherwise be NaN.
        return end_result['y_pred'].fillna(neutral_score).values

    def process_inputs(self, raw_consumer_file: str, raw_transactions_file: str):
        """Build the transaction-level frame shared by the per-category models.

        :param raw_consumer_file: path to the consumer parquet file
        :param raw_transactions_file: path to the transactions parquet file
        :return: DataFrame of consumer rows left-joined with their transactions,
            restricted to rows where ``posted_date`` is before
            ``evaluation_date``, with the added columns ``evaluation_day``,
            ``posted_day``, ``loan_category`` and ``converted_date``
        """
        # Read in files
        df_consumer = pd.read_parquet(raw_consumer_file)
        df_transactions = pd.read_parquet(raw_transactions_file)

        # Merge and keep only transactions posted before the evaluation date.
        # .copy() makes the result an independent frame so the column
        # assignments below do not write into a slice of merged_df.
        merged_df = df_consumer.merge(df_transactions, on="masked_consumer_id", how='left')
        is_before_evaluation = merged_df["posted_date"] < merged_df["evaluation_date"]
        filtered_df = merged_df[is_before_evaluation].copy()

        # Clean evaluation_date and create evaluation_day for day of week of evaluation_date
        filtered_df['evaluation_date'] = pd.to_datetime(filtered_df['evaluation_date'])
        filtered_df['evaluation_day'] = filtered_df['evaluation_date'].dt.dayofweek

        # Clean posted_date and create posted_day for day of week of posted_date
        filtered_df['posted_date'] = pd.to_datetime(filtered_df['posted_date'])
        filtered_df['posted_day'] = filtered_df['posted_date'].dt.dayofweek

        # Get loan category from masked_consumer_id
        filtered_df['loan_category'] = filtered_df['masked_consumer_id'].str[2].astype(int)

        # Hours between the latest posted_date in the frame and each row's
        # posted_date. Series.max() is vectorised and, unlike the builtin max(),
        # does not raise when the frame is empty.
        latest_posted = filtered_df['posted_date'].max()
        filtered_df['converted_date'] = (latest_posted - filtered_df['posted_date']).dt.total_seconds() / 3600

        return filtered_df


    
    
# Initialize the model; construction runs __check_rep__ to validate the class
model = ScorableModel() # an error is raised here if the class above is not implemented correctly

In [2]:
# load_test_case() returns a pair; only the second element (the expected
# output) is kept here.
_, true_output = model.load_test_case()
# Run the full prediction pipeline on the parquet files under mlc/test_data.
predicted_output = model.predict("mlc/test_data/consumer_data.parquet", "mlc/test_data/transactions.parquet")

In [3]:
# Score the predictions against the expected output with compute_score
# (imported from mlc.cashflow).
score = compute_score(true_output, predicted_output)
print(score)

1.0


In [4]:
def compute_score_2(df_consumer: pd.DataFrame, y_pred: np.ndarray):
    """Compute one ROC AUC per consumer group.

    The group is the first three characters of ``masked_consumer_id``.

    :param df_consumer: frame with ``masked_consumer_id`` and ``FPF_TARGET``
        columns; it is not modified
    :param y_pred: predicted scores, one per row of ``df_consumer``
    :return: list of ROC AUC values, one per group, in sorted group order
    """
    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns and the cell can be re-run safely.
    scored = df_consumer.assign(
        group_id=df_consumer['masked_consumer_id'].str[:3],
        y_pred=y_pred,
    )
    return [
        roc_auc_score(group["FPF_TARGET"], group["y_pred"])
        for _, group in scored.groupby('group_id')
    ]

# Per-group ROC AUC for the test case; true_output is passed as the consumer
# frame, so it must carry masked_consumer_id and FPF_TARGET columns.
compute_score_2(true_output, predicted_output)

[1.0]