In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

ROOT = Path('/home/sabina.jangirova/Documents/ML703_project/data')

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder

In [2]:
class Pipeline:

    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date,strict=False))
        return df

    def handle_dates(df):
        for col in df.columns: 
                if col.endswith("D"):
                    # Calculate the difference in days between each date column and date_decision
                    df = df.with_columns(
                        (pl.col("date_decision") - pl.col(col)).dt.total_days().alias(col)
                    )
                    df = df.with_columns(pl.col(col).fill_null(np.nan)) 
        # Drop date_decision column
        df = df.drop("date_decision")
#         print(df.dtypes) # for Debugging
        return df

    def filter_cols(df,base_df = None,test=False):
        #for test data
            for col in df.columns:
                if col not in ["target", "case_id", "WEEK_NUM"]:
                    isnull = df[col].is_null().mean()
                    if isnull > 0.7:
                        df = df.drop(col)
            columns_to_drop = []
            for col in df.columns:
                if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.String):
                    freq = df[col].n_unique()
                    if (freq == 1) or (freq > 100):
                        columns_to_drop.append(col)

            df = df.drop(columns_to_drop)
            return df


class Aggregator:
    
    @staticmethod
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)
        return exprs

def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    if depth in [1,2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df)) 
    return df

def read_files(regex_path, depth=None):
    chunks = []
    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
        chunks.append(df)
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
    df_base = df_base.pipe(Pipeline.set_table_dtypes)
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base

def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    return df_data, cat_cols

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                try:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
                except:
                    continue
        else:
            continue
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

In [4]:
%%time
data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2)
    ]
}

CPU times: user 4min 18s, sys: 2min 26s, total: 6min 44s
Wall time: 33.3 s


In [5]:
df = feature_eng(**data_store) # import train data 
print("train data shape:\t", df.shape)
# gc.collect()
# spamming gc.collect praying for memory to not full
gc.collect()
df = df.pipe(Pipeline.filter_cols) # fillter column
gc.collect()
df, cat_cols = to_pandas(df) # tranform to pandas dataframe, easier to work with
gc.collect()
df = reduce_mem_usage(df) # as the name said
gc.collect()
print("train data shape:\t", df.shape)
nums=df.select_dtypes(exclude='category').columns
# IDK what is going on for now
from itertools import combinations, permutations
#df=df[nums]
nans_df = df[nums].isna()
nans_groups={}
for col in nums:
    cur_group = nans_df[col].sum()
    try:
        nans_groups[cur_group].append(col)
    except:
        nans_groups[cur_group]=[col]
del nans_df; x=gc.collect()

def reduce_group(grps):
    use = []
    for g in grps:
        mx = 0; vx = g[0]
        for gg in g:
            n = df[gg].nunique()
            if n>mx:
                mx = n
                vx = gg
            #print(str(gg)+'-'+str(n),', ',end='')
        use.append(vx)
        #print()
    print('Use these',use)
    return use

def group_columns_by_correlation(matrix, threshold=0.8):
    correlation_matrix = matrix.corr()
    groups = []
    remaining_cols = list(matrix.columns)
    while remaining_cols:
        col = remaining_cols.pop(0)
        group = [col]
        correlated_cols = [col]
        for c in remaining_cols:
            if correlation_matrix.loc[col, c] >= threshold:
                group.append(c)
                correlated_cols.append(c)
        groups.append(group)
        remaining_cols = [c for c in remaining_cols if c not in correlated_cols]
    return groups

uses=[]
for k,v in nans_groups.items():
    if len(v)>1:
            Vs = nans_groups[k]
            #cross_features=list(combinations(Vs, 2))
            #make_corr(Vs)
            grps= group_columns_by_correlation(df[Vs], threshold=0.8)
            use=reduce_group(grps)
            uses=uses+use
            #make_corr(use)
    else:
        uses=uses+v
    print('####### NAN count =',k)
print(uses)
print(len(uses))
uses=uses+list(df.select_dtypes(include='category').columns)
print(len(uses))
df=df[uses]

train data shape:	 (1526659, 488)
Memory usage of dataframe is 3313.73 MB
Memory usage after optimization is: 1097.80 MB
Decreased by 66.9%
train data shape:	 (1526659, 344)
Use these ['case_id', 'WEEK_NUM', 'target', 'month_decision', 'weekday_decision', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt_1022L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'isbidproduct_1095L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'max_mainoccupationinc_384A', 'max_birth_259D

## Feature engineering

In [6]:
gc.collect()

0

In [7]:
import networkx as nx
import seaborn as sns
import dowhy.gcm as gcm
from castle.algorithms import PC
from castle.algorithms.pc.pc import find_skeleton
from castle.common import GraphDAG
from castle.metrics import MetricsDAG

2024-05-10 16:14:07,056 - /home/sabina.jangirova/.conda/envs/ai701/lib/python3.8/site-packages/castle/backend/__init__.py[line:36] - INFO: You can use `os.environ['CASTLE_BACKEND'] = backend` to set the backend(`pytorch` or `mindspore`).
2024-05-10 16:14:07,086 - /home/sabina.jangirova/.conda/envs/ai701/lib/python3.8/site-packages/castle/algorithms/__init__.py[line:36] - INFO: You are using ``pytorch`` as the backend.


In [8]:
def plot_graph(input_graph, node_lookup):
    '''
    Function to visualise graphs.

    Args:
        input_graph (array): Adjacency matrix representing graph
        node_lookup (dict): Dictionary containing node names.
    '''
    
    graph = nx.DiGraph(input_graph)

    plt.figure(figsize=(8,8))
    nx.draw(
    G=graph,
    node_color=COLORS[0],
    node_size=8000,
    arrowsize=17,
    with_labels=True,
    labels=node_lookup,
    font_color='white',
    font_size=9,
    pos=nx.circular_layout(graph)
    )

In [9]:
y = df['target']
df = df.drop(columns=["case_id"])

In [10]:
df[cat_cols] = df[cat_cols].astype(str)
import polars as pl
from sklearn.preprocessing import OrdinalEncoder

# Fit Ordinal Encoder on Training Data
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
encoder.fit(df[cat_cols])

# Transform Training Data
df[cat_cols] = encoder.transform(df[cat_cols])
df[cat_cols] = df[cat_cols].fillna(-1)
df[cat_cols] = df[cat_cols].astype(int)

In [11]:
df = df.fillna(0)

In [12]:
np.random.seed(42)
node_lookup = {i: column for i, column in enumerate(df.columns)}
total_nodes = len(node_lookup)

In [13]:
df = df.drop(columns=['target'])

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

In [15]:
weeks = X_train['WEEK_NUM']
X_train = X_train.drop(columns=['WEEK_NUM'])
X_test = X_test.drop(columns=['WEEK_NUM'])

In [16]:
# from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()
features_standardized = scaler.fit_transform(X_train)
n_samples = 500

In [18]:
from joblib import dump, load
import time
from causallearn.search.ConstraintBased.FCI import fci
from causallearn.utils.GraphUtils import GraphUtils
import resource

def using():
    usage=resource.getrusage(resource.RUSAGE_SELF)
    return usage[2]/1024.0

In [19]:
edg_list = load('saved_models/fci/edges.joblib')

In [60]:
components = []
for edges in edg_list:
    components.append([])
    print("--------")
    for e in edges:
        if "X251" in str(e):
            print(str(e))
            components[-1].append(str(e).split(" ")[0][1:])
    generation_back = []
    for e in edges:
        if any(node in str(e) for node in components[-1]):
            print(str(e))
            generation_back.append(str(e).split(" ")[0][1:])
    components[-1] = components[-1] + generation_back
    components[-1] = list(set(components[-1]))

--------
X6 <-> X251
X84 <-> X251
X168 <-> X251
X196 <-> X251
X5 <-> X6
X5 <-> X176
X6 <-> X81
X6 <-> X151
X6 <-> X251
X7 <-> X46
X7 <-> X226
X8 <-> X164
X10 <-> X164
X15 <-> X16
X16 <-> X17
X16 <-> X18
X16 <-> X20
X16 <-> X204
X18 <-> X169
X20 <-> X156
X21 <-> X84
X21 <-> X186
X23 <-> X26
X24 <-> X26
X26 <-> X39
X26 <-> X101
X26 <-> X115
X26 <-> X143
X27 <-> X116
X29 <-> X96
X29 <-> X164
X30 <-> X36
X30 <-> X106
X31 <-> X36
X32 <-> X36
X32 <-> X69
X32 <-> X236
X34 <-> X36
X34 <-> X165
X36 <-> X112
X36 <-> X135
X36 <-> X146
X36 <-> X152
X38 <-> X167
X38 <-> X206
X41 <-> X136
X44 <-> X46
X44 <-> X136
X46 <-> X48
X46 <-> X51
X46 <-> X58
X46 <-> X178
X46 <-> X249
X48 <-> X56
X50 <-> X56
X51 <-> X56
X52 <-> X65
X53 <-> X56
X54 <-> X61
X55 <-> X69
X56 <-> X57
X56 <-> X61
X56 <-> X63
X58 <-> X60
X58 <-> X184
X59 <-> X63
X65 --> X59
X59 <-> X67
X60 <-> X64
X60 <-> X72
X60 <-> X87
X60 <-> X105
X60 <-> X247
X61 <-> X62
X61 <-> X64
X61 <-> X91
X61 <-> X131
X61 <-> X207
X62 <-> X63
X62 <-> X73
X6

In [23]:
# fci_times = []
# fci_memory = []
# graphs = []
# edg_list = []
# for n in range(n_samples, 99, -100):
#     random_indeces = np.random.choice(X_train.shape[0], size=n, replace=False)
#     X_train_part = features_standardized[random_indeces]
#     y_train = np.array(y_train)
#     y_train_part = y_train[random_indeces]
#     y_train_part = y_train_part.reshape((n, 1))
#     graph_data = np.concatenate((X_train_part, y_train_part), axis=1)
#     start_time = time.time()
#     start_memory = using()
#     g, edges = fci(graph_data, independence_test_method="fisherz")
#     fci_memory.append(start_memory - using())
#     fci_times.append(start_time - time.time())
#     graphs.append(g)
#     edg_list.append(edges)
#     pdy = GraphUtils.to_pydot(g)
#     pdy.write_png(f'all_features_graph_{n}_samples.png')
#     gc.collect()
#     print(f"{n} samples finished")

In [50]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.05,
    "n_estimators": 2000,  
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device_type": "cpu", 
    "verbose": 1,
}

In [51]:
ipca = load("saved_models/pca/250_n_components_1_fold_828348114496282_auc.joblib")
features_standardized = ipca.transform(features_standardized)

In [62]:
lbgm_fci_times = []
lgbm_fci_memory = []
cv_scores = []
fitted_models = []

for c in components[:4]:
    print(f"Components:{c}")
    cv = StratifiedGroupKFold(n_splits=5, shuffle=False)
    f = 0
    c = [int(k) for k in c]
    for idx_train, idx_valid in cv.split(features_standardized, y_train, groups=weeks):
        print(f"Fold #{f}")
        X_train_l, y_train_l = features_standardized[idx_train], y_train.iloc[idx_train]
        X_valid, y_valid = features_standardized[idx_valid], y_train.iloc[idx_valid]
        X_train_reduced = X_train_l[:, c]
        X_valid_reduced = X_valid[:, c]
        start_time = time.time()
        start_memory = using()
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train_reduced, y_train_l,
            eval_set = [(X_valid_reduced, y_valid)],
            callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)] )
        fitted_models.append(model)
        lbgm_fci_times.append(start_time - time.time())
        lgbm_fci_memory.append(start_memory - using())
        y_pred_valid = model.predict_proba(X_valid_reduced)[:,1]
        auc_score = roc_auc_score(y_valid, y_pred_valid)
        cv_scores.append(auc_score)
        f+=1
        gc.collect()

Components:['18', '160', '175', '85', '93', '123', '153', '24', '94', '46', '32', '63', '48', '223', '154', '124', '164', '88', '130', '161', '21', '92', '83', '68', '54', '16', '86', '217', '118', '169', '75', '7', '119', '65', '233', '27', '84', '29', '64', '41', '80', '226', '146', '158', '51', '126', '210', '38', '34', '36', '53', '69', '202', '163', '149', '56', '147', '8', '138', '159', '60', '155', '166', '26', '74', '199', '184', '44', '131', '66', '91', '15', '215', '109', '116', '71', '135', '50', '31', '206', '82', '134', '132', '6', '52', '113', '111', '20', '204', '100', '150', '106', '182', '76', '99', '167', '10', '142', '196', '61', '77', '62', '176', '186', '59', '55', '5', '221', '30', '96', '105', '139', '162', '23', '67', '79', '165', '58', '181', '103', '168']
Fold #0
[LightGBM] [Info] Number of positive: 30178, number of negative: 947050
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.096669 seconds.
You can set `force_col_w

In [53]:
print("CV AUC scores: ", cv_scores)
print("Maximum CV AUC score: ", max(cv_scores))

CV AUC scores:  [0.6167224175172841, 0.6192343737176871, 0.6166453354394873, 0.6211125255989607, 0.6108099811852672, 0.5763609808523109, 0.5748252119741343, 0.574092686111206, 0.5795192684021264, 0.5719988401350442, 0.5644166277963943, 0.5612872889201967, 0.5609980424884227, 0.5613329702578147, 0.555736446406293, 0.5572091016232772, 0.5494137299745139, 0.5569395651023228, 0.5560817263373599, 0.5487591766063173]
Maximum CV AUC score:  0.6211125255989607


In [56]:
# from sklearn.metrics import confusion_matrix, classification_report
# from sklearn.model_selection import StratifiedGroupKFold
# import numpy as np

# conf_matrix = None
# class_report = None

# components[0] = [int(k) for k in components[0]]

# # Predict labels for validation data
# y_pred_labels = fitted_models[3].predict(X_test[:, components[0]])

# # Compute confusion matrix
# conf_matrix = confusion_matrix(y_test, y_pred_labels)

# # Compute classification report
# class_report = classification_report(y_test, y_pred_labels)

# # Print confusion matrix and classification report
# print("Confusion Matrix:")
# print(conf_matrix)
# print("\nClassification Report:")
# print(class_report)

InvalidIndexError: (slice(None, None, None), [6, 84, 168, 196])

In [63]:
from joblib import dump, load
dump(lbgm_fci_times, "./saved_models/fci/1_more_generation_lbgm_fci_times.joblib")
dump(lgbm_fci_memory, "./saved_models/fci/1_more_generation_lgbm_fci_memory.joblib")
dump(cv_scores, "./saved_models/fci/1_more_generation_cv_scores.joblib")
dump(fitted_models, "./saved_models/fci/1_more_generation_fitted_models.joblib")

['./saved_models/fci/1_more_generation_fitted_models.joblib']

In [27]:
# from sklearn.metrics import confusion_matrix, classification_report
# from sklearn.model_selection import StratifiedGroupKFold
# import numpy as np

# conf_matrix = None
# class_report = None
# X_test = scaler.transform(X_test)
# X_test = ipcas[1].transform(X_test)

# # Predict labels for validation data
# y_pred_labels = fitted_models[1].predict(X_test)

# # Compute confusion matrix
# conf_matrix = confusion_matrix(y_test, y_pred_labels)

# # Compute classification report
# class_report = classification_report(y_test, y_pred_labels)

# # Print confusion matrix and classification report
# print("Confusion Matrix:")
# print(conf_matrix)
# print("\nClassification Report:")
# print(class_report)

In [29]:
# n_components = 250
# counter = 0
# for auc, ipca in zip(cv_scores, ipcas):
#     if counter == 5:
#         counter = 0
#         n_components -= 50
#     auc = str(auc)
#     auc = auc.split('.')[1]
#     filename = f"{n_components}_n_components_{counter}_fold_{auc}_auc.joblib"
#     dump(ipca, "./saved_models/pca/"+filename)
#     counter += 1

In [30]:
# ipca = IncrementalPCA(n_components=300)
# principalComponents = ipca.fit_transform(features_standardized)
# principalDf = pd.DataFrame(data = principalComponents,
#                            columns = ['PC' + str(i) for i in range(1, ipca.n_components_ + 1)])
# finalDf = pd.concat([principalDf, df_train[['target']]], axis=1)

In [31]:
# print("Original number of features:", features.shape[1])
# print("Reduced number of features:", principalDf.shape[1])

# # Display some of the data
# print(finalDf.head())

In [32]:
# from causallearn.search.ConstraintBased.PC import PC
# from causallearn.utils.GraphUtils import GraphUtils

In [33]:
# data = df_train_part.iloc[:,:].values

In [34]:
# g, edges = fci(data, independence_test_method="chisq", verbose=True, show_progress=True)

In [35]:
# pdy = GraphUtils.to_pydot(g)
# pdy.write_png('graph.png')

In [36]:
# pc = PC(variant="stable")
# pc.learn(df_train_part)
# graph_pred = pc.causal_matrix

# graph_pred

In [37]:
# y = df_train["target"]
# weeks = df_train["WEEK_NUM"]
# df_train= df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
# cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

In [38]:
# df_train[cat_cols] = df_train[cat_cols].astype(str)
# import polars as pl
# from sklearn.preprocessing import OrdinalEncoder


# # Fit Ordinal Encoder on Training Data
# encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
# encoder.fit(df_train[cat_cols])

# # Transform Training Data
# df_train[cat_cols] = encoder.transform(df_train[cat_cols])
# df_train[cat_cols] = df_train[cat_cols].fillna(-1)
# df_train[cat_cols] = df_train[cat_cols].astype(int)


In [39]:
# test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
#     pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
#     (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
# )

# test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
#     pl.col("num_group1") == 0
# ).drop("num_group1").rename({"housetype_905L": "person_housetype"})

# test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
#     pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
#     (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
# )

# data_submission = test_basetable.join(
#     test_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
# ).join(
#     test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
# ).join(
#     test_person_1_feats_1, how="left", on="case_id"
# ).join(
#     test_person_1_feats_2, how="left", on="case_id"
# ).join(
#     test_credit_bureau_b_2_feats, how="left", on="case_id"
# )

In [40]:
# case_ids = data["case_id"].unique().shuffle(seed=1)
# case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
# case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

# cols_pred = []
# for col in data.columns:
#     if col[-1].isupper() and col[:-1].islower():
#         cols_pred.append(col)

# print(cols_pred)

# def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
#     return (
#         data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas(),
#         data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
#         data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
#     )

# base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
# base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
# base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

# for df in [X_train, X_valid, X_test]:
#     df = convert_strings(df)

In [41]:
# print(f"Train: {X_train.shape}")
# print(f"Valid: {X_valid.shape}")
# print(f"Test: {X_test.shape}")

## Training LightGBM

Minimal example of LightGBM training is shown below.

In [42]:
# lgb_train = lgb.Dataset(X_train, label=y_train)
# lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

# params = {
#     "boosting_type": "gbdt",
#     "objective": "binary",
#     "metric": "auc",
#     "max_depth": 3,
#     "num_leaves": 31,
#     "learning_rate": 0.05,
#     "feature_fraction": 0.9,
#     "bagging_fraction": 0.8,
#     "bagging_freq": 5,
#     "n_estimators": 1000,
#     "verbose": -1,
# }

# gbm = lgb.train(
#     params,
#     lgb_train,
#     valid_sets=lgb_valid,
#     callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
# )

Evaluation with AUC and then comparison with the stability metric is shown below.

In [43]:
# for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
#     y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
#     base["score"] = y_pred

# print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}') 
# print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}') 
# print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')  

In [44]:
# def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
#     gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
#         .sort_values("WEEK_NUM")\
#         .groupby("WEEK_NUM")[["target", "score"]]\
#         .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
#     x = np.arange(len(gini_in_time))
#     y = gini_in_time
#     a, b = np.polyfit(x, y, 1)
#     y_hat = a*x + b
#     residuals = y - y_hat
#     res_std = np.std(residuals)
#     avg_gini = np.mean(gini_in_time)
#     return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

# stability_score_train = gini_stability(base_train)
# stability_score_valid = gini_stability(base_valid)
# stability_score_test = gini_stability(base_test)

# print(f'The stability score on the train set is: {stability_score_train}') 
# print(f'The stability score on the valid set is: {stability_score_valid}') 
# print(f'The stability score on the test set is: {stability_score_test}')  