In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PowerTransformer
from xgboost import XGBClassifier

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# load the data
data = pd.read_csv('Portfolio Data.csv')

# Split the dataset
X = data.drop(columns=['Target'])
y = data['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Data processing functions
def process_data(df):
    df = df.copy()  

    # Convert date columns
    df['CurrentDate'] = pd.to_datetime(df['CurrentDate'], errors='coerce')
    selected_columns = ['Payment2', 'Payment4', 'AccountDetail8', 'AccountDetail2']
    for col in selected_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        time_diff = df['CurrentDate'] - df[col]
        df[col] = time_diff.dt.days

    # Binarization
    selected_columns = ['AccountStatus2', 'PrevAccountDetail1']
    for col in selected_columns:
        if col in df.columns:
            df[col] = df[col].notna().astype(int)

    # Mean value calculation
    if 'PrevBalance1' in df.columns and 'Balance1' in df.columns:
        df['Balance1'] = (df['PrevBalance1'] + df['Balance1']) / 2

    # Fill in missing values
    df['HistoricalAccountActivity7'].fillna(1, inplace=True)
    df['HistoricalAccountStatus14'].fillna(2, inplace=True)

    # Delete extraneous columns
    df.drop(columns=['CurrentDate', 'UniqueID', 'PrevBalance1'], inplace=True, errors='ignore')

    # Populate specified columns with column averages
    selected_columns = ['AccountDetail3', 'AccountDetail7', 'AccountActivity4', 'AccountActivity5', 
                        'AccountActivity6', 'Balance1', 'Balance2', 'Payment1', 
                        'HistoricalAccountActivity1', 'HistoricalAccountActivity2', 
                        'HistoricalAccountActivity3', 'HistoricalAccountActivity4', 
                        'HistoricalAccountActivity5', 'HistoricalAccountActivity6', 
                        'HistoricalAccountDetail1', 'HistoricalAccountDetail2', 
                        'HistoricalAccountDetail3', 'HistoricalAccountStatus1', 
                        'HistoricalAccountStatus10', 'HistoricalBalance1', 'Payment3']
    df[selected_columns] = df[selected_columns].apply(lambda col: col.fillna(col.mean()), axis=0)

    # Populate other columns with plurals
    exclude_columns = selected_columns + ['HistoricalAccountActivity7', 'HistoricalAccountStatus14']
    columns_to_fill = [col for col in df.columns if col not in exclude_columns]
    for col in columns_to_fill:
        if col in df.columns:
            most_frequent_value = df[col].mode()[0]
            df[col] = df[col].fillna(most_frequent_value)

    # Column-specific value replacement
    if 'AccountDetail5' in df.columns:
        df['AccountDetail5'] = df['AccountDetail5'].apply(lambda x: 1 if x == 'X' else 0)
    if 'AccountDetail6' in df.columns:
        df['AccountDetail6'] = df['AccountDetail6'].apply(lambda x: 1 if x == 'X' else 0)

    # dummy transfirmation
    tr_col = ['PrevAccountStatus2', 'AccountStatus1']
    if all(col in df.columns for col in tr_col):
        df[tr_col] = df[tr_col].applymap(
            lambda x: 1 if x == 'X' else 2 if pd.isna(x) or x == '' else 3 if x == 'D' 
            else 4 if x == 'N' else 0 if x == 'O' else 5 if x == 'C' else x
        )

    tr_col1 = ['PrevAccountStatus1']
    if all(col in df.columns for col in tr_col1):
        df[tr_col1] = df[tr_col1].applymap(
            lambda x: 1 if x == 'A' else 0 if pd.isna(x) or x == '' else 2 if x == 'E' 
            else 3 if x == 'I' else 4 if x == 'C' else 5 if x == 'B' 
            else 6 if x == 'Z' else 7 if x == 'F' else x
        )

    return df

# Process training and test sets separately
X_train_processed = process_data(X_train)
X_test_processed = process_data(X_test)

# Confirmation of processed data
print(X_train_processed.head())
print(X_test_processed.head())


        AccountActivity1  AccountActivity2  AccountDetail1  AccountDetail2  \
79000                  0               0.0               1             854   
222163                 0               0.0               1             153   
147313                 0               0.0               1            1035   
201668                 0               0.0               1             120   
143237                 0               0.0               1             823   

        AccountDetail3  AccountDetail4  AccountDetail5  AccountDetail6  \
79000              600               0               1               1   
222163             350               0               1               1   
147313             600               7               1               1   
201668             350               0               1               1   
143237             750               0               1               1   

        AccountDetail7  AccountStatus1  ...  HistoricalAccountStatus9  \
79000        

In [4]:
missing_columns = X_train_processed.isna().any()
print(missing_columns)

AccountActivity1              False
AccountActivity2              False
AccountDetail1                False
AccountDetail2                False
AccountDetail3                False
AccountDetail4                False
AccountDetail5                False
AccountDetail6                False
AccountDetail7                False
AccountStatus1                False
AccountStatus2                False
PrevAccountDetail1            False
PrevAccountStatus1            False
PrevAccountStatus2            False
AccountActivity3              False
AccountActivity4              False
AccountActivity5              False
AccountActivity6              False
AccountActivity7              False
Balance1                      False
Balance2                      False
Payment1                      False
AccountDetail8                False
HistoricalAccountActivity1    False
HistoricalAccountActivity2    False
HistoricalAccountActivity3    False
HistoricalAccountActivity4    False
HistoricalAccountActivity5  

In [None]:
pipeline = Pipeline(steps=[
    ('model', LogisticRegression(max_iter=100))
])

param_grid = {
    'model__C': [0.01, 0.1, 1]
}

grid = GridSearchCV(pipeline, param_grid, scoring=auroc_scorer, cv=5, error_score='raise')
grid.fit(X_train_processed, y_train)

In [None]:
print("Best parameters: {}".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test_processed, y_test)))

In [None]:
auroc_scorer = make_scorer(roc_auc_score, needs_proba=True)

# Create a PowerTransformer (Yeo-Johnson)
power_transformer = PowerTransformer(method='yeo-johnson')

# Transform using ColumnTransformer
ct = ColumnTransformer(
    [('yeo_johnson', power_transformer, [
        'AccountDetail3', 'AccountDetail7', 'AccountActivity4', 'AccountActivity5',
        'AccountActivity6', 'Balance1', 'Balance2', 'Payment1',
        'HistoricalAccountActivity1', 'HistoricalAccountActivity2',
        'HistoricalAccountActivity3', 'HistoricalAccountActivity4',
        'HistoricalAccountActivity5', 'HistoricalAccountActivity6',
        'HistoricalAccountDetail1', 'HistoricalAccountDetail2',
        'HistoricalAccountDetail3', 'HistoricalAccountStatus1',
        'HistoricalAccountStatus10', 'HistoricalBalance1', 'Payment3'])],
    remainder='passthrough')


pipeline = Pipeline([ 
    ('transform', ct),
    ('model', LogisticRegression(max_iter=100))
])


param_grid = {
    'model__C': [0.01, 0.1, 1]
}


grid = GridSearchCV(pipeline, param_grid, scoring=auroc_scorer, cv=5, error_score='raise')
grid.fit(X_train_processed, y_train)

In [16]:

auroc_scorer = make_scorer(roc_auc_score, needs_proba=True)

power_transformer = PowerTransformer(method='yeo-johnson')

ct = ColumnTransformer(
    [('yeo_johnson', power_transformer, [
        'AccountDetail3', 'AccountDetail7', 'AccountActivity4', 'AccountActivity5',
        'AccountActivity6', 'Balance1', 'Balance2', 'Payment1',
        'HistoricalAccountActivity1', 'HistoricalAccountActivity2',
        'HistoricalAccountActivity3', 'HistoricalAccountActivity4',
        'HistoricalAccountActivity5', 'HistoricalAccountActivity6',
        'HistoricalAccountDetail1', 'HistoricalAccountDetail2',
        'HistoricalAccountDetail3', 'HistoricalAccountStatus1',
        'HistoricalAccountStatus10', 'HistoricalBalance1', 'Payment3'])],
    remainder='passthrough')


param_grid = [
    {
        # Logistic Regression
        'transform': [ct],
        'preprocessing': [StandardScaler()],
        'model': [LogisticRegression(max_iter=200, solver='liblinear')],
        'model__C': [0.001, 0.01, 0.1],
        'model__penalty': ['l1', 'l2'],  
        'model__class_weight': [None, 'balanced']  
    },

    {   
        # Random Forest
        'transform': [None],
        'preprocessing': [None],
        'model': [RandomForestClassifier(random_state=42)],
        'model__n_estimators': [50, 100, 200],  
        'model__max_depth': [None, 5, 10],  
    },

   {   
        # XGBoost
        'transform': [None],
        'preprocessing': [None],
        'model': [XGBClassifier(objective='binary:logistic', random_state=42)],
        'model__learning_rate': [0.01, 0.2, 0.5],
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [3, 5, 7]
    }
]



pipeline = Pipeline(steps=[
    ('transform', ct),
    ('preprocessing', StandardScaler()),
    ('feature_selector', SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='median')),
    ('model', LogisticRegression()) 
])


grid = GridSearchCV(pipeline, param_grid, scoring=auroc_scorer, cv=5, error_score='raise')

grid.fit(X_train_processed, y_train)



In [17]:
print("Best parameters: {}".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test_processed, y_test)))

Best parameters: {'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...), 'model__learning_rate': 0.2, 'model__max_depth': 5, 'model__n_estimators': 200, 'preprocessing': None, 'transform': None}
Best cross-validation score: 0.82
Test-set score: 0.82


In [18]:
#rank the models
cv_results = grid.cv_results_
models_auroc = {}
for i, params in enumerate(cv_results['params']):
    model_name = type(params['model']).__name__  
    mean_auroc = cv_results['mean_test_score'][i]  
    if model_name not in models_auroc:
        models_auroc[model_name] = []
    models_auroc[model_name].append(mean_auroc)

model_rankings = {model: max(scores) for model, scores in models_auroc.items()}

sorted_models = sorted(model_rankings.items(), key=lambda x: x[1], reverse=True)

print("Model Performance Ranking Based on AUROC:")
for rank, (model, score) in enumerate(sorted_models, 1):
    print(f"{rank}. {model}: {score:.4f}")


Model Performance Ranking Based on AUROC:
1. XGBClassifier: 0.8231
2. RandomForestClassifier: 0.8211
3. LogisticRegression: 0.8026
