In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

In [1]:
def prepare_data(data, is_train=True, medians=None, sector_aggregations=None):
    """
    Function to prepare and transform the financial data for machine learning.
    
    Parameters:
    data (pd.DataFrame): The input data to be prepared.
    is_train (bool): Flag to indicate if the data is training data. Default is True.
    medians (pd.Series): Precomputed median values for the columns. Default is None.
    sector_aggregations (pd.DataFrame): Precomputed sector-based aggregations. Default is None.
    
    Returns:
    pd.DataFrame: The prepared and transformed data.
    pd.Series: The median values used for filling missing values (only when is_train=True).
    pd.DataFrame: The sector-based aggregations (only when is_train=True).
    """
    
    data.replace("NA", -999, inplace=True)
    
    data.replace("", float("nan"), inplace=True)
    
    if is_train:
        medians = data.median()
    
    data.fillna(medians, inplace=True)
    
    data['Debt_to_Equity'] = data['Debt - Total to EBITDA, TTM'] / data['Net Debt to Total Equity']
    data['Current_Ratio_Diff'] = data['Current Ratio'] - data['Quick Ratio']
    
    # feature interactions
    data['ROA_ROE_interaction'] = data['Return on Average Total Assets - %, TTM'] * data['Return on Average Common Equity - %, TTM']
    data['Debt_to_Equity_interaction'] = data['Debt - Total to EBITDA, TTM'] * data['Net Debt to Total Equity']
    data['EBITDA_NetMargin_interaction'] = data['EBITDA Margin - %, TTM'] * data['Net Margin - %, TTM']
    data['PE_DividendYield_interaction'] = data['Price to EBITDA per Share, TTM'] * data['Dividend Yield - Common - Net - Issue - %, TTM']
    data['Operating_GrossMargin_interaction'] = data['Operating Margin - %, TTM'] * data['Gross Profit Margin - %, TTM']
    
    # log transform for skewed features
    skewed_features = ['Return on Average Total Assets - %, TTM', 'Return on Average Common Equity - %, TTM']
    for feature in skewed_features:
        data[f'{feature}_log'] = np.log1p(data[feature])
    
    # sector-based aggregations
    if is_train:
        sector_aggregations = data.groupby('Industry sector').agg({
            'Return on Average Total Assets - %, TTM': ['mean', 'std'],
            'Return on Average Common Equity - %, TTM': ['mean', 'std'],
        }).reset_index()
        
        sector_aggregations.columns = ['_'.join(col).strip() for col in sector_aggregations.columns.values]
        sector_aggregations.rename(columns={'Industry sector_':'Industry sector'}, inplace=True)
    
    # Merge the sector aggregations back to the original data
    data = data.merge(sector_aggregations, on='Industry sector', how='left')
    data.loc[:,"Industry sector"] = data["Industry sector"].astype("category")
    
    return data, medians, sector_aggregations




def custom_loss_function(y_true, y_pred):
    cost_matrix = np.array([
        [0, 1, 2],
        [1, 0, 1],
        [2, 1, 0]
    ])
    conf_matrix = confusion_matrix(y_true, y_pred)
    error = np.sum(conf_matrix * cost_matrix) / len(y_true)
    return error

custom_metric = {
    'name': 'custom_loss', 
    'score_func': custom_loss_function, 
    'greater_is_better': False
}


In [48]:
raw = pd.read_csv("data/training_data.csv", sep=";", decimal=",")
raw_test = pd.read_csv("data/test_data_no_target.csv", sep=";", decimal=",")

col_dictionary = pd.read_csv("data/column_names_dictionary.csv", sep=";")
name_mapping = col_dictionary.set_index('CODE')['INDICATOR NAME'].to_dict()

raw = raw.rename(columns=name_mapping)
raw_test = raw_test.rename(columns=name_mapping)
data, medians, sector_aggregations = prepare_data(raw, is_train=True)
test_data_prepared, _, _ = prepare_data(raw_test, is_train=False, medians=medians, sector_aggregations=sector_aggregations)

X_train = data.drop(columns=['Class', 'Perform'])
y_train = data['Class']
class_mapping = {-1: 0, 0: 1, 1: 2}
y_train_mapped = y_train.map(class_mapping)

X_test = test_data_prepared.drop(columns=['Class', 'Perform'], errors='ignore')

In [49]:
X_train.info(max_cols=140)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 0 to 7999
Data columns (total 130 columns):
 #    Column                                                                                Non-Null Count  Dtype   
---   ------                                                                                --------------  -----   
 0    Industry sector                                                                       8000 non-null   category
 1    Return on Average Total Assets - %, TTM                                               8000 non-null   float64 
 2    Return on Average Common Equity - %, TTM                                              8000 non-null   float64 
 3    EBITDA Percentage of Common Equity, TTM                                               8000 non-null   float64 
 4    EBITDA Percentage of Total Fixed Assets - Net, TTM                                    8000 non-null   float64 
 5    Excess Cash Margin - %                                             

In [60]:
from utils import CustomCMLoss

def custom_objective(y_true, y_pred):
    # Convert the predictions to probabilities
    loss = CustomCMLoss()
    grad, hess = loss(y_true, y_pred)
    
    return grad, hess

In [64]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score


# Create a scorer using the custom metric
custom_scorer = make_scorer(custom_loss_function)

xgb_model = XGBClassifier( enable_categorical=True, objective=custom_objective)
# Perform cross-validation
cv_scores = cross_val_score(xgb_model, X_train, y_train_mapped, cv=5, scoring=custom_scorer)

# Print cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))

Cross-validation scores: [1.083125 1.084375 1.084375 1.084375 1.08375 ]
Mean cross-validation score: 1.084


In [51]:
cv_scores, np.m

array([0.911875, 0.888125, 0.899375, 0.8775  , 0.8875  ])

In [54]:
xgb_model.fit(X_train, y_train_mapped)

In [55]:
y_pred = xgb_model.predict(X_train)