In [2]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import KFold
from net_archs import LSTMModel
import statistics
from sklearn.metrics import mean_squared_error,  roc_auc_score, f1_score
import datetime

In [3]:
#Load labels
sp_ratings = pd.read_excel('./dataset/labels.xlsx', sheet_name='sp-ratings')
label2rating = {
    0: 'AAA',
    1: 'AA+',
    2: 'AA',
    3: 'AA-',
    4: 'A+',
    5: 'A',
    6: 'A-',
    7: 'BBB+',
    8: 'BBB',
    9: 'BBB-',
    10: 'BB+',
    11: 'BB',
    12: 'BB-',
    13: 'B+',
    14: 'B',
    15: 'B-',
    16: 'CCC+',
    17: 'CCC',
    18: 'CCC-',
    19: 'CC',
    20: 'C',
    21: 'D',
}
rating2label = dict((v,k) for k,v in label2rating.items())
rating2label['SD'] = 21

In [4]:
# Select appropriate quarters
q4_ratings = sp_ratings[['2014Q4', '2015Q4', '2016Q4']]
q4_labels = q4_ratings.replace(rating2label)
q4_labels[~q4_labels['2015Q4'].isin(label2rating.keys())]

print(q4_labels.head())

   2014Q4  2015Q4  2016Q4
0      13      12      12
1       6       6       6
2       4       4       4
3      10      10      10
4       5       5       6


In [5]:
# Load features
df_14q4 = pd.read_csv('./dataset/2014Q4.csv')
df_14q4.fillna(df_14q4.mean(), inplace=True)

df_15q4 = pd.read_csv('./dataset/2015Q4.csv')
df_15q4.fillna(df_15q4.mean(), inplace=True)

df_16q4 = pd.read_csv('./dataset/2016Q4.csv')
df_16q4.fillna(df_16q4.mean(), inplace=True)

In [6]:
# # Transform to time series
# from sklearn.model_selection import KFold

df_14q4_mod = df_14q4.drop(columns=['year', 'Total debt/total asset', 'Earnings Per Share from Operations', 'total asset/total libiilities', 'gross profit/rev', 'EBTI/total asset', 'Book Value Per Share'], inplace=False, errors='ignore')
df_15q4_mod = df_15q4.drop(columns=['year', 'Total debt/total asset', 'Earnings Per Share from Operations', 'total asset/total libiilities', 'gross profit/rev', 'EBTI/total asset', 'Book Value Per Share'], inplace=False, errors='ignore')
df_16q4_mod = df_16q4.drop(columns=['year', 'Total debt/total asset', 'Earnings Per Share from Operations', 'total asset/total libiilities', 'gross profit/rev', 'EBTI/total asset', 'Book Value Per Share'], inplace=False, errors='ignore')
df_14q4_mod = (df_14q4_mod-df_14q4_mod.mean())/df_14q4_mod.std()
df_15q4_mod = (df_15q4_mod-df_15q4_mod.mean())/df_15q4_mod.std()
df_16q4_mod = (df_16q4_mod-df_16q4_mod.mean())/df_16q4_mod.std()
df_14q4_mod['label'] = q4_labels['2014Q4']
df_15q4_mod['label'] = q4_labels['2015Q4']
df_16q4_mod['label'] = q4_labels['2015Q4']

ts14 = torch.tensor(df_14q4_mod.values.astype('float32'))
ts15 = torch.tensor(df_15q4_mod.values.astype('float32'))
ts16 = torch.tensor(df_16q4_mod.values.astype('float32'))
ts = torch.stack([ts14, ts15, ts16], 1)

ts_labels = torch.tensor(q4_labels['2016Q4'])

In [7]:
def ordinal_criterion(predictions, targets):
  # Ordinal regression with encoding as in https://arxiv.org/pdf/0704.1028.pdf

  # Create out modified target with [batch_size, num_labels] shape
  modified_target = torch.zeros_like(predictions)

  # Fill in ordinal target function, i.e. 0 -> [1,0,0,...]
  for i, target in enumerate(targets):
    modified_target[i, 0:int(target)+1] = 1

  return nn.MSELoss(reduction='none')(predictions, modified_target).sum(axis=1)

In [8]:
# Define train and test function
def train(net, optimizer, x, y, x_test, y_test, num_epoch=2, batch_size=16, criterion=ordinal_criterion):
  print_every = 10

  for n in range(num_epoch):
    # Mini batch sgd
    permutation = torch.randperm(x.size()[0])
    for i in range(0, x.size()[0], batch_size):
      indices = permutation[i:i+batch_size]
      x_mini, y_mini = x[indices], y[indices]
      y_pred = net(x_mini)
      loss = criterion(y_pred.float(), y_mini.float()) # default criterion = ordinal_criterion
      optimizer.zero_grad()
      loss.mean().backward()
      optimizer.step()

    if (n + 1) % print_every == 0:
      mse, _, auc, _ = test(net, x_test, y_test)
      print(f'Epoch: {n + 1}, Loss: {loss.sum()}, MSE {mse} AUC {auc}')




def prediction2label(pred: np.ndarray):
  """Convert ordinal predictions to class labels, e.g.

  [0.9, 0.1, 0.1, 0.1] -> 0
  [0.9, 0.9, 0.1, 0.1] -> 1
  [0.9, 0.9, 0.9, 0.1] -> 2
  etc.
  """
  return (pred > 0.5).cumprod(axis=1).sum(axis=1) - 1

def test(net, x_test, y_test):
  y_pred = net(x_test)
  if y_pred.shape[1] == 21:
    y_pred = prediction2label(net(x_test))
  else:
    y_pred = torch.clip(y_pred.round(), min=0, max=21)

  y_default_test = np.where(y_test.detach().numpy() > 9, 1, 0)
  y_default_pred = np.where(y_pred.detach().numpy() > 9, 1, 0)

  #print(y_default_pred)
  mse = mean_squared_error(y_test.detach().numpy(), y_pred.detach().numpy())
  mse_norm = mean_squared_error(y_test.detach().numpy() / 21, y_pred.detach().numpy() / 21)
  auc = roc_auc_score(y_default_test, y_default_pred)
  f1 = f1_score(y_default_test, y_default_pred)
  #print(f'auc {auc}  mse {mse}')
  return mse, mse_norm, auc, f1


In [9]:
from sklearn.model_selection import StratifiedKFold

def fl_split(split, x_train, y_train, test=False):
    x_eq = []
    y_eq = []

    n_splits = 50
    if test==False:
        skf = KFold(n_splits=n_splits)
    else:
        skf = StratifiedKFold(n_splits=n_splits)
    skf.get_n_splits(x_train, y_train)

    for i, (_, test_index) in enumerate(skf.split(x_train, y_train)):
        x_eq.append(x_train[test_index])
        y_eq.append(y_train[test_index])

    x_split = []
    y_split = []

    acc = 0
    for s in split:
        x_split.append(torch.cat(x_eq[acc:acc+int(s*n_splits)], 0))
        y_split.append(torch.cat(y_eq[acc:acc+int(s*n_splits)], 0))
        acc += int(s*n_splits)

    return x_split, y_split

In [10]:
from imblearn.over_sampling import RandomOverSampler

def run_local(split, X=ts, y=ts_labels):

    mses = []
    aucs = []

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    start = datetime.datetime.now()
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        print(f"Fold {fold_idx + 1}:")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        random_state = np.random.randint(1000)

        x_split, y_split = fl_split(split, X_train, y_train)

        for X_train, y_train in zip(x_split, y_split):
            ros = RandomOverSampler(random_state=random_state)
            X_resampled, y_resampled = ros.fit_resample(X_train.reshape(-1, 3*23), y_train)
            X_train = X_resampled.reshape(X_resampled.shape[0], 3, 23)
            X_train = torch.from_numpy(X_train)
            y_train = torch.from_numpy(y_resampled)

            lstm = LSTMModel(23, 21, hidden_dim=64, n_layers=2)
            optimizer = torch.optim.Adam(lstm.parameters(), lr=0.0001, weight_decay=0.00001)
            train(lstm, optimizer, X_train, y_train, X_test, y_test, batch_size=4, num_epoch=128)

            print('x_test shape', X_test.shape)

            mse, mse_norm, auc, f1 = test(lstm, X_test, y_test)

            aucs.append(auc)
            mses.append(mse)

    end = datetime.datetime.now()
    time = end - start

    print('split',split)
    print('Training time: ', time)
    print('mse',mses)
    print('auc',aucs)


In [11]:
# split =[1]
# run_local(split, ts, ts_labels)

In [12]:
# split =[0.5, 0.5]
# run_local(split)

In [13]:
# split =[0.6, 0.4]
# run_local(split)

In [14]:
# split =[0.8, 0.2]
# run_local(split)

In [15]:
# split = [0.34, 0.33, 0.33]
# run_local(split)

In [16]:
# split = [0.6, 0.2, 0.2]
# run_local(split)

In [17]:
# split = [0.8, 0.1, 0.1]
# run_local(split)

In [18]:
# split =[0.2, 0.2, 0.2, 0.2, 0.2]
# run_local(split)

In [19]:
# split = [0.6, 0.1, 0.1, 0.1, 0.1]
# run_local(split)

In [20]:
# split = [0.8, 0.05, 0.05, 0.05, 0.05]
# run_local(split)

In [21]:
# split = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
# run_local(split)

In [22]:
split = [0.6, 0.05, 0.05, 0.05, 0.05, 0.04, 0.04, 0.04, 0.04, 0.04]
run_local(split)



Fold 1:
Epoch: 10, Loss: 2.0376784801483154, MSE 0.9060773480662984 AUC 0.9500855327468232
Epoch: 20, Loss: 1.4278347492218018, MSE 0.7900552486187845 AUC 0.9500855327468232
Epoch: 30, Loss: 2.5661075115203857, MSE 0.7403314917127072 AUC 0.9506964809384165
Epoch: 40, Loss: 1.6293890476226807, MSE 0.6961325966850829 AUC 0.9500855327468232
Epoch: 50, Loss: 1.193361520767212, MSE 0.7348066298342542 AUC 0.9500855327468232
Epoch: 60, Loss: 0.3542836904525757, MSE 0.7513812154696132 AUC 0.9500855327468232
Epoch: 70, Loss: 2.0560085773468018, MSE 0.7182320441988951 AUC 0.9500855327468232
Epoch: 80, Loss: 1.3434958457946777, MSE 0.8066298342541437 AUC 0.9560728250244379
Epoch: 90, Loss: 0.14341680705547333, MSE 0.6906077348066298 AUC 0.9560728250244379
Epoch: 100, Loss: 0.15579402446746826, MSE 0.7016574585635359 AUC 0.9560728250244379
Epoch: 110, Loss: 0.8042114973068237, MSE 0.7071823204419889 AUC 0.9560728250244379
Epoch: 120, Loss: 0.3244974911212921, MSE 0.7292817679558011 AUC 0.945625610

In [23]:
split = [0.8, 0.03, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02]
run_local(split)