In [1]:
import os
import string
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import lxml
import en_core_web_sm
import lightgbm as lgb

from sklearn import preprocessing, model_selection, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm it is still needed before paying for the spaCy model load.
nlp = en_core_web_sm.load()

# Raise pandas' column display limit so wide frames are shown in full.
pd.options.display.max_columns = 1000

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Labelled training rows; the File_Name column and the RD_party_A_moody_* label
# columns are read from this frame further down.
train_df = pd.read_csv("../input/train_labels.csv")
# Submission frame: the predicted columns are written into it cell by cell below.
sub_df = pd.read_csv("./baseline_with_dates.csv")  # sample submission file

In [3]:
def prepare_data(row, col="RD_party_A_moody_long_term_threshold", keyword="moody", data_path="../input/data/", train_flag=1):
    """Collect the text of every ``<block>`` of one document that mentions ``keyword``.

    Parameters
    ----------
    row : mapping with a ``"File_Name"`` entry (e.g. a row from ``DataFrame.iterrows``).
        The document is read from ``data_path + File_Name + ".xml"``.
    col : str
        Unused. Kept so existing calls keep working; the label column is no
        longer read here because its value was never used.
    keyword : str
        Substring searched for in the lower-cased block text.
    data_path : str
        Prefix prepended to the file name (no separator is inserted).
    train_flag : int
        Unused. Kept for call compatibility.

    Returns
    -------
    list
        A one-element list ``[[file_name, text]]`` where ``text`` is the
        lower-cased text of all matching blocks, each preceded by one space
        (an empty string when no block matches).
    """
    file_name = row["File_Name"]
    with open(data_path + file_name + ".xml") as fname:
        xml_text = fname.read()

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed and emits a warning. lxml is already imported by this notebook.
    soup = BeautifulSoup(xml_text, "lxml")

    matching_texts = []
    for block in soup.find_all("block"):
        block_text = block.get_text().strip().lower()
        if keyword in block_text:
            matching_texts.append(block_text)

    # Same layout as repeated `full_txt + " " + text`: a leading space per block.
    full_txt = "".join(" " + text for text in matching_texts)
    return [[file_name, full_txt]]

# Build the (file_name, text) table for the training documents and cache it on disk.
full_features = pd.DataFrame(
    [
        feature_row
        for _, row in train_df.iterrows()
        for feature_row in prepare_data(
            row,
            col="RD_party_A_moody_long_term_threshold",
            data_path="../input/data/",
        )
    ]
)
full_features.columns = ["file_name", "text"]
full_features.to_csv("train_moody.csv", index=False)
full_features.head()

# Same extraction for the documents listed in the submission frame (no labels available).
full_features = pd.DataFrame(
    [
        feature_row
        for _, row in sub_df.iterrows()
        for feature_row in prepare_data(
            row,
            col="RD_party_A_moody_long_term_threshold",
            data_path="../input/public_test_data/",
            train_flag=0,
        )
    ]
)
full_features.columns = ["file_name", "text"]
full_features.to_csv("test_moody.csv", index=False)

In [4]:
# Reload the text tables written by the extraction cell.
train_party_df = pd.read_csv("train_moody.csv")
test_party_df = pd.read_csv("test_moody.csv")

full_df = pd.concat([train_party_df, test_party_df], axis=0)

n_components = 25
train_X = []
test_X = []
# One TF-IDF (1- to 3-grams) + truncated-SVD projection per text column.
for i in ["text"]:
    print(f'generating features from: {i}')
    tfv = TfidfVectorizer(max_features=None, ngram_range=(1, 3))
    svd_ = TruncatedSVD(n_components=n_components, random_state=1337)

    # Vocabulary and SVD basis are fitted on train and test text together.
    svd_.fit(tfv.fit_transform(full_df[i].astype(str).values))

    # Project each split with the fitted transformers (train first, then test).
    for part_df, sink in ((train_party_df, train_X), (test_party_df, test_X)):
        tfidf_col = tfv.transform(part_df[i].astype(str).values)
        svd_col = pd.DataFrame(svd_.transform(tfidf_col))
        svd_col = svd_col.add_prefix('TFIDF_{}_'.format(i))
        sink.append(svd_col)

train_X = pd.concat(train_X, axis=1)
test_X = pd.concat(test_X, axis=1)

train_X.to_csv("train_moody_vec.csv", index=False)
test_X.to_csv("test_moody_vec.csv", index=False)

generating features from: text


### RD_party_A_moody_long_term_threshold

In [5]:
def map_target(x):
    """Encode a Moody's long-term threshold label as a class id.

    "Baa3" -> 1, "A3" -> 2, "A2" -> 3; every other value -> 0.
    """
    if x == "Baa3":
        return 1
    if x == "A3":
        return 2
    if x == "A2":
        return 3
    return 0
    
# Integer-encode the label column that the next model is trained on.
target = "RD_party_A_moody_long_term_threshold"
train_y = train_df[target].apply(map_target)
train_group = train_party_df["file_name"].values

In [6]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=8, seed=0):
    """Train a 4-class LightGBM model and predict on up to two feature sets.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels.
    test_X : features to predict. When ``test_y`` is given it is also the
        validation set used for early stopping (200 rounds patience).
    test_y : optional labels for ``test_X``.
    test_X2 : optional second feature set to predict (e.g. the real test set).
    dep : value for ``max_depth``.
    seed : value for ``bagging_seed``.

    Returns
    -------
    tuple
        ``(model, loss, pred_test_y, pred_test_y2)``. ``loss`` is the
        multiclass log-loss on ``test_X`` when ``test_y`` is given, else 0.
        ``pred_test_y2`` is ``None`` when ``test_X2`` is not supplied.
    """
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 4,
        "max_depth": dep,
        "min_data_in_leaf": 1,
        "learning_rate": 0.01,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": seed,
        "verbosity": -1,
    }
    num_rounds = 2000

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=100)
    else:
        # No labels to validate against: train for the full number of rounds.
        # (The former `lgb.DMatrix(test_X)` call was removed: LightGBM has no
        # DMatrix class — that is XGBoost's API — and the result was unused.)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # test_X2 defaults to None; only predict on it when it was actually passed.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.log_loss(test_y, pred_test_y, labels=[0,1,2,3])
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [7]:
# 5-fold stratified CV. Each fold model scores its held-out rows (stored in
# pred_train) and the test set (averaged into pred_test_full).
cv_scores = []
pred_test_full = 0
pred_train = np.zeros([train_X.shape[0], 4])  # one column per class, matching num_class in runLGB
n_splits = 5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
model_name = "lgb"
for dev_index, val_index in skf.split(train_X, train_y):
    dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
    # split() yields positional indices, so the labels are selected by
    # position too (plain [] would do a label lookup on the Series index).
    dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]

    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X)

    pred_train[val_index] = pred_v
    pred_test_full += (pred_t/float(n_splits))

    cv_scores.append(loss)
    print(cv_scores)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's multi_logloss: 0.486787
[200]	valid_0's multi_logloss: 0.326343
[300]	valid_0's multi_logloss: 0.269182
[400]	valid_0's multi_logloss: 0.258878
[500]	valid_0's multi_logloss: 0.273704
Early stopping, best iteration is:
[365]	valid_0's multi_logloss: 0.256236
0.2562358757794602
[0.2562358757794602]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's multi_logloss: 0.49713
[200]	valid_0's multi_logloss: 0.379629
[300]	valid_0's multi_logloss: 0.37451
[400]	valid_0's multi_logloss: 0.406314
Early stopping, best iteration is:
[263]	valid_0's multi_logloss: 0.369194
0.3691937557786826
[0.2562358757794602, 0.3691937557786826]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's multi_logloss: 0.604564
[200]	valid_0's multi_logloss: 0.501376
[300]	valid_0's multi_logloss: 0.507934
[400]	valid_0's multi_logloss: 0.549376
Early stopping, best iteration is:
[225]	va

In [8]:
# Out-of-fold accuracy: pred_train was filled fold by fold with validation-set
# class probabilities, so argmax over its columns is each row's predicted class id.
metrics.accuracy_score(train_y, pred_train.argmax(axis=1))

0.8795180722891566

In [9]:
# pred_test_full accumulates each fold's test probabilities divided by n_splits;
# pick the most probable class id per test row.
pred_test = pred_test_full.argmax(axis=1)

def map_target(x):
    """Decode a class id back to a Moody's long-term threshold label.

    1 -> "Baa3", 2 -> "A3", 3 -> "A2"; every other value -> "NotFound".
    """
    if x == 1:
        return "Baa3"
    if x == 2:
        return "A3"
    if x == 3:
        return "A2"
    return "NotFound"

# Reload the test file list and attach the decoded label for every test row.
test_party_df = pd.read_csv("test_moody.csv")
test_predictions = [map_target(pred) for pred in pred_test]
test_party_df["pred_moody"] = test_predictions

# The submission frame and the prediction frame must have the same number of rows.
assert len(sub_df) == len(test_party_df)

In [10]:
# Join the predictions onto the submission rows by file name, move them into
# the target column and drop the helper columns the merge brought in.
sub_df = sub_df.merge(
    test_party_df[["file_name", "pred_moody"]],
    how="left",
    left_on="File_Name",
    right_on="file_name",
)
sub_df["RD_party_A_moody_long_term_threshold"] = sub_df.pop("pred_moody").values
sub_df = sub_df.drop(columns=["file_name"])
sub_df.to_csv("baseline_with_date_mltt.csv", index=False)
sub_df.head()

### RD_party_A_moody_short_term_threshold

In [11]:
def map_target(x):
    """Encode a Moody's short-term threshold label: "P-1" -> 1, anything else -> 0."""
    return 1 if x == "P-1" else 0
    
# Binary-encode the label column that the next model is trained on.
target = "RD_party_A_moody_short_term_threshold"
train_y = train_df[target].apply(map_target)
train_group = train_party_df["file_name"].values

In [12]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=8, seed=0):
    """Train a binary LightGBM model and predict on up to two feature sets.

    Parameters
    ----------
    train_X, train_y : training features and 0/1 labels.
    test_X : features to predict. When ``test_y`` is given it is also the
        validation set used for early stopping (200 rounds patience).
    test_y : optional labels for ``test_X``.
    test_X2 : optional second feature set to predict (e.g. the real test set).
    dep : value for ``max_depth``.
    seed : value for ``bagging_seed``.

    Returns
    -------
    tuple
        ``(model, loss, pred_test_y, pred_test_y2)``. Despite its name,
        ``loss`` is the ROC AUC on ``test_X`` when ``test_y`` is given, else 0.
        ``pred_test_y2`` is ``None`` when ``test_X2`` is not supplied.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "max_depth": dep,
        "min_data_in_leaf": 1,
        "learning_rate": 0.01,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": seed,
        "verbosity": -1,
    }
    num_rounds = 2000

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=100)
    else:
        # No labels to validate against: train for the full number of rounds.
        # (The former `lgb.DMatrix(test_X)` call was removed: LightGBM has no
        # DMatrix class — that is XGBoost's API — and the result was unused.)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # test_X2 defaults to None; only predict on it when it was actually passed.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [13]:
# 5-fold stratified CV. Each fold model scores its held-out rows (stored in
# pred_train) and the test set (averaged into pred_test_full).
cv_scores = []  # collects the per-fold score returned by runLGB
pred_test_full = 0
pred_train = np.zeros([train_X.shape[0]])  # one positive-class probability per training row
n_splits = 5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
model_name = "lgb"
for dev_index, val_index in skf.split(train_X, train_y):
    dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
    # split() yields positional indices, so the labels are selected by
    # position too (plain [] would do a label lookup on the Series index).
    dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]

    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X)

    pred_train[val_index] = pred_v
    pred_test_full += (pred_t/float(n_splits))

    cv_scores.append(loss)
    print(cv_scores)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.985714
[200]	valid_0's auc: 0.9875
[300]	valid_0's auc: 0.9875
Early stopping, best iteration is:
[166]	valid_0's auc: 0.9875
0.9875
[0.9875]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.984762
[200]	valid_0's auc: 0.990476
Early stopping, best iteration is:
[2]	valid_0's auc: 0.994286
0.9904761904761905
[0.9875, 0.9904761904761905]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.994286
[200]	valid_0's auc: 0.992381
Early stopping, best iteration is:
[31]	valid_0's auc: 1
1.0
[0.9875, 0.9904761904761905, 1.0]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.996078
[200]	valid_0's auc: 0.996078
Early stopping, best iteration is:
[42]	valid_0's auc: 1
1.0
[0.9875, 0.9904761904761905, 1.0, 1.0]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.935294
[200

In [14]:
# Out-of-fold accuracy after thresholding the predicted probabilities.
# NOTE(review): 0.32 is a hard-coded cut-off, presumably hand-tuned on these
# out-of-fold predictions — confirm, and keep it in sync with the test-set cut-off.
metrics.accuracy_score(train_y, pred_train>0.32)

0.9357429718875502

In [15]:
# Binarise the fold-averaged test probabilities with the same 0.32 cut-off
# that the out-of-fold accuracy check uses.
pred_test = (pred_test_full > 0.32).astype(int)

def map_target(x):
    """Decode a binary prediction: 1 -> "P-1", anything else -> "NotFound"."""
    return "P-1" if x == 1 else "NotFound"

# Reload the test file list and attach the decoded label for every test row.
test_party_df = pd.read_csv("test_moody.csv")
test_predictions = [map_target(pred) for pred in pred_test]
test_party_df["pred_moody"] = test_predictions

# The submission frame and the prediction frame must have the same number of rows.
assert len(sub_df) == len(test_party_df)

In [16]:
# Join the predictions onto the submission rows by file name, move them into
# the target column and drop the helper columns the merge brought in.
sub_df = sub_df.merge(
    test_party_df[["file_name", "pred_moody"]],
    how="left",
    left_on="File_Name",
    right_on="file_name",
)
sub_df["RD_party_A_moody_short_term_threshold"] = sub_df.pop("pred_moody").values
sub_df = sub_df.drop(columns=["file_name"])
sub_df.to_csv("baseline_with_date_mltt2.csv", index=False)
sub_df.head()

In [17]:
# Sanity check: how many submission rows received each predicted label.
sub_df["RD_party_A_moody_short_term_threshold"].value_counts()

NotFound    41
P-1         20
Name: RD_party_A_moody_short_term_threshold, dtype: int64

### RD_party_A_moody_short_term_trigger_method

In [18]:
def map_target(x):
    """Encode a short-term trigger method label: "below" -> 1, anything else -> 0."""
    return 1 if x == "below" else 0
    
# Binary-encode the label column that the next model is trained on.
target = "RD_party_A_moody_short_term_trigger_method"
train_y = train_df[target].apply(map_target)
train_group = train_party_df["file_name"].values

In [19]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=8, seed=0):
    """Train a binary LightGBM model and predict on up to two feature sets.

    Parameters
    ----------
    train_X, train_y : training features and 0/1 labels.
    test_X : features to predict. When ``test_y`` is given it is also the
        validation set used for early stopping (200 rounds patience).
    test_y : optional labels for ``test_X``.
    test_X2 : optional second feature set to predict (e.g. the real test set).
    dep : value for ``max_depth``.
    seed : value for ``bagging_seed``.

    Returns
    -------
    tuple
        ``(model, loss, pred_test_y, pred_test_y2)``. Despite its name,
        ``loss`` is the ROC AUC on ``test_X`` when ``test_y`` is given, else 0.
        ``pred_test_y2`` is ``None`` when ``test_X2`` is not supplied.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "max_depth": dep,
        "min_data_in_leaf": 1,
        "learning_rate": 0.01,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": seed,
        "verbosity": -1,
    }
    num_rounds = 2000

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=100)
    else:
        # No labels to validate against: train for the full number of rounds.
        # (The former `lgb.DMatrix(test_X)` call was removed: LightGBM has no
        # DMatrix class — that is XGBoost's API — and the result was unused.)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # test_X2 defaults to None; only predict on it when it was actually passed.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [20]:
# 5-fold stratified CV. Each fold model scores its held-out rows (stored in
# pred_train) and the test set (averaged into pred_test_full).
cv_scores = []  # collects the per-fold score returned by runLGB
pred_test_full = 0
pred_train = np.zeros([train_X.shape[0]])  # one positive-class probability per training row
n_splits = 5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
model_name = "lgb"
for dev_index, val_index in skf.split(train_X, train_y):
    dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
    # split() yields positional indices, so the labels are selected by
    # position too (plain [] would do a label lookup on the Series index).
    dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]

    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X)

    pred_train[val_index] = pred_v
    pred_test_full += (pred_t/float(n_splits))

    cv_scores.append(loss)
    print(cv_scores)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.974747
[200]	valid_0's auc: 0.978114
Early stopping, best iteration is:
[38]	valid_0's auc: 0.979798
0.9797979797979797
[0.9797979797979797]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.969697
[200]	valid_0's auc: 0.976827
[300]	valid_0's auc: 0.97861
[400]	valid_0's auc: 0.964349
[500]	valid_0's auc: 0.953654
Early stopping, best iteration is:
[311]	valid_0's auc: 0.980392
0.9803921568627451
[0.9797979797979797, 0.9803921568627451]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.948307
[200]	valid_0's auc: 0.946524
Early stopping, best iteration is:
[30]	valid_0's auc: 0.955437
0.9554367201426025
[0.9797979797979797, 0.9803921568627451, 0.9554367201426025]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.985294
[200]	valid_0's auc: 0.981618
Early stopping, best iteration is:
[38]	valid_

In [21]:
# Out-of-fold accuracy after thresholding the predicted probabilities.
# NOTE(review): 0.375 is a hard-coded cut-off, presumably hand-tuned on these
# out-of-fold predictions — confirm, and keep it in sync with the test-set cut-off.
metrics.accuracy_score(train_y, pred_train>0.375)

0.9437751004016064

In [22]:
# How many test rows fall on each side of the 0.375 cut-off.
pd.Series(pred_test_full>0.375).value_counts()

False    39
True     22
dtype: int64

In [23]:
# Binarise the fold-averaged test probabilities with the same 0.375 cut-off
# that the out-of-fold accuracy check uses.
pred_test = (pred_test_full > 0.375).astype(int)

def map_target(x):
    """Decode a binary prediction: 1 -> "below", anything else -> "NotFound"."""
    return "below" if x == 1 else "NotFound"

# Reload the test file list and attach the decoded label for every test row.
test_party_df = pd.read_csv("test_moody.csv")
test_predictions = [map_target(pred) for pred in pred_test]
test_party_df["pred_moody"] = test_predictions

# The submission frame and the prediction frame must have the same number of rows.
assert len(sub_df) == len(test_party_df)

In [24]:
# Join the predictions onto the submission rows by file name, move them into
# the target column and drop the helper columns the merge brought in.
sub_df = sub_df.merge(
    test_party_df[["file_name", "pred_moody"]],
    how="left",
    left_on="File_Name",
    right_on="file_name",
)
sub_df["RD_party_A_moody_short_term_trigger_method"] = sub_df.pop("pred_moody").values
sub_df = sub_df.drop(columns=["file_name"])
sub_df.head()

Unnamed: 0,File_Name,date,party_A,party B,TC_currency,TC_Bespoke,calulation_agent,CA_fallback_default_dispute,CA_dispute_resolution,RD_method_party_A,RD_party_A_short_term_debt_classification,RD_party_A_moody_short_term_trigger_method,RD_party_A_moody_short_term_threshold,RD_party_A_SnP_short_term_trigger_method,RD_party_A_SnP_short_term_threshold,RD_party_A_Fitch_short_term_trigger_method,RD_party_A_Fitch_short_term_threshold,RD_party_A_long_term_debt_classification,RD_party_A_moody_long_term_trigger_method,RD_party_A_moody_long_term_threshold,RD_party_A_SnP_long_term_trigger_method,RD_party_A_SnP_long_term_threshold,RD_party_A_Fitch_long_term_trigger_method,RD_party_A_Fitch_long_term_threshold,Cross_default_threshold_type,Cross_default_percentage/amount,Cross_default_measure,multibranch_party_A,multibranch_party_B,governing law
0,20_bk,20040607,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,A3,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
1,43_bk,19970723,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
2,66_sh,19941221,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
3,ch5,20070830,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,A3,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
4,76_sh,20061212,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,below,P-1,NotFound,NotFound,NotFound,NotFound,NotFound,below,A2,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law


### RD_party_A_moody_long_term_trigger_method

In [25]:
def map_target(x):
    """Encode a long-term trigger method label as a class id.

    "NotFound" -> 1, "at" -> 2; every other value (including "below") -> 0.
    """
    if x == "NotFound":
        return 1
    if x == "at":
        return 2
    return 0
    
# Integer-encode the label column that the next model is trained on.
target = "RD_party_A_moody_long_term_trigger_method"
train_y = train_df[target].apply(map_target)
train_group = train_party_df["file_name"].values

In [26]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=8, seed=0):
    """Train a 3-class LightGBM model and predict on up to two feature sets.

    Parameters
    ----------
    train_X, train_y : training features and integer class labels.
    test_X : features to predict. When ``test_y`` is given it is also the
        validation set used for early stopping (200 rounds patience).
    test_y : optional labels for ``test_X``.
    test_X2 : optional second feature set to predict (e.g. the real test set).
    dep : value for ``max_depth``.
    seed : value for ``bagging_seed``.

    Returns
    -------
    tuple
        ``(model, loss, pred_test_y, pred_test_y2)``. ``loss`` is the
        multiclass log-loss on ``test_X`` when ``test_y`` is given, else 0.
        ``pred_test_y2`` is ``None`` when ``test_X2`` is not supplied.
    """
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 3,
        "max_depth": dep,
        "min_data_in_leaf": 1,
        "learning_rate": 0.01,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": seed,
        "verbosity": -1,
    }
    num_rounds = 2000

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=100)
    else:
        # No labels to validate against: train for the full number of rounds.
        # (The former `lgb.DMatrix(test_X)` call was removed: LightGBM has no
        # DMatrix class — that is XGBoost's API — and the result was unused.)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # test_X2 defaults to None; only predict on it when it was actually passed.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.log_loss(test_y, pred_test_y, labels=[0,1,2])
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [27]:
# 5-fold stratified CV. Each fold model scores its held-out rows (stored in
# pred_train) and the test set (averaged into pred_test_full).
cv_scores = []
pred_test_full = 0
pred_train = np.zeros([train_X.shape[0], 3])  # one column per class, matching num_class in runLGB
n_splits = 5
skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
model_name = "lgb"
for dev_index, val_index in skf.split(train_X, train_y):
    dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
    # split() yields positional indices, so the labels are selected by
    # position too (plain [] would do a label lookup on the Series index).
    dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]

    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X)

    pred_train[val_index] = pred_v
    pred_test_full += (pred_t/float(n_splits))

    cv_scores.append(loss)
    print(cv_scores)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's multi_logloss: 0.337152
[200]	valid_0's multi_logloss: 0.197706
[300]	valid_0's multi_logloss: 0.140687
[400]	valid_0's multi_logloss: 0.1107
[500]	valid_0's multi_logloss: 0.0990302
[600]	valid_0's multi_logloss: 0.0909704
[700]	valid_0's multi_logloss: 0.0827391
[800]	valid_0's multi_logloss: 0.0747786
[900]	valid_0's multi_logloss: 0.076104
[1000]	valid_0's multi_logloss: 0.0754051
[1100]	valid_0's multi_logloss: 0.0711801
[1200]	valid_0's multi_logloss: 0.0660894
[1300]	valid_0's multi_logloss: 0.0693735
Early stopping, best iteration is:
[1195]	valid_0's multi_logloss: 0.066
0.06599996273201475
[0.06599996273201475]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's multi_logloss: 0.424626
[200]	valid_0's multi_logloss: 0.311853
[300]	valid_0's multi_logloss: 0.272027
[400]	valid_0's multi_logloss: 0.274842
[500]	valid_0's multi_logloss: 0.28339
Early stopping, best iteration

In [28]:
# Out-of-fold accuracy: pred_train was filled fold by fold with validation-set
# class probabilities, so argmax over its columns is each row's predicted class id.
metrics.accuracy_score(train_y, pred_train.argmax(axis=1))

0.9156626506024096

In [29]:
# pred_test_full accumulates each fold's test probabilities divided by n_splits;
# pick the most probable class id per test row.
pred_test = pred_test_full.argmax(axis=1)

def map_target(x):
    """Decode a class id back to a long-term trigger method label.

    1 -> "NotFound", 2 -> "at"; every other value (including 0) -> "below".
    """
    if x == 1:
        return "NotFound"
    if x == 2:
        return "at"
    return "below"

# Reload the test file list and attach the decoded label for every test row.
test_party_df = pd.read_csv("test_moody.csv")
test_predictions = [map_target(pred) for pred in pred_test]
test_party_df["pred_moody"] = test_predictions

# The submission frame and the prediction frame must have the same number of rows.
assert len(sub_df) == len(test_party_df)

# Distribution of the predicted class ids over the test rows.
pd.Series(pred_test).value_counts()

0    49
1     7
2     5
dtype: int64

In [30]:
# Join the predictions onto the submission rows by file name, move them into
# the target column and drop the helper columns the merge brought in.
sub_df = sub_df.merge(
    test_party_df[["file_name", "pred_moody"]],
    how="left",
    left_on="File_Name",
    right_on="file_name",
)
sub_df["RD_party_A_moody_long_term_trigger_method"] = sub_df.pop("pred_moody").values
sub_df = sub_df.drop(columns=["file_name"])
# sub_df.to_csv("baseline_with_date_mltt3.csv", index=False)
# sub_df.head()

Unnamed: 0,File_Name,date,party_A,party B,TC_currency,TC_Bespoke,calulation_agent,CA_fallback_default_dispute,CA_dispute_resolution,RD_method_party_A,RD_party_A_short_term_debt_classification,RD_party_A_moody_short_term_trigger_method,RD_party_A_moody_short_term_threshold,RD_party_A_SnP_short_term_trigger_method,RD_party_A_SnP_short_term_threshold,RD_party_A_Fitch_short_term_trigger_method,RD_party_A_Fitch_short_term_threshold,RD_party_A_long_term_debt_classification,RD_party_A_moody_long_term_trigger_method,RD_party_A_moody_long_term_threshold,RD_party_A_SnP_long_term_trigger_method,RD_party_A_SnP_long_term_threshold,RD_party_A_Fitch_long_term_trigger_method,RD_party_A_Fitch_long_term_threshold,Cross_default_threshold_type,Cross_default_percentage/amount,Cross_default_measure,multibranch_party_A,multibranch_party_B,governing law
0,20_bk,20040607,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,A3,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
1,43_bk,19970723,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
2,66_sh,19941221,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
3,ch5,20070830,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,A3,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
4,76_sh,20061212,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,below,P-1,NotFound,NotFound,NotFound,NotFound,NotFound,below,A2,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
