In [2]:
import os
import string
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import lxml
import en_core_web_sm
import lightgbm as lgb

from sklearn import preprocessing, model_selection, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Load the spaCy `en_core_web_sm` English pipeline once, up front.
# NOTE(review): `nlp` is not referenced in the cells visible here — confirm it is still needed.
nlp = en_core_web_sm.load()

# Raise pandas' column display cap so wide frames are shown in full.
pd.options.display.max_columns = 1000

In [3]:
# Training labels; later cells read the `File_Name` and `TC_Bespoke` columns from it.
train_df = pd.read_csv("../input/train_labels.csv")
# Existing submission file used as the template: its `File_Name` column drives the
# test-set feature extraction and its `TC_Bespoke` column is overwritten at the end.
sub_df = pd.read_csv("../input/baseline_with_date_mltt2_snp.csv")

In [4]:
# Preview the first rows of the submission template.
sub_df.head()

Unnamed: 0,File_Name,date,party_A,party B,TC_currency,TC_Bespoke,calulation_agent,CA_fallback_default_dispute,CA_dispute_resolution,RD_method_party_A,RD_party_A_short_term_debt_classification,RD_party_A_moody_short_term_trigger_method,RD_party_A_moody_short_term_threshold,RD_party_A_SnP_short_term_trigger_method,RD_party_A_SnP_short_term_threshold,RD_party_A_Fitch_short_term_trigger_method,RD_party_A_Fitch_short_term_threshold,RD_party_A_long_term_debt_classification,RD_party_A_moody_long_term_trigger_method,RD_party_A_moody_long_term_threshold,RD_party_A_SnP_long_term_trigger_method,RD_party_A_SnP_long_term_threshold,RD_party_A_Fitch_long_term_trigger_method,RD_party_A_Fitch_long_term_threshold,Cross_default_threshold_type,Cross_default_percentage/amount,Cross_default_measure,multibranch_party_A,multibranch_party_B,governing law
0,20_bk,20040607,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,A3,below,A-,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
1,43_bk,19970723,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
2,66_sh,19941221,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
3,ch5,20070830,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,A3,below,A-,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
4,76_sh,20061212,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,P-1,NotFound,A1,NotFound,NotFound,NotFound,below,A2,below,AA-,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law


In [6]:
def prepare_data(row, col="TC_Bespoke", keyword="termination currency", data_path="../data/train_data/", train_flag=1):
    """Collect the text of every ``<block>`` in one document that mentions ``keyword``.

    The document is read from ``data_path + row["File_Name"] + ".xml"``. Each
    ``<block>`` element's text is stripped and lower-cased; the blocks that
    contain ``keyword`` are concatenated (each preceded by a single space) into
    one string.

    Parameters
    ----------
    row : mapping
        Must provide ``"File_Name"``; when ``train_flag`` is truthy it must also
        provide ``col``.
    col : str
        Name of the label column read from ``row`` for training rows.
    keyword : str
        Phrase to look for. Matching is case-insensitive.
    data_path : str
        Directory prefix (including the trailing separator) of the XML files.
    train_flag : int or bool
        Truthy for labelled rows; falsy rows get the placeholder target ``0``.

    Returns
    -------
    list of list
        A single record ``[[file_name, text, target]]``; ``text`` is ``""`` when
        no block contains the keyword.

    Raises
    ------
    OSError
        If the XML file cannot be opened.
    """
    file_name = row["File_Name"]
    # Unlabelled (test) rows have no target column to read, so store 0 instead.
    target = row[col] if train_flag else 0

    # Read the whole file first so the handle is closed before parsing starts.
    with open(data_path + file_name + ".xml", encoding="utf-8") as fname:
        xml_text = fname.read()

    # Name the parser explicitly: without it BeautifulSoup emits a warning and
    # picks whichever parser is installed. "lxml" is its preferred default and
    # the file already imports lxml, so this pins the behaviour seen so far.
    soup = BeautifulSoup(xml_text, "lxml")

    # Block text is lower-cased, so the keyword must be lower-cased too.
    needle = keyword.lower()

    full_txt = ""
    for block in soup.find_all("block"):
        block_text = block.get_text().strip().lower()
        if needle in block_text:
            full_txt = full_txt + " " + block_text

    return [[file_name, full_txt, target]]

def _build_feature_frame(label_df, data_path, train_flag=1):
    """Run ``prepare_data`` on every row of ``label_df`` and stack the records.

    Returns a DataFrame with columns ``file_name``, ``text`` and ``target``.
    Passing the column names to the constructor also keeps an empty input from
    failing on a column-count mismatch.
    """
    records = []
    for _, row in label_df.iterrows():
        records.extend(
            prepare_data(row, col="TC_Bespoke", data_path=data_path, train_flag=train_flag)
        )
    return pd.DataFrame(records, columns=["file_name", "text", "target"])


# Labelled documents -> train_tcbespoke.csv
train_features = _build_feature_frame(train_df, "../data/train_data/")
train_features.to_csv("train_tcbespoke.csv", index=False)

# Documents listed in the submission file -> test_tcbespoke.csv (target is the placeholder 0).
# The result stays bound to `full_features` because the following cell previews it.
full_features = _build_feature_frame(sub_df, "../data/test_data/", train_flag=0)
full_features.to_csv("test_tcbespoke.csv", index=False)

In [7]:
# Preview the extracted test-set features (the last frame bound to `full_features`).
full_features.head()

Unnamed: 0,file_name,text,target
0,20_bk,value of that which was (or would have been) ...,0
1,43_bk,"""termination currency"" means the currency sel...",0
2,66_sh,"""termination currency"" means united states do...",0
3,ch5,value of that which was (or would have been) ...,0
4,76_sh,value of that which was (or would have been) ...,0


In [8]:
# Reload the per-document feature tables from disk.
train_party_df = pd.read_csv("train_tcbespoke.csv")
test_party_df = pd.read_csv("test_tcbespoke.csv")

# Encode the two label strings as 0/1; any other label value maps to NaN.
map_dict = {"To be selected by the Non-defaulting party or non-affected party":0, "NotFound":1}
train_y = train_party_df["target"].map(map_dict)
# One group id per row, used later for grouped cross-validation.
train_group = train_party_df["file_name"].values

# Train and test text stacked together: the vectorizer and SVD are fitted on both.
full_df = pd.concat([train_party_df, test_party_df], axis=0)

n_components = 12
train_parts, test_parts = [], []
# Build dense text features: TF-IDF over 1- to 3-grams, reduced with truncated SVD.
# NOTE(review): documents with no matching block were written as empty text, which
# presumably comes back from read_csv as NaN and becomes the token "nan" after
# astype(str) — confirm this is intended.
for i in ["text"]:
    print(f'generating features from: {i}')
    tfv = TfidfVectorizer(max_features=None, ngram_range=(1, 3))
    svd_ = TruncatedSVD(n_components=n_components, random_state=1337)

    # Fit both transformers on the combined corpus.
    svd_.fit(tfv.fit_transform(full_df[i].astype(str).values))

    # Project train first, then test, through the fitted pipeline.
    for source_df, sink in ((train_party_df, train_parts), (test_party_df, test_parts)):
        reduced = svd_.transform(tfv.transform(source_df[i].astype(str).values))
        sink.append(pd.DataFrame(reduced).add_prefix('TFIDF_{}_'.format(i)))

train_X = pd.concat(train_parts, axis=1)
test_X = pd.concat(test_parts, axis=1)

generating features from: text


In [19]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=8, seed=0):
    """Train a LightGBM binary classifier and predict on up to two feature sets.

    Parameters
    ----------
    train_X, train_y
        Training features and binary labels.
    test_X
        Features to predict on. When ``test_y`` is given this set also serves as
        the validation set for early stopping.
    test_y : optional
        Labels for ``test_X``. If provided, training stops early after 200
        rounds without AUC improvement and the AUC on ``test_X`` is printed.
    test_X2 : optional
        A second feature set to predict on (e.g. the hold-out test data).
    dep : int
        Value for LightGBM's ``max_depth``.
    seed : int
        Value for LightGBM's ``bagging_seed``.

    Returns
    -------
    tuple
        ``(model, loss, pred_test_y, pred_test_y2)``. ``loss`` is the ROC AUC on
        ``test_X`` (higher is better despite the name) or ``0`` when ``test_y``
        is ``None``. ``pred_test_y2`` is ``None`` when ``test_X2`` is ``None``.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "max_depth": dep,
        "min_data_in_leaf": 1,
        "learning_rate": 0.01,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": seed,
        "verbosity": -1,
    }
    num_rounds = 2000

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords were
        # removed from `lgb.train` in LightGBM 4.0; on newer versions switch to the
        # `lgb.early_stopping` / `lgb.log_evaluation` callbacks.
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=100)
    else:
        # No validation labels: train for the full number of rounds. The former
        # `lgb.DMatrix(test_X)` call here was an XGBoost name that LightGBM does
        # not provide, and its result was never used.
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # The second feature set is optional; skip it instead of predicting on None.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [23]:
# Grouped 5-fold cross-validation: rows that share a file name stay in the same fold.
cv_scores = []
pred_test_full = 0  # running average of the per-fold test predictions
pred_train = np.zeros(train_X.shape[0])  # out-of-fold predictions for the training rows
n_splits = 5
gkf = model_selection.GroupKFold(n_splits=n_splits)
model_name = "lgb"
for dev_index, val_index in gkf.split(train_X, train_y, train_group):
    dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
    # The fold indices are positions, so select labels positionally as well;
    # plain `train_y[...]` would look them up as index labels.
    dev_y, val_y = train_y.iloc[dev_index], train_y.iloc[val_index]

    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X)

    pred_train[val_index] = pred_v
    # Each fold contributes 1/n_splits of the final test prediction.
    pred_test_full += (pred_t/float(n_splits))

    cv_scores.append(loss)
    print(cv_scores)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.994286
[200]	valid_0's auc: 0.99619
Early stopping, best iteration is:
[27]	valid_0's auc: 0.998095
0.998095238095238
[0.998095238095238]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.94
[200]	valid_0's auc: 0.941667
Early stopping, best iteration is:
[8]	valid_0's auc: 0.950833
0.9508333333333334
[0.998095238095238, 0.9508333333333334]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.99026
[200]	valid_0's auc: 0.993506
[300]	valid_0's auc: 0.993506
[400]	valid_0's auc: 0.991883
[500]	valid_0's auc: 0.99026
Early stopping, best iteration is:
[321]	valid_0's auc: 0.99513
0.9951298701298701
[0.998095238095238, 0.9508333333333334, 0.9951298701298701]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.981481
[200]	valid_0's auc: 0.979871
Early stopping, best iteration is:
[10]	valid_0's auc: 0.9

In [24]:
# Convert the averaged fold scores into label strings: scores at or below 0.49
# get the "To be selected ..." label, everything else gets "NotFound".
test_bespoke = [
    "To be selected by the Non-defaulting party or non-affected party" if pred <= 0.49 else "NotFound"
    for pred in pred_test_full
]
test_party_df["pred_tc_bespoke"] = test_bespoke

# The submission template and the prediction table must have the same number of rows.
assert sub_df.shape[0] == test_party_df.shape[0]

In [25]:
# Attach the predicted label to each submission row by file name.
predicted_cols = test_party_df[["file_name","pred_tc_bespoke"]]
sub_df = sub_df.merge(predicted_cols, left_on="File_Name", right_on="file_name", how="left")

# Overwrite TC_Bespoke with the prediction, then drop the helper columns from the merge.
sub_df = (
    sub_df
    .assign(TC_Bespoke=sub_df["pred_tc_bespoke"].values)
    .drop(["file_name", "pred_tc_bespoke"], axis=1)
)

sub_df.to_csv("baseline_with_date_bespoke.csv", index=False)
sub_df.head()

Unnamed: 0,File_Name,date,party_A,party B,TC_currency,TC_Bespoke,calulation_agent,CA_fallback_default_dispute,CA_dispute_resolution,RD_method_party_A,RD_party_A_short_term_debt_classification,RD_party_A_moody_short_term_trigger_method,RD_party_A_moody_short_term_threshold,RD_party_A_SnP_short_term_trigger_method,RD_party_A_SnP_short_term_threshold,RD_party_A_Fitch_short_term_trigger_method,RD_party_A_Fitch_short_term_threshold,RD_party_A_long_term_debt_classification,RD_party_A_moody_long_term_trigger_method,RD_party_A_moody_long_term_threshold,RD_party_A_SnP_long_term_trigger_method,RD_party_A_SnP_long_term_threshold,RD_party_A_Fitch_long_term_trigger_method,RD_party_A_Fitch_long_term_threshold,Cross_default_threshold_type,Cross_default_percentage/amount,Cross_default_measure,multibranch_party_A,multibranch_party_B,governing law
0,20_bk,20040607,SOCIETE GENERALE,Societe Generale,USD,NotFound,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,A3,below,A-,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
1,43_bk,19970723,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
2,66_sh,19941221,SOCIETE GENERALE,Societe Generale,USD,NotFound,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
3,ch5,20070830,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,A3,below,A-,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
4,76_sh,20061212,SOCIETE GENERALE,Societe Generale,USD,NotFound,Party A,NotFound,No,1,NotFound,NotFound,P-1,NotFound,A1,NotFound,NotFound,NotFound,below,A2,below,AA-,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
