In [1]:
import os
import string
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import lxml
import en_core_web_sm
import lightgbm as lgb

from sklearn import preprocessing, model_selection, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

nlp = en_core_web_sm.load()

pd.options.display.max_columns = 1000

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Labelled rows for the training documents; later cells read "File_Name" and the party columns from it.
train_df = pd.read_csv("../input/train_labels.csv")
# Baseline submission frame; later cells overwrite its "date", "party_A" and "party B" columns.
sub_df = pd.read_csv("../input/baseline.csv") ### input the sample sub file

In [3]:
sub_df.head()

Unnamed: 0,File_Name,date,party_A,party B,TC_currency,TC_Bespoke,calulation_agent,CA_fallback_default_dispute,CA_dispute_resolution,RD_method_party_A,RD_party_A_short_term_debt_classification,RD_party_A_moody_short_term_trigger_method,RD_party_A_moody_short_term_threshold,RD_party_A_SnP_short_term_trigger_method,RD_party_A_SnP_short_term_threshold,RD_party_A_Fitch_short_term_trigger_method,RD_party_A_Fitch_short_term_threshold,RD_party_A_long_term_debt_classification,RD_party_A_moody_long_term_trigger_method,RD_party_A_moody_long_term_threshold,RD_party_A_SnP_long_term_trigger_method,RD_party_A_SnP_long_term_threshold,RD_party_A_Fitch_long_term_trigger_method,RD_party_A_Fitch_long_term_threshold,Cross_default_threshold_type,Cross_default_percentage/amount,Cross_default_measure,multibranch_party_A,multibranch_party_B,governing law
0,20_bk,,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
1,43_bk,,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
2,66_sh,,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
3,ch5,,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
4,76_sh,,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law


In [4]:
# Names of all XML documents found in the training data folder.
xml_file_names = [
    entry for entry in os.listdir("../input/data/") if entry.endswith(".xml")
]
len(xml_file_names)

249

### Date Extraction ###

In [5]:
def get_date(doc):
    """Return the text of the first spaCy entity labelled DATE in `doc`, or None if there is none."""
    parsed = nlp(doc)
    date_mentions = (ent.text for ent in parsed.ents if ent.label_ == "DATE")
    return next(date_mentions, None)
    
def parse_dates(date_str):
    """Split a free-text date mention into zero-padded ``[year, month, day]`` strings.

    Commas are treated as whitespace and every token is inspected in turn:

    * a token whose first three letters are the first three letters of an
      English month name sets the month (so "Dec", "Sept." and "December" all work);
    * a 4-digit token sets the year;
    * a 1- or 2-digit token, optionally followed by an ordinal suffix
      ("1st", "2nd", "3rd", "15th"), sets the day.

    When several tokens qualify for the same part, the last one wins. Parts that
    are not found keep their placeholder: "0000" for the year, "00" for month/day.

    Returns:
        list[str]: ``[year, month, day]``.
    """
    month_list = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
    ordinal_suffixes = ("st", "nd", "rd", "th")
    year = "0000"
    month = "00"
    day = "00"
    for token in date_str.replace(",", " ").split():
        lowered = token.lower()
        # Compare against the START of the month name and require three letters.
        # A plain substring test would let short tokens such as "a" or "em"
        # match several months and silently set a wrong month.
        if len(lowered) >= 3:
            for ind, month_name in enumerate(month_list):
                if month_name.startswith(lowered[:3]):
                    month = str(ind + 1).zfill(2)
                    break  # three-letter prefixes are unique across months
        if token.isdecimal() and len(token) == 4:
            year = token
        elif token.isdecimal() and len(token) in (1, 2):
            day = token.zfill(2)
        elif (
            len(lowered) in (3, 4)
            and lowered.endswith(ordinal_suffixes)
            and lowered[:-2].isdecimal()
        ):
            # Ordinal day such as "1st", "22nd", "3rd" or "15th".
            day = lowered[:-2].zfill(2)
    return [year, month, day]

def get_best_date(dates_list):
    """Pick the most complete ``[year, month, day]`` candidate and join it into one string.

    Completeness is the number of parts that differ from their placeholder
    ("0000" for the year, "00" for month and day). The first candidate with the
    highest completeness wins; if no candidate has any part filled in, the
    result is "00000000".
    """
    top_score = 0
    top_date = "00000000"
    for candidate in dates_list:
        # Booleans add up as 0/1, giving the count of filled-in parts.
        filled = (
            (candidate[0] != "0000")
            + (candidate[1] != "00")
            + (candidate[2] != "00")
        )
        if filled > top_score:
            top_score, top_date = filled, candidate
    return "".join(top_date)

def get_date_field(file_name, data_path="../input/data/"):
    """Extract the agreement date of one document as a ``YYYYMMDD``-style string.

    The file ``data_path + file_name + ".xml"`` is parsed and every ``<block>``
    whose text contains "dated as of" (case-insensitive) is inspected. If
    ``get_date`` finds a date in that block it is used; otherwise the blocks
    immediately before and after it are tried. All mentions found are parsed
    with ``parse_dates`` and the most complete one is returned via
    ``get_best_date`` ("00000000" when nothing was found).

    Args:
        file_name: Document name without the ".xml" extension.
        data_path: Folder prefix (including trailing separator) of the XML files.

    Returns:
        str: Concatenated year, month and day; missing parts are zero-filled.
    """
    print(file_name)
    with open(data_path + file_name + ".xml") as fname:
        # Name the parser explicitly. "lxml" is what BeautifulSoup selects on its
        # own when lxml is installed (this notebook imports it), so parsing is
        # unchanged but no longer depends on the environment or emits a warning.
        soup = BeautifulSoup(fname.read(), "lxml")

    block_texts = [block.get_text().strip() for block in soup.find_all("block")]
    last_ind = len(block_texts) - 1

    dates_found = []
    for ind, block_text in enumerate(block_texts):
        if "dated as of" not in block_text.lower():
            continue
        date_ext = get_date(block_text)
        if date_ext is not None:
            dates_found.append(date_ext)
            continue
        # No date in the block itself: look at the real neighbours only.
        # Index -1 must NOT be used for the first block, because it would wrap
        # around to the last block of the document.
        neighbour_texts = []
        if ind > 0:
            neighbour_texts.append(block_texts[ind - 1])
        if ind < last_ind:
            neighbour_texts.append(block_texts[ind + 1])
        for neighbour_text in neighbour_texts:
            date_ext = get_date(neighbour_text)
            if date_ext is not None:
                dates_found.append(date_ext)

    parsed_dates = [parse_dates(d) for d in dates_found]
    return get_best_date(parsed_dates)
                
#for file_name in xml_file_names:
#    get_date_field(file_name)
#    print("\n")

def get_score(act, pred):
    """Score a predicted date string against the actual one.

    The year (first four characters) is worth 0.4, the month (characters 4-5)
    0.3 and everything from character 6 onwards (the day) 0.3.
    """
    weighted_parts = (
        (slice(0, 4), 0.4),
        (slice(4, 6), 0.3),
        (slice(6, None), 0.3),
    )
    total = 0.
    for part, weight in weighted_parts:
        if act[part] == pred[part]:
            total += weight
    return total

# score_sum = 0
# score_count = 0.
# for ind, row in train_df.iterrows():
#     file_name = row["File_Name"]
#     act_date = str(row["date"])
#     pred_date = get_date_field(file_name, data_path="../input/data/")
#     print(act_date, pred_date)
#     score = get_score(act_date, pred_date)
#     score_sum += score
#     score_count += 1

# Predict a date for every document in the submission frame. When nothing was
# extracted the fixed fallback "20050313" is used; when only the year is
# missing it is filled with "2005".
pred_dates = []
for file_name in sub_df["File_Name"]:
    pred_date = get_date_field(file_name, data_path="../input/public_test_data/")
    if pred_date == "00000000":
        pred_date = "20050313"
    elif pred_date.startswith("0000"):
        pred_date = pred_date.replace("0000", "2005")
    print(pred_date)
    pred_dates.append(pred_date)
sub_df["date"] = pred_dates

sub_df.to_csv("baseline_with_dates.csv", index=False)



20_bk
20040607
43_bk
19970723
66_sh
19941221
ch5
20070830
76_sh
20061212
84_bk
19990915
41_bk
20020313
85_sh
20041104
89_sh
20020723
112_bk
20040400
30_bk
20030807
18_bk
20020725
65_bk
19950515
14_bk
20040305
48_sh
20170613
sh_137
20050106
sr33
20050313
44_sh
20131211
39_sh
20130318
9_bk
20040129
10_bk
20040211
4_sh
20050313
45_bk
20050313
rk_15
20051121
26_bk
20050313
63_bk
19950721
91_sh
20071219
36_bk
19950508
55_bk
20070000
al_7
20070613
sh_132
20050313
7_bk
20040119
95_bk
20000307
gsss_4
20050000
79_sh
20060000
2_bk
20021106
74_bk
19970923
111_bk
20050401
33_bk
20080313
42_bk
20030404
98_sh
20000500
gsss_2
20050313
40_bk
19970407
gsss_3
20000100
31_bk
20030804
gsss_1
20010213
77_sh
20061212
35_bk
19990426
48_bk
20050313
56_bk
20030128
sh_149
20001206
11_bk
20031112
GN17
20030429
6_bk
20050313
gs_8
20011018
107_bk
20010201
82_sh
20030131
29_bk
20050900
54_bk
20051021
115_bk
20061231
7_sh
20041119


In [6]:
# print(score_sum / score_count)

### Party A 

In [6]:
import string

def match_names(block_text, party):
    """Return 1 if the party name occurs in the block text, else 0.

    Both strings are lower-cased, punctuation is turned into spaces and runs of
    whitespace are collapsed before a substring test, so "Foo, Inc." matches a
    block containing "FOO INC".

    Args:
        block_text: Text of one document block.
        party: Party name to look for.

    Returns:
        int: 1 on a match, 0 otherwise. Also 0 when either argument is not a
        string (e.g. a NaN label) or the party name is empty after normalisation.
    """
    if not isinstance(party, str) or not isinstance(block_text, str):
        return 0

    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def _normalise(text):
        # Collapse whitespace after replacing punctuation; otherwise "foo, inc"
        # becomes "foo  inc" (two spaces) and never matches "foo inc".
        return " ".join(text.lower().translate(translator).split())

    party_norm = _normalise(party)
    if not party_norm:
        # An empty needle is a substring of everything; treat it as "no match".
        return 0
    return 1 if party_norm in _normalise(block_text) else 0
    
def prepare_party_data(row, party_col="party_A", data_path="../input/data/", train_flag=1):
    """Build one feature row per short block of a document for party detection.

    The file ``data_path + row["File_Name"] + ".xml"`` is parsed and every
    ``<block>`` with fewer than 12 whitespace-separated tokens yields
    ``[file_name, prev_block_text, block_text, next_block_text, party, target]``,
    where ``target`` is 1 when ``match_names`` finds the party name in the block.

    Args:
        row: Mapping/Series with "File_Name" and, when ``train_flag`` is truthy,
            the ``party_col`` entry.
        party_col: Column holding the labelled party name.
        data_path: Folder prefix (including trailing separator) of the XML files.
        train_flag: When falsy, the placeholder "RandomPartyMention" is used as
            the party name instead of reading ``row[party_col]``.

    Returns:
        list[list]: The feature rows, possibly empty.
    """
    file_name = row["File_Name"]
    party = row[party_col] if train_flag else "RandomPartyMention"

    with open(data_path + file_name + ".xml") as fname:
        # Name the parser explicitly. "lxml" is what BeautifulSoup selects on its
        # own when lxml is installed (this notebook imports it), so parsing is
        # unchanged but no longer depends on the environment or emits a warning.
        soup = BeautifulSoup(fname.read(), "lxml")

    block_texts = [block.get_text().strip() for block in soup.find_all("block")]
    last_ind = len(block_texts) - 1

    features = []
    for ind, block_text in enumerate(block_texts):
        if len(block_text.split()) >= 12:
            continue
        # "None" marks a missing neighbour. The first block must not use index
        # -1, which would silently wrap around to the last block of the document.
        prev_block_text = block_texts[ind - 1] if ind > 0 else "None"
        next_block_text = block_texts[ind + 1] if ind < last_ind else "None"
        target = 1 if match_names(block_text, party) else 0
        features.append([file_name, prev_block_text, block_text, next_block_text, party, target])
    return features

def _party_feature_frame(label_df, party_col, party_label, data_path, train_flag=1):
    """Run prepare_party_data over every row of `label_df` and stack the rows in one frame.

    Column names are passed to the constructor so that an empty result still
    produces a well-formed (zero-row) frame instead of failing on a length
    mismatch when the names are assigned afterwards.
    """
    rows = []
    for _, row in label_df.iterrows():
        rows.extend(prepare_party_data(row, party_col=party_col, data_path=data_path, train_flag=train_flag))
    columns = ["file_name", "prev_text", "text", "next_text", party_label, "target"]
    return pd.DataFrame(rows, columns=columns)


# Training blocks, labelled with whether they mention the known Party A.
train_features = _party_feature_frame(train_df, "party_A", "Party A", "../input/data/")
train_features.to_csv("train_party_a.csv", index=False)

# Test blocks; the party is unknown here, so every target is 0.
full_features = _party_feature_frame(sub_df, "party_A", "Party A", "../input/public_test_data/", train_flag=0)
full_features.to_csv("test_party_a.csv", index=False)

In [7]:
full_features["target"].value_counts()

0    17474
Name: target, dtype: int64

In [7]:
# Reload the block-level Party A frames written by the previous cell.
train_party_df = pd.read_csv("train_party_a.csv")
test_party_df = pd.read_csv("test_party_a.csv")
train_y = train_party_df["target"].values
train_group = train_party_df["file_name"].values

full_df = pd.concat([train_party_df, test_party_df], axis=0)

n_components = 15
train_X = []
test_X = []
# For each text column: fit TF-IDF and SVD on train+test together, then
# project the train and test rows separately.
for text_col in ["prev_text", "text", "next_text"]:
    print(f'generating features from: {text_col}')
    tfv = TfidfVectorizer(max_features=None, ngram_range=(1, 3))
    svd_ = TruncatedSVD(n_components=n_components, random_state=1337)

    svd_.fit(tfv.fit_transform(full_df[text_col].astype(str).values))

    for split_df, split_parts in ((train_party_df, train_X), (test_party_df, test_X)):
        reduced = svd_.transform(tfv.transform(split_df[text_col].astype(str).values))
        split_parts.append(pd.DataFrame(reduced).add_prefix('TFIDF_{}_'.format(text_col)))

train_X = pd.concat(train_X, axis=1)
test_X = pd.concat(test_X, axis=1)

generating features from: prev_text
generating features from: text
generating features from: next_text


In [10]:
train_X.shape

(70337, 45)

In [7]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=8, seed=0):
    """Train a binary LightGBM model and predict on one or two feature sets.

    Args:
        train_X, train_y: Training features and binary labels.
        test_X: Features to predict on; also the validation set when `test_y` is given.
        test_y: Optional labels for `test_X`. When provided, training uses early
            stopping on validation AUC (200 rounds patience) and the AUC of the
            predictions is printed and returned as `loss`.
        test_X2: Optional second feature set to predict on.
        dep: Value for the ``max_depth`` parameter.
        seed: Value for the ``bagging_seed`` parameter.

    Returns:
        tuple: ``(model, loss, pred_test_y, pred_test_y2)``. `loss` is the
        validation AUC, or 0 when `test_y` is None. `pred_test_y2` is None when
        `test_X2` is None.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "max_depth": dep,
        "min_data_in_leaf": 1,
        "learning_rate": 0.01,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": seed,
        #"lambda_l2": 0.01,
        "verbosity": -1,
    }
    num_rounds = 2000

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases; newer releases expect callbacks
        # instead. Confirm against the installed version before upgrading.
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=100)
    else:
        # No validation set: train for the full number of rounds. (The former
        # `lgb.DMatrix(test_X)` call was removed: LightGBM has no DMatrix, so
        # this branch always raised AttributeError, and its result was unused.)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # Only predict on the second set when one was actually supplied.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = metrics.roc_auc_score(test_y, pred_test_y)
        print(loss)
    return model, loss, pred_test_y, pred_test_y2

In [10]:
# Grouped 5-fold CV: all blocks of a document stay in the same fold. Collects
# out-of-fold predictions for the training rows and averages the test
# predictions over the folds.
n_splits = 5
model_name = "lgb"
cv_scores = []
pred_test_full = 0
pred_train = np.zeros(len(train_X))
#skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
gkf = model_selection.GroupKFold(n_splits=n_splits)
for dev_index, val_index in gkf.split(train_X, train_y, train_group):
    dev_X = train_X.iloc[dev_index, :]
    val_X = train_X.iloc[val_index, :]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X)

    pred_train[val_index] = pred_v
    pred_test_full = pred_test_full + pred_t / float(n_splits)

    cv_scores.append(loss)
    print(cv_scores)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.996016
[200]	valid_0's auc: 0.996114
[300]	valid_0's auc: 0.995805
Early stopping, best iteration is:
[135]	valid_0's auc: 0.996654
0.9966540060197793
[0.9966540060197793]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.992858
[200]	valid_0's auc: 0.992843
[300]	valid_0's auc: 0.993147
[400]	valid_0's auc: 0.992986
[500]	valid_0's auc: 0.992419
Early stopping, best iteration is:
[312]	valid_0's auc: 0.993196
0.9931956442004882
[0.9966540060197793, 0.9931956442004882]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.997276
[200]	valid_0's auc: 0.99758
[300]	valid_0's auc: 0.997613
[400]	valid_0's auc: 0.997706
[500]	valid_0's auc: 0.997711
[600]	valid_0's auc: 0.99768
Early stopping, best iteration is:
[424]	valid_0's auc: 0.997725
0.9977246331066649
[0.9966540060197793, 0.9931956442004882, 0.9977246331066649]
Training until val

In [36]:
# train_party_df["pred"] = pred_train
# gdf = train_party_df.groupby("file_name")["pred"].max().reset_index()
# gdf = pd.merge(gdf, train_party_df, on=["file_name", "pred"])

In [42]:
# entity_names = []
# for ind, row in gdf.iterrows():
#     txt = row["text"]
#     txt = txt.replace("Party A", "").replace("Party B","").replace("Party","").replace("PARTY","").replace("(","").replace(")","")
#     doc = nlp(txt)
#     #print(row["text"])
#     #print("----")
#     entity = None
#     for ent in doc.ents:
#         if ent.label_ == "ORG":
#             entity = ent.text.strip()
#             break
#     if entity is None:
#         entity = txt.strip()
#     entity_names.append(entity)
    
# gdf["pred_party_a"] = entity_names

In [11]:
# Keep, for every test document, the block with the highest predicted score.
test_party_df["pred"] = pred_test_full
gdf = test_party_df.groupby("file_name")["pred"].max().reset_index()
gdf = pd.merge(gdf, test_party_df, on=["file_name", "pred"])
# Several blocks of one document can tie on the maximum score (e.g. repeated
# identical blocks); keep only the first so there is exactly one row per file.
gdf = gdf.drop_duplicates(subset="file_name", keep="first").reset_index(drop=True)

In [12]:
# Turn the best-scoring block of each document into a clean Party A name:
# drop the "Party A/B" markers and parentheses, prefer the first ORG entity
# found by spaCy, and fall back to the whole cleaned text otherwise.
entity_names = []
for ind, row in gdf.iterrows():
    txt = row["text"]
    txt = txt.replace("Party A", "").replace("Party B","").replace("Party","").replace("PARTY","").replace("(","").replace(")","")
    doc = nlp(txt)
    entity = None
    for ent in doc.ents:
        if ent.label_ == "ORG":
            entity = ent.text
            break
    if entity is None:
        entity = txt
    # Remove straight and BOTH curly double quotes, and strip only afterwards
    # so that removing a trailing quote cannot leave trailing whitespace behind.
    for quote_char in ('“', '”', '"'):
        entity = entity.replace(quote_char, '')
    entity_names.append(entity.strip())

gdf["pred_party_a"] = entity_names
gdf.head()

Unnamed: 0,file_name,pred,prev_text,text,next_text,Party A,target,pred_party_a
0,107_bk,0.900833,SCHEDULE\nto the\n\nMaster Agreement\ndated as...,SOCIETE GENERALE (“Party A”)\n\n(whose Head Of...,Between:,RandomPartyMention,0,SOCIETE GENERALE “
1,10_bk,0.893482,"discharge (i.e., where the relevant Currency O...",SOCIETE GENERALE (Party A),PANDORA HOLDINGS LIMITED (Party,RandomPartyMention,0,SOCIETE GENERALE
2,111_bk,0.845458,dated as of,Societe Generale,KASHI LIMITED,RandomPartyMention,0,Societe Generale
3,112_bk,0.847673,dated as of,Societe Generale,SEB LIMITED,RandomPartyMention,0,Societe Generale
4,115_bk,0.89773,value of that which was (or would have been) r...,Party A\n\nSOCIETE GENERALE,"Party B\nABCXYZ\n\nINTEftNATIONAJJMASTER, L.P.",RandomPartyMention,0,SOCIETE GENERALE


In [13]:
# Every submission row must have exactly one predicted Party A.
assert len(gdf) == len(sub_df)

# Attach the prediction by file name, copy it into the submission column and
# drop the helper columns again.
merged_sub = sub_df.merge(
    gdf[["file_name", "pred_party_a"]],
    left_on="File_Name",
    right_on="file_name",
    how="left",
)
merged_sub["party_A"] = merged_sub["pred_party_a"].values
sub_df = merged_sub.drop(["file_name", "pred_party_a"], axis=1)
sub_df.head()

Unnamed: 0,File_Name,date,party_A,party B,TC_currency,TC_Bespoke,calulation_agent,CA_fallback_default_dispute,CA_dispute_resolution,RD_method_party_A,RD_party_A_short_term_debt_classification,RD_party_A_moody_short_term_trigger_method,RD_party_A_moody_short_term_threshold,RD_party_A_SnP_short_term_trigger_method,RD_party_A_SnP_short_term_threshold,RD_party_A_Fitch_short_term_trigger_method,RD_party_A_Fitch_short_term_threshold,RD_party_A_long_term_debt_classification,RD_party_A_moody_long_term_trigger_method,RD_party_A_moody_long_term_threshold,RD_party_A_SnP_long_term_trigger_method,RD_party_A_SnP_long_term_threshold,RD_party_A_Fitch_long_term_trigger_method,RD_party_A_Fitch_long_term_threshold,Cross_default_threshold_type,Cross_default_percentage/amount,Cross_default_measure,multibranch_party_A,multibranch_party_B,governing law
0,20_bk,20040607,SOCIETE GENERALE “,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
1,43_bk,19970723,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
2,66_sh,19941221,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
3,ch5,20070830,SOCIETE GENERALE,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
4,76_sh,20061212,SOCIETE GENERALE “,Societe Generale,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law


### Party B

In [8]:
party_b_columns = ["file_name", "prev_text", "text", "next_text", "Party B", "target"]

# The next cell reads train_party_b.csv, so build it when it is not already on
# disk; otherwise a fresh "Restart & Run All" fails with a missing file.
# NOTE(review): assumes train_df has a "party B" column like sub_df — confirm.
if not os.path.exists("train_party_b.csv"):
    train_rows = []
    for ind, row in train_df.iterrows():
        train_rows.extend(prepare_party_data(row, party_col="party B", data_path="../input/data/"))
    pd.DataFrame(train_rows, columns=party_b_columns).to_csv("train_party_b.csv", index=False)

# Test blocks; the party is unknown here, so every target is 0.
full_features = []
for ind, row in sub_df.iterrows():
    features = prepare_party_data(row, party_col="party B", data_path="../input/public_test_data/", train_flag=0)
    full_features.extend(features)
# Passing the column names to the constructor also works when no rows were collected.
full_features = pd.DataFrame(full_features, columns=party_b_columns)
full_features.to_csv("test_party_b.csv", index=False)

In [9]:
# Reload the block-level Party B frames from disk.
train_party_df = pd.read_csv("train_party_b.csv")
test_party_df = pd.read_csv("test_party_b.csv")
train_y = train_party_df["target"].values
train_group = train_party_df["file_name"].values

full_df = pd.concat([train_party_df, test_party_df], axis=0)

n_components = 15
train_X = []
test_X = []
# For each text column: fit TF-IDF and SVD on train+test together, then
# project the train and test rows separately.
for text_col in ["prev_text", "text", "next_text"]:
    print(f'generating features from: {text_col}')
    tfv = TfidfVectorizer(max_features=None, ngram_range=(1, 3))
    svd_ = TruncatedSVD(n_components=n_components, random_state=1337)

    svd_.fit(tfv.fit_transform(full_df[text_col].astype(str).values))

    for split_df, split_parts in ((train_party_df, train_X), (test_party_df, test_X)):
        reduced = svd_.transform(tfv.transform(split_df[text_col].astype(str).values))
        split_parts.append(pd.DataFrame(reduced).add_prefix('TFIDF_{}_'.format(text_col)))

train_X = pd.concat(train_X, axis=1)
test_X = pd.concat(test_X, axis=1)

generating features from: prev_text
generating features from: text
generating features from: next_text


In [10]:
# Grouped 5-fold CV for Party B: all blocks of a document stay in the same
# fold. Collects out-of-fold predictions for the training rows and averages
# the test predictions over the folds.
n_splits = 5
model_name = "lgb"
cv_scores = []
pred_test_full = 0
pred_train = np.zeros(len(train_X))
#skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
gkf = model_selection.GroupKFold(n_splits=n_splits)
for dev_index, val_index in gkf.split(train_X, train_y, train_group):
    dev_X = train_X.iloc[dev_index, :]
    val_X = train_X.iloc[val_index, :]
    dev_y = train_y[dev_index]
    val_y = train_y[val_index]

    model, loss, pred_v, pred_t = runLGB(dev_X, dev_y, val_X, val_y, test_X)

    pred_train[val_index] = pred_v
    pred_test_full = pred_test_full + pred_t / float(n_splits)

    cv_scores.append(loss)
    print(cv_scores)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.989766
[200]	valid_0's auc: 0.989384
[300]	valid_0's auc: 0.989033
Early stopping, best iteration is:
[121]	valid_0's auc: 0.989922
0.9899217612339988
[0.9899217612339988]
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.989821
[200]	valid_0's auc: 0.992941
[300]	valid_0's auc: 0.993156
[400]	valid_0's auc: 0.993469
[500]	valid_0's auc: 0.993808
[600]	valid_0's auc: 0.994006
[700]	valid_0's auc: 0.994062
[800]	valid_0's auc: 0.994214
[900]	valid_0's auc: 0.994266
[1000]	valid_0's auc: 0.994312
[1100]	valid_0's auc: 0.9943
[1200]	valid_0's auc: 0.994376
[1300]	valid_0's auc: 0.994404
[1400]	valid_0's auc: 0.994413
[1500]	valid_0's auc: 0.994398
[1600]	valid_0's auc: 0.994487
[1700]	valid_0's auc: 0.994437
[1800]	valid_0's auc: 0.994484
[1900]	valid_0's auc: 0.994488
[2000]	valid_0's auc: 0.994528
Did not meet early stopping. Best iteration is:
[1995]	valid_0's auc: 

In [54]:
# train_party_df["pred"] = pred_train
# gdf = train_party_df.groupby("file_name")["pred"].max().reset_index()
# gdf = pd.merge(gdf, train_party_df, on=["file_name", "pred"])

In [55]:
# entity_names = []
# for ind, row in gdf.iterrows():
#     txt = row["text"]
#     txt = txt.replace("Party A", "").replace("Party B","").replace("Party","").replace("PARTY","").replace("(","").replace(")","")
#     doc = nlp(txt)
#     #print(row["text"])
#     #print("----")
#     entity = None
#     for ent in doc.ents:
#         if ent.label_ == "ORG":
#             entity = ent.text.strip()
#             break
#     if entity is None:
#         entity = txt.strip()
#     entity_names.append(entity)
    
# gdf["pred_party_b"] = entity_names


In [11]:
# Keep, for every test document, the block with the highest predicted score.
test_party_df["pred"] = pred_test_full
gdf = test_party_df.groupby("file_name")["pred"].max().reset_index()
gdf = pd.merge(gdf, test_party_df, on=["file_name", "pred"])
# Several blocks of one document can tie on the maximum score (e.g. repeated
# identical blocks); keep only the first so there is exactly one row per file.
gdf = gdf.drop_duplicates(subset="file_name", keep="first").reset_index(drop=True)

In [12]:
# Turn the best-scoring block of each document into a clean Party B name:
# drop the "Party A/B" markers and parentheses, prefer the first ORG entity
# found by spaCy, and fall back to the whole cleaned text otherwise.
entity_names = []
for ind, row in gdf.iterrows():
    txt = row["text"]
    txt = txt.replace("Party A", "").replace("Party B","").replace("Party","").replace("PARTY","").replace("(","").replace(")","")
    doc = nlp(txt)
    entity = None
    for ent in doc.ents:
        if ent.label_ == "ORG":
            entity = ent.text
            break
    if entity is None:
        entity = txt
    # Remove straight and BOTH curly double quotes, and strip only afterwards
    # so that removing a trailing quote cannot leave trailing whitespace behind.
    for quote_char in ('“', '”', '"'):
        entity = entity.replace(quote_char, '')
    entity_names.append(entity.strip())

gdf["pred_party_b"] = entity_names

In [13]:
# Every submission row must have exactly one predicted Party B.
assert len(gdf) == len(sub_df)

# Attach the prediction by file name, copy it into the submission column and
# drop the helper columns again.
merged_sub = sub_df.merge(
    gdf[["file_name", "pred_party_b"]],
    left_on="File_Name",
    right_on="file_name",
    how="left",
)
merged_sub["party B"] = merged_sub["pred_party_b"].values
sub_df = merged_sub.drop(["file_name", "pred_party_b"], axis=1)
sub_df.head(20)

Unnamed: 0,File_Name,date,party_A,party B,TC_currency,TC_Bespoke,calulation_agent,CA_fallback_default_dispute,CA_dispute_resolution,RD_method_party_A,RD_party_A_short_term_debt_classification,RD_party_A_moody_short_term_trigger_method,RD_party_A_moody_short_term_threshold,RD_party_A_SnP_short_term_trigger_method,RD_party_A_SnP_short_term_threshold,RD_party_A_Fitch_short_term_trigger_method,RD_party_A_Fitch_short_term_threshold,RD_party_A_long_term_debt_classification,RD_party_A_moody_long_term_trigger_method,RD_party_A_moody_long_term_threshold,RD_party_A_SnP_long_term_trigger_method,RD_party_A_SnP_long_term_threshold,RD_party_A_Fitch_long_term_trigger_method,RD_party_A_Fitch_long_term_threshold,Cross_default_threshold_type,Cross_default_percentage/amount,Cross_default_measure,multibranch_party_A,multibranch_party_B,governing law
0,20_bk,20040607,SOCIETE GENERALE,BARRY ALLEN PUMPS,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
1,43_bk,19970723,SOCIETE GENERALE,BANK OF VADARA,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
2,66_sh,19941221,SOCIETE GENERALE,Bank of ABC,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
3,ch5,20070830,SOCIETE GENERALE,ASSET,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
4,76_sh,20061212,SOCIETE GENERALE,EMPTY FUND BANK LTD,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
5,84_bk,19990915,SOCIETE GENERALE,VEHEMENT CAPITAL,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
6,41_bk,20020313,SOCIETE GENERALE,MMMMM PRIVATE LTD,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
7,85_sh,20041104,SOCIETE GENERALE,Airports Authority of India,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
8,89_sh,20020723,SOCIETE GENERALE,A.C.N.,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law
9,112_bk,20040400,SOCIETE GENERALE,SEB LIMITED,USD,To be selected by the Non-defaulting party or ...,Party A,NotFound,No,1,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,NotFound,below,NotFound,below,NotFound,NotFound,NotFound,Fixed Amount,10000000,NotFound,worldwide,No,English Law


In [14]:
# Write the submission frame, now holding the predicted date and party columns, without the index.
sub_df.to_csv("baseline_with_dateB.csv", index=False)

In [1]:
import editdistance