In [1]:
import random
import os
from collections import Counter, defaultdict
import sys

import pandas as pd
import numpy as np

In [2]:
%cd /Users/johannesbreit/Library/CloudStorage/OneDrive-UniversityCollegeLondon/COMP0087/project/nlpproject

/Users/johannesbreit/Library/CloudStorage/OneDrive-UniversityCollegeLondon/COMP0087/project/nlpproject


In [3]:
corpus_in = pd.read_csv('../nlpproject/nlp/data/example_inputs_pos_and_neg.tsv.gz', compression='gzip', header=0, sep='\t')

#Remove any \n from the rows 
corpus_in.replace("\n","",inplace=True)
corpus_in.replace('(\n)','',regex=True,inplace=True)

#Create sentences
df_data_sentences = corpus_in[['left', 'entity1', "middle",'entity2','right']].astype(str).agg(' '.join, axis=1)
corpus_in['sentences'] = df_data_sentences.values

corpus_in.head()

Unnamed: 0,entity1,entity2,left,middle,right,article,label,sentences
0,Apple Inc,Samsung Electronics Co Ltd,These reviews have been especially popular wit...,(AAPL.O) and,(005930.KS) frequently hit with patent infrin...,50b8f1a67bb7c2cab2fe56b55a24f59ad1703c07b7053e...,0,These reviews have been especially popular wit...
1,Bharti Airtel Ltd,Reliance Industries Ltd,* Q4 profit 829 mln rupees vs. 3.73 bln rupees...,posted its lowest quarterly profit in nearly ...,"Jio, the telecoms arm of Reliance, into India...",603eeb451da67a70850a823e4eb4e1221a5358040bde3b...,0,* Q4 profit 829 mln rupees vs. 3.73 bln rupees...
2,Reliance Industries Ltd,Bharti Airtel Ltd,"The entry of Reliance Jio, the telecoms arm of",", into India's telecoms sector has set off a b...",'s managing director and chief executive for I...,603eeb451da67a70850a823e4eb4e1221a5358040bde3b...,0,"The entry of Reliance Jio, the telecoms arm of..."
3,Bharti Airtel Ltd,Reliance Industries Ltd,The U.S. Supreme Court on Tuesday ruled that f...,dan-based Arab Bank Plc helped finance militan...,"24, 2016.",26cb6ec3607fdb028b712d260131058e9b01b9aadc26cf...,1,The U.S. Supreme Court on Tuesday ruled that f...
4,Reliance Industries Ltd,Bharti Airtel Ltd,REUTERS/Aziz Ta,PhotoThe 5-4 decision brought to an end a law...,minor connection between the terrorist attacks...,26cb6ec3607fdb028b712d260131058e9b01b9aadc26cf...,1,REUTERS/Aziz Ta Reliance Industries Ltd Photo...


In [4]:
kb_df = pd.read_csv('../nlpproject/nlp/data/example_kb_all_supplier.tsv.gz', compression='gzip', 
                                 names= ['relation', 'subject', 'object'], sep='\t', quotechar='"')

kb_df.head(2)                                

Unnamed: 0,relation,subject,object
0,supplier of,Apple Inc,Seoul Semiconductor Co Ltd
1,supplier of,Koizumi furnitech Thailand Co Ltd,Seoul Semiconductor Co Ltd


In [5]:
def corpus_density_based_split(corpus, kb, split_ratios = np.array([0.6,0.2,0.2]), verbose = True):
    corp = corpus.copy()
    _kb = kb.copy()

    # Create an alphabetically sorted (E1, E2) tuple column of the two entities mentioned in each row
    corp['relation_tuple'] = [tuple(sorted([E1, E2])) for E1, E2 in zip(corp.entity1, corp.entity2)]       
    
    # Create a df which contains the number of entries per unique E1,E2 pair in corpus and sort in descending order
    corpus_counts = corp.groupby(['relation_tuple']).agg('size').reset_index().sort_values(by = 0, ascending=False).rename(columns = {0: 'sentence_counts'})

    _counts_per_split = corpus_counts.sentence_counts.sum() * split_ratios
    _counts_cumsum = _counts_per_split.cumsum()
    print('Aim to have approx {} entries per train/dev/test split, respectively.\n' .format(_counts_per_split))

    # Shuffle this to get rid of numerical order 
    shuffled_corpus_counts = corpus_counts.sample(frac=1, random_state = 42) # Perhaps we can also find deterministic "shuffle"

    # Now add column which gives cumulative counts of sentences down Pairs
    shuffled_corpus_counts['sentence_cumsum'] = np.cumsum(shuffled_corpus_counts.sentence_counts)

    def assign_splits(corpus_counts, counts_cumsum):
        splits = []
        for scs in corpus_counts.sentence_cumsum:
            if scs <= counts_cumsum[0]: splits.append(0)
            elif scs > counts_cumsum[0] and scs <= counts_cumsum[1]: splits.append(1)
            else: splits.append(2)
        return splits

    shuffled_corpus_counts['split'] = assign_splits(shuffled_corpus_counts, _counts_cumsum)

    corp = corp.merge(shuffled_corpus_counts, how = 'left', on = 'relation_tuple')

    # Now merge corpus-based created splits onto KB 
    # MANY KB ENTRIES WILL NOT HAVE MATCHES 
    _kb['relation_tuple'] = [tuple(sorted([E1, E2])) for E1, E2 in zip(_kb.subject.astype(str), _kb.object.astype(str))]

    _kb = _kb.merge(corp[['relation_tuple', 'split']], how = 'left', on = 'relation_tuple')

    if verbose:
        corp_vals = corp.split.value_counts(normalize = False)
        corp_ratios = round(corp.split.value_counts(normalize = True),3)
        corp_balance = round(corp.groupby('split').agg('mean')['label'],2)

        kb_vals = _kb.fillna(99).split.value_counts(normalize = False)
        kb_ratios = round(_kb.fillna(99).split.value_counts(normalize = True),3)

        print('Corpus split:')
        print('Train: {} ({}%) \nDev: {} ({}%)\nTest: {} ({}%) \n' .format(corp_vals[0], corp_ratios[0], corp_vals[1], corp_ratios[1], corp_vals[2], corp_ratios[2]))

        print('Class Balance (positive class):')
        print('Train: {}%\nDev: {}%\nTest: {}% \n' .format(corp_balance[0], corp_balance[1], corp_balance[2]) )

        print('KB split:')
        print('Train: {} ({}%)\nDev: {} ({}%)\nTest: {} ({}%)' .format(kb_vals[0], kb_ratios[0], kb_vals[1], kb_ratios[1], kb_vals[2], kb_ratios[2]))
        print('Total of {} (E1,E2) ordered pairs in KB not assigned. \n' .format(kb_vals[99]))
    
    return corp, _kb

In [6]:
corp_out, kb_out = corpus_density_based_split(corpus_in, kb_df)

Aim to have approx [51778.8 17259.6 17259.6] entries per train/dev/test split, respectively.

Corpus split:
Train: 51555 (0.597%) 
Dev: 17339 (0.201%)
Test: 17404 (0.202%) 

Class Balance (positive class):
Train: 0.57%
Dev: 0.56%
Test: 0.65% 

KB split:
Train: 38941 (0.464%)
Dev: 11510 (0.137%)
Test: 16994 (0.203%)
Total of 16455 (E1,E2) ordered pairs in KB not assigned. 



In [7]:
corp_out

Unnamed: 0,entity1,entity2,left,middle,right,article,label,sentences,relation_tuple,sentence_counts,sentence_cumsum,split
0,Apple Inc,Samsung Electronics Co Ltd,These reviews have been especially popular wit...,(AAPL.O) and,(005930.KS) frequently hit with patent infrin...,50b8f1a67bb7c2cab2fe56b55a24f59ad1703c07b7053e...,0,These reviews have been especially popular wit...,"(Apple Inc, Samsung Electronics Co Ltd)",2962,39367,0
1,Bharti Airtel Ltd,Reliance Industries Ltd,* Q4 profit 829 mln rupees vs. 3.73 bln rupees...,posted its lowest quarterly profit in nearly ...,"Jio, the telecoms arm of Reliance, into India...",603eeb451da67a70850a823e4eb4e1221a5358040bde3b...,0,* Q4 profit 829 mln rupees vs. 3.73 bln rupees...,"(Bharti Airtel Ltd, Reliance Industries Ltd)",36,21306,0
2,Reliance Industries Ltd,Bharti Airtel Ltd,"The entry of Reliance Jio, the telecoms arm of",", into India's telecoms sector has set off a b...",'s managing director and chief executive for I...,603eeb451da67a70850a823e4eb4e1221a5358040bde3b...,0,"The entry of Reliance Jio, the telecoms arm of...","(Bharti Airtel Ltd, Reliance Industries Ltd)",36,21306,0
3,Bharti Airtel Ltd,Reliance Industries Ltd,The U.S. Supreme Court on Tuesday ruled that f...,dan-based Arab Bank Plc helped finance militan...,"24, 2016.",26cb6ec3607fdb028b712d260131058e9b01b9aadc26cf...,1,The U.S. Supreme Court on Tuesday ruled that f...,"(Bharti Airtel Ltd, Reliance Industries Ltd)",36,21306,0
4,Reliance Industries Ltd,Bharti Airtel Ltd,REUTERS/Aziz Ta,PhotoThe 5-4 decision brought to an end a law...,minor connection between the terrorist attacks...,26cb6ec3607fdb028b712d260131058e9b01b9aadc26cf...,1,REUTERS/Aziz Ta Reliance Industries Ltd Photo...,"(Bharti Airtel Ltd, Reliance Industries Ltd)",36,21306,0
...,...,...,...,...,...,...,...,...,...,...,...,...
86293,Microsoft Corp,Amazon.com Inc,The showing is a chance for Google and,", which have been bit players in a VOD market ...","(AMZN.O) and cable and satellite operators, t...",5ebe8ed41ddfb003b44e24135a68a3583f4871a7f9f2fb...,0,The showing is a chance for Google and Micros...,"(Amazon.com Inc, Microsoft Corp)",1093,35525,0
86294,Microsoft Corp,Apple Inc,The showing is a chance for Google and,", which have been bit players in a VOD market ...","(AAPL.O), Amazon (AMZN.O) and cable and satel...",5ebe8ed41ddfb003b44e24135a68a3583f4871a7f9f2fb...,1,The showing is a chance for Google and Micros...,"(Apple Inc, Microsoft Corp)",2052,66753,1
86295,Snap Inc,Twitter Inc,By Tim Baysinger Feb 8 Snap's,chat lags far behind rival social media outlet...,"in reaching older users, but the soon-to-be p...",ec4fad75485dc2fea9290bc86fc9e4c22e48901b547ad7...,0,By Tim Baysinger Feb 8 Snap's Snap Inc chat l...,"(Snap Inc, Twitter Inc)",83,4813,0
86296,Twitter Inc,Snap Inc,By Tim Baysinger Feb 8 Snap's Snapchat lags fa...,"in reaching older users, but the soon-to-be p...",prepares for its planned stock market debut i...,ec4fad75485dc2fea9290bc86fc9e4c22e48901b547ad7...,0,By Tim Baysinger Feb 8 Snap's Snapchat lags fa...,"(Snap Inc, Twitter Inc)",83,4813,0


In [8]:
kb_out.split.value_counts(dropna =False)

0.0    38941
2.0    16994
NaN    16455
1.0    11510
Name: split, dtype: int64

In [9]:
kb_out

Unnamed: 0,relation,subject,object,relation_tuple,split
0,supplier of,Apple Inc,Seoul Semiconductor Co Ltd,"(Apple Inc, Seoul Semiconductor Co Ltd)",
1,supplier of,Koizumi furnitech Thailand Co Ltd,Seoul Semiconductor Co Ltd,"(Koizumi furnitech Thailand Co Ltd, Seoul Semi...",
2,supplier of,Amazon.com Inc,DS Smith PLC,"(Amazon.com Inc, DS Smith PLC)",
3,supplier of,Procter & Gamble Co,DS Smith PLC,"(DS Smith PLC, Procter & Gamble Co)",
4,supplier of,Zymeworks Inc,DS Smith PLC,"(DS Smith PLC, Zymeworks Inc)",
...,...,...,...,...,...
83895,supplier of,Cisco Systems Inc (Pre-Merger),Synetrix (Holdings) Ltd,"(Cisco Systems Inc (Pre-Merger), Synetrix (Hol...",
83896,supplier of,Cisco Systems Inc (Pre-Merger),Servcorp Ltd,"(Cisco Systems Inc (Pre-Merger), Servcorp Ltd)",
83897,supplier of,Cisco Systems Inc (Pre-Merger),SolarWinds Corp,"(Cisco Systems Inc (Pre-Merger), SolarWinds Corp)",
83898,supplier of,Cisco Systems Inc (Pre-Merger),Unified Communications Ltd,"(Cisco Systems Inc (Pre-Merger), Unified Commu...",


In [12]:
kb_out[kb_out.subject=='Apple Inc']

Unnamed: 0,relation,subject,object,relation_tuple,split
0,supplier of,Apple Inc,Seoul Semiconductor Co Ltd,"(Apple Inc, Seoul Semiconductor Co Ltd)",
158,supplier of,Apple Inc,TDK Corp,"(Apple Inc, TDK Corp)",2.0
159,supplier of,Apple Inc,TDK Corp,"(Apple Inc, TDK Corp)",2.0
160,supplier of,Apple Inc,TDK Corp,"(Apple Inc, TDK Corp)",2.0
161,supplier of,Apple Inc,TDK Corp,"(Apple Inc, TDK Corp)",2.0
...,...,...,...,...,...
78457,supplier of,Apple Inc,Samsung Group,"(Apple Inc, Samsung Group)",0.0
78458,supplier of,Apple Inc,Global Unichip Corp,"(Apple Inc, Global Unichip Corp)",
78459,supplier of,Apple Inc,Electronic Arts Inc,"(Apple Inc, Electronic Arts Inc)",0.0
78460,supplier of,Apple Inc,Pelephone Communications Ltd,"(Apple Inc, Pelephone Communications Ltd)",
