In [1]:
import pandas as pd

In [2]:
# Load the training pairs and discard rows that are exact duplicates.
dataset = pd.read_csv("data/train.csv").drop_duplicates()
dataset.shape

(79383, 4)

In [3]:
# Peek at the first rows: a company pair, its is_parent label and the text snippet.
dataset.head()

Unnamed: 0,company1,company2,is_parent,snippet
0,Sprint_Corporation,Verizon_Communications,False,1 wireless carrier Verizon_Communications (NY...
1,Sprint_Corporation,Verizon_Communications,False,"While AT&T, Sprint_Corporation, and T-Mobile ..."
2,Sprint_Corporation,Verizon_Communications,False,"\nAT&T, Sprint_Corporation, and Verizon_Commun..."
3,Alexa_Internet,Amazon.com,False,Logitech addsAmazon.comn'sAlexa_Interneta skil...
4,Alexa_Internet,Amazon.com,False,\nLogitech has announced a new version of the ...


In [4]:
# Replace the pair's names with the placeholders `company1` / `company2` to address the pair-ordering problem described in the paper.

def preprocess(dataset):
    """Add a lower-cased ``aliased_snippet`` column with the pair's names masked.

    Each snippet is rewritten in three steps:

    1. Every company name found in ``company1`` or ``company2`` (across the
       whole frame) is padded with a space on both sides, because in some
       samples the names are concatenated with the neighbouring words.
    2. The row's own ``company1`` / ``company2`` values are replaced by the
       placeholders `` company1 `` / `` company2 ``.
    3. Non-breaking spaces and newlines become plain spaces, and the text is
       lower-cased.

    Matching is plain substring replacement, so a name that is contained in
    another name is also padded/replaced inside the longer one.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain string columns ``company1``, ``company2`` and ``snippet``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the ``aliased_snippet`` column is
        added to it in place.
    """
    # Series.append was removed in pandas 2.0; pd.concat stacks the two name
    # columns the same way and works on old and new pandas alike.
    companies = pd.concat([dataset["company1"], dataset["company2"]]).value_counts().keys()

    aliased_snippet = []
    # Walk the three columns directly instead of doing an iloc lookup per row.
    for company1, company2, snippet in zip(dataset["company1"], dataset["company2"], dataset["snippet"]):
        # Pad every known company name so glued-together words get separated.
        for company in companies:
            snippet = snippet.replace(company, ' ' + company + ' ')
        preprocessed = (
            snippet.replace(company1, " company1 ")
            .replace(company2, " company2 ")
            .replace("\xa0", " ")
            .replace("\n", " ")
        )
        aliased_snippet.append(preprocessed)

    dataset['aliased_snippet'] = aliased_snippet
    dataset['aliased_snippet'] = dataset['aliased_snippet'].str.lower()
    print("Companies shape", companies.shape)
    return dataset

In [5]:
# Add the aliased_snippet column; the column count should grow by exactly one.
dataset = preprocess(dataset)
dataset.shape

Companies shape (451,)


(79383, 5)

In [6]:
# The data is split into train/dev/test at a 70/20/10 ratio.
# First step: hold out 30% of the rows, stratified on the label.
from sklearn.model_selection import train_test_split

train, other = train_test_split(
    dataset,
    test_size=0.3,
    stratify=dataset["is_parent"],
    random_state=26,
)
train.shape, other.shape

((55568, 5), (23815, 5))

In [7]:
# Class balance of the training split.
train["is_parent"].value_counts()

False    39038
True     16530
Name: is_parent, dtype: int64

In [8]:
# Class balance of the 30% hold-out that is later divided into dev and test.
other["is_parent"].value_counts()

False    16730
True      7085
Name: is_parent, dtype: int64

In [9]:
# Divide the hold-out 2:1 into dev and test, again stratified on the label.
from sklearn.model_selection import train_test_split

dev, test = train_test_split(
    other,
    test_size=(1/3),
    stratify=other["is_parent"],
    random_state=26,
)
dev.shape, test.shape

((15876, 5), (7939, 5))

Let's check whether we split it correctly.

In [10]:
def in_percent(ratio):
    """Return ``ratio`` scaled by 100, i.e. a fraction expressed as a percentage."""
    percentage = ratio * 100
    return percentage

# Share of the full dataset held by each split, in percent (train, dev, test).
for part in (train, dev, test):
    print(in_percent(part.shape[0]/dataset.shape[0]))

69.99987402844438
19.999244170666262
10.00088180088936


In [11]:
%mkdir split

mkdir: cannot create directory ‘split’: File exists


In [12]:
# Persist each split to its own CSV file under split/.
for frame, path in (
    (train, "split/train.csv"),
    (dev, "split/dev.csv"),
    (test, "split/test.csv"),
):
    frame.to_csv(path)

In [13]:
# Label counts in the saved train split.
train["is_parent"].value_counts()

False    39038
True     16530
Name: is_parent, dtype: int64

In [14]:
# Label counts in the saved dev split.
dev["is_parent"].value_counts()

False    11153
True      4723
Name: is_parent, dtype: int64

In [15]:
# Label counts in the saved test split.
test["is_parent"].value_counts()

False    5577
True     2362
Name: is_parent, dtype: int64

### Now let's preprocess the unlabeled test set so that it can be used as a corpus of additional words and prepared as input for the models

In [21]:
# Load the test file and discard rows that are exact duplicates.
onto_test = pd.read_csv("data/test-labeled.csv").drop_duplicates()
onto_test.shape

(18002, 7)

In [22]:
# Peek at the first rows; the column names differ from the training file.
onto_test.head()

Unnamed: 0,entity1ID,entity2ID,label1,label2,relation,snippet,relation.1
0,497,494,Ford_Motor_Company,Holden,,95s to top the sheets ahead of Kiwi Fabian Cou...,False
2,188,244,Apple_Inc.,HBO,,\nGamers who want to access HBO Now on the Xbo...,False
3,188,244,Apple_Inc.,HBO,,\nHBO first launched its standalone subscripti...,False
5,456,314,Google,Verizon_Communications,,\nGoogle's business immediately took a hit : p...,False
6,522,178,Twitter,Beats_Electronics,,"His endorsement list ain't bad either, includ...",False


In [23]:
# Copy the test-file columns under the names used by the training data
# (company1, company2, is_parent) so the same preprocessing can be applied.
for target, source in (
    ("company1", "label1"),
    ("company2", "label2"),
    ("is_parent", "relation.1"),
):
    onto_test[target] = onto_test[source]

In [28]:
# Sanity check: an empty result means `relation` holds no non-null values
# (value_counts skips NaN by default), which is why `relation.1` is used as the label.
onto_test["relation"].value_counts()

Series([], Name: relation, dtype: int64)

In [25]:
# Apply the same preprocessing as for the training data; adds `aliased_snippet`.
onto_test = preprocess(onto_test)
onto_test.head()

Companies shape (279,)


Unnamed: 0,entity1ID,entity2ID,label1,label2,relation,snippet,relation.1,company1,company2,is_parent,aliased_snippet
0,497,494,Ford_Motor_Company,Holden,,95s to top the sheets ahead of Kiwi Fabian Cou...,False,Ford_Motor_Company,Holden,False,95s to top the sheets ahead of kiwi fabian cou...
2,188,244,Apple_Inc.,HBO,,\nGamers who want to access HBO Now on the Xbo...,False,Apple_Inc.,HBO,False,gamers who want to access company2 now on...
3,188,244,Apple_Inc.,HBO,,\nHBO first launched its standalone subscripti...,False,Apple_Inc.,HBO,False,company2 first launched its standalone su...
5,456,314,Google,Verizon_Communications,,\nGoogle's business immediately took a hit : p...,False,Google,Verizon_Communications,False,company1 's business immediately took a hi...
6,522,178,Twitter,Beats_Electronics,,"His endorsement list ain't bad either, includ...",False,Twitter,Beats_Electronics,False,"his endorsement list ain't bad either, includ..."


In [19]:
%mkdir processed

mkdir: cannot create directory ‘processed’: File exists


In [26]:
# Persist the preprocessed test set. Per the pandas docs, index_label=False leaves out
# the header field for the index column; the index values themselves are still written.
onto_test.to_csv("processed/test.csv", index_label=False)