In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv("data/train.csv")
dataset.drop_duplicates(inplace=True)
dataset.shape

(79383, 4)

In [3]:
dataset.head()

Unnamed: 0,company1,company2,is_parent,snippet
0,Sprint_Corporation,Verizon_Communications,False,1 wireless carrier Verizon_Communications (NY...
1,Sprint_Corporation,Verizon_Communications,False,"While AT&T, Sprint_Corporation, and T-Mobile ..."
2,Sprint_Corporation,Verizon_Communications,False,"\nAT&T, Sprint_Corporation, and Verizon_Commun..."
3,Alexa_Internet,Amazon.com,False,Logitech addsAmazon.comn'sAlexa_Interneta skil...
4,Alexa_Internet,Amazon.com,False,\nLogitech has announced a new version of the ...


In [4]:
# using this strategy to fix the problem (stated in the paper) for pairs order

def preprocess(dataset):
    aliased_snippet = []
    companies = dataset["company1"].append(dataset["company2"]).value_counts().keys()
    for i in range(dataset.shape[0]):
        current_row = dataset.iloc[i]
         # I am adding more spaces cuz in some samples the words and concatanated
        for company in companies:
            current_row["snippet"].replace(company, ' ' + company +' ')

        aliased_snippet.append(current_row["snippet"]
                               .replace(current_row["company1"],' company1 ')
                               .replace(current_row["company2"],' company2 '))
    dataset['aliased_snippet'] = aliased_snippet

    dataset['aliased_snippet'] = dataset['aliased_snippet'].str.lower()
    print("Companies shape",companies.shape)
    return dataset

In [5]:
dataset = preprocess(dataset)
dataset.shape

Companies shape (451,)


(79383, 5)

In [6]:
# I will split the train data to train,dev,test in ratio 70/20/10
from sklearn.model_selection import train_test_split
train, other = train_test_split(dataset, stratify=dataset["is_parent"],test_size=0.3,random_state=26)
train.shape, other.shape

((55568, 5), (23815, 5))

In [7]:
train["is_parent"].value_counts()

False    39038
True     16530
Name: is_parent, dtype: int64

In [8]:
other["is_parent"].value_counts()

False    16730
True      7085
Name: is_parent, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
dev,test = train_test_split(other, stratify=other["is_parent"], test_size=(1/3), random_state=26)
dev.shape, test.shape

((15876, 5), (7939, 5))

Lets check whether we splitted it correctly

In [10]:
def in_percent(ratio):
    return ratio*100

print(in_percent(train.shape[0]/dataset.shape[0]))
print(in_percent(dev.shape[0]/dataset.shape[0]))
print(in_percent(test.shape[0]/dataset.shape[0]))

69.99987402844438
19.999244170666262
10.00088180088936


In [11]:
%mkdir split

mkdir: cannot create directory ‘split’: File exists


In [12]:
train.to_csv("split/train.csv")
dev.to_csv("split/dev.csv")
test.to_csv("split/test.csv")

In [13]:
train["is_parent"].value_counts()

False    39038
True     16530
Name: is_parent, dtype: int64

In [14]:
dev["is_parent"].value_counts()

False    11153
True      4723
Name: is_parent, dtype: int64

In [15]:
test["is_parent"].value_counts()

False    5577
True     2362
Name: is_parent, dtype: int64

### Now lets preprocess the unlabeled test set in order to use it as corpus for more words and prepare it for input in the models

In [16]:
onto_test = pd.read_csv("./test/test.csv")
onto_test.drop_duplicates(inplace=True)
onto_test.shape

FileNotFoundError: File b'./test/test.csv' does not exist

In [None]:
onto_test.head()

In [None]:
onto_test = preprocess(onto_test)
onto_test.head()

In [None]:
%mkdir processed

In [None]:
onto_test.to_csv("processed/test.csv", index_label=False)