In [1]:
import pandas as pd
import re
import multiprocessing

In [2]:
# Load the CPA ground-truth annotations: one row per annotated column
# (table_name, main_column_index, column_index, label).
train_gt_csv = 'SOTAB CPA/sotab_v2_cpa_training_set.csv'
test_gt_csv = 'SOTAB CPA/sample_test.csv'

cpa_train_gt = pd.read_csv(train_gt_csv)
cpa_test_gt = pd.read_csv(test_gt_csv)

In [3]:
len(cpa_test_gt)

509

In [5]:
cpa_train_gt

Unnamed: 0,table_name,main_column_index,column_index,label
0,Book_11x17.pt_September2020_CPA.json.gz,0,3,datePublished
1,Book_11x17.pt_September2020_CPA.json.gz,0,1,isbn
2,Book_11x17.pt_September2020_CPA.json.gz,0,2,numberOfPages
3,Book_1jour-1jeu.com_September2020_CPA.json.gz,0,9,worstRating
4,Book_1jour-1jeu.com_September2020_CPA.json.gz,0,5,datePublished
...,...,...,...,...
109989,TVEpisode_yifytvseries.com_September2020_CPA.j...,0,3,datePublished
109990,TVEpisode_zazangels.com_September2020_CPA.json.gz,0,5,duration
109991,TVEpisode_zazangels.com_September2020_CPA.json.gz,0,1,description
109992,TVEpisode_zazangels.com_September2020_CPA.json.gz,0,2,url


In [4]:
cpa_test_gt

Unnamed: 0,table_name,main_column_index,column_index,label
0,Book_antipodean.com_September2020_CPA.json.gz,0,4,publisher
1,Event_bfodurham.net_September2020_CPA.json.gz,0,3,eventStatus
2,Event_bfodurham.net_September2020_CPA.json.gz,0,4,eventAttendanceMode
3,Event_healthychelsea.org_September2020_CPA.jso...,0,4,organizer
4,Event_healthychelsea.org_September2020_CPA.jso...,0,5,telephone
...,...,...,...,...
504,Product_cit.li_September2020_CPA.json.gz,0,2,weight
505,Product_cit.li_September2020_CPA.json.gz,0,4,productID
506,Product_coininvest.com_September2020_CPA.json.gz,0,4,manufacturer
507,Recipe_bakedbyanintrovert.com_September2020_CP...,0,11,recipeInstructions


In [6]:
# The class of a table is the part of its file name before the first underscore
# (e.g. "Book_antipodean.com_..." -> "Book").
cpa_test_gt["class"] = [
    table_name.split("_")[0] for table_name in cpa_test_gt["table_name"]
]

In [6]:
cpa_test_gt

Unnamed: 0,table_name,main_column_index,column_index,label,class
0,Book_antipodean.com_September2020_CPA.json.gz,0,4,publisher,Book
1,Event_bfodurham.net_September2020_CPA.json.gz,0,3,eventStatus,Event
2,Event_bfodurham.net_September2020_CPA.json.gz,0,4,eventAttendanceMode,Event
3,Event_healthychelsea.org_September2020_CPA.jso...,0,4,organizer,Event
4,Event_healthychelsea.org_September2020_CPA.jso...,0,5,telephone,Event
...,...,...,...,...,...
504,Product_cit.li_September2020_CPA.json.gz,0,2,weight,Product
505,Product_cit.li_September2020_CPA.json.gz,0,4,productID,Product
506,Product_coininvest.com_September2020_CPA.json.gz,0,4,manufacturer,Product
507,Recipe_bakedbyanintrovert.com_September2020_CP...,0,11,recipeInstructions,Recipe


In [7]:
# Nested lookup of the annotations: gt[split][table_name][column_index] -> label.
gt = {'train': {}, 'test': {}}
for split, annotations in (('train', cpa_train_gt), ('test', cpa_test_gt)):
    for index, row in annotations.iterrows():
        # setdefault creates the per-table dict the first time a table is seen.
        gt[split].setdefault(row["table_name"], {})[row["column_index"]] = row["label"]

In [8]:
#Simple Preprocessing

def clean_text(text):
    """Turn a cell value into a single whitespace-normalised string.

    Dicts are flattened by cleaning and space-joining their values, lists by
    cleaning and space-joining their items (both recursively). A null value
    (as judged by ``pd.isnull``) yields the empty string. Anything else is
    converted with ``str``; runs of spaces are collapsed to one space and
    leading/trailing whitespace is stripped.
    """
    if isinstance(text, (dict, list)):
        # Containers are flattened first; the joined result is then normalised below.
        parts = text.values() if isinstance(text, dict) else text
        text = ' '.join(clean_text(part) for part in parts)

    # Collapse runs of spaces only when there is an actual value to clean.
    return '' if pd.isnull(text) else re.sub(' +', ' ', str(text)).strip()

In [11]:
# Prepare the input examples for LM models: [table_name, column_index, serialized_text, label, class]


#COLUMN PROPERTY MODIFIED

def get_all_table_columns(file_name, index, max_values=5, max_words=20, main_rows=5):
    """Serialize the main column and one target column of a table as text.

    The table is read from 'SOTAB CPA/Train/' when ``file_name`` is a key of
    ``gt['train']`` and from 'SOTAB CPA/Test/' otherwise.

    Parameters
    ----------
    file_name : str
        Name of the gzip-compressed JSON-lines table file.
    index : int
        Positional index of the target column (used with ``df.iloc``).
    max_values : int, default 5
        Maximum number of non-empty target-column values to include.
    max_words : int, default 20
        Maximum number of whitespace-separated words kept per cell.
    main_rows : int, default 5
        Number of leading rows of column ``0`` used for the main column.

    Returns
    -------
    str
        "Column 1: <main values> \\n Column 2: <target values>", with all
        pieces joined by single spaces.
    """
    # gt['train'] is keyed by every table_name of the training annotations, so this
    # is the same membership test as scanning the DataFrame column, without
    # rebuilding and searching a list of all rows on every call.
    if file_name in gt['train']:
        path = 'SOTAB CPA/Train/' + file_name
    else:
        path = 'SOTAB CPA/Test/' + file_name

    df = pd.read_json(path, compression='gzip', lines=True)

    def first_words(cell):
        # Clean one cell and keep only its first `max_words` words.
        return " ".join(clean_text(cell).split()[:max_words])

    # Main column: the first `main_rows` cells of column 0, truncated per cell.
    main_text = " ".join(first_words(cell) for cell in df[0].tolist()[:main_rows])
    pieces = ["Column 1: " + main_text, "\n Column 2:"]

    # Target column: collect the first `max_values` non-empty cells. The limit is
    # fixed up front (it used to be reset on every row, which made the number of
    # returned values depend on whether the last row happened to be empty), and
    # the scan stops as soon as enough values have been found.
    kept = 0
    for cell in df.iloc[:, index].tolist():
        if kept >= max_values:
            break
        cleaned = first_words(cell)
        if cleaned != "":
            pieces.append(cleaned)
            kept += 1

    return " ".join(pieces)

In [12]:
# Build one example per annotated test column:
# [table_name, column_index, serialized_text, label, class].
test_examples = []
for table, labelled_columns in gt['test'].items():
    table_class = table.split("_")[0]
    for column, label in labelled_columns.items():
        col_str = get_all_table_columns(table, column)
        test_examples.append([table, column, col_str, label, table_class])

In [11]:
col_str

"Column 1: Salva El Tigre Pimemento Jim Carrey Chapter Two: You Can't Hurry Love Web Reflection \n Column 2: Narcos: Mexico Brooklyn Nine-Nine Lights Out with David Spade Katy Keene Tosh.0"

In [13]:
test_examples

[['Book_antipodean.com_September2020_CPA.json.gz',
  4,
  "Column 1: On Coxalgia, or Hip Disease. Simonds Saws & Knives, Gold Medals and Highest Awards Everywhere. Atlanta 1895. Art Nouveau poster. North Island, New Zealand, Sheet 3, northeast corner from Tauranga to Dannevirke, folding map on linen. Trade Receipts for wine and beer from Perth, Australia merchants 'H. Sherwood & Co., Wine & Spirit Merchants' and 'D. The American Trans-Continental Route via New York Central & Hudson River R. R. and Connections, the Only 4 Track Line. \n Column 2: Collins, Livermore & Knight Co, Marcus F. Marks, Government Printer. Burlington. Visual Anthropology,",
  'publisher',
  'Book'],
 ['Book_antipodean.com_September2020_CPA.json.gz',
  3,
  "Column 1: On Coxalgia, or Hip Disease. Simonds Saws & Knives, Gold Medals and Highest Awards Everywhere. Atlanta 1895. Art Nouveau poster. North Island, New Zealand, Sheet 3, northeast corner from Tauranga to Dannevirke, folding map on linen. Trade Receipts f

In [14]:
len(test_examples)

509

In [13]:
cpa_test_gt[cpa_test_gt["table_name"] == "Book_9facts.co.uk_September2020_CPA.json.gz"]

Unnamed: 0,table_name,main_column_index,column_index,label,class


In [None]:
# Build one example per annotated training column:
# [table_name, column_index, serialized_text, label, class].
train_examples = []
for table, labelled_columns in gt['train'].items():
    table_class = table.split("_")[0]
    for column, label in labelled_columns.items():
        col_str = get_all_table_columns(table, column)
        train_examples.append([table, column, col_str, label, table_class])

In [None]:
len(train_examples)

109994

In [14]:
import pickle

In [None]:
# Currently disabled: the statements below are wrapped in a string literal, so
# train_examples is NOT pickled when this cell runs.
""" file_name='SOTAB CPA/sotabv2-cpa-train-column.pkl'
f = open(file_name,'wb')
pickle.dump(train_examples,f)
f.close() """

In [15]:
# Persist the serialized test examples. The context manager closes the file
# even if pickle.dump raises, so a failed dump does not leak the handle.
file_name = 'SOTAB CPA/sotabv2-cpa-sample-test-column.pkl'
with open(file_name, 'wb') as f:
    pickle.dump(test_examples, f)