In [1]:
import pandas as pd
import re
import multiprocessing

In [2]:
# Ground-truth annotations for the CTA training set. The preview below shows
# one row per annotated column: table_name, column_index, label.
cta_train_gt = pd.read_csv('SOTAB CTA/sotab_v2_cta_training_set.csv')
# Ground-truth annotations from the sample test file (same columns as above).
cta_test_gt = pd.read_csv('SOTAB CTA/sample_test.csv')

In [3]:
cta_train_gt

Unnamed: 0,table_name,column_index,label
0,Book_11x17.pt_September2020_CTA.json.gz,3,Date
1,Book_12min.com_September2020_CTA.json.gz,0,Book/name
2,Book_12min.com_September2020_CTA.json.gz,2,Language
3,Book_1carpetcleaning.co.uk_September2020_CTA.j...,7,Person/name
4,Book_1carpetcleaning.co.uk_September2020_CTA.j...,2,BookFormatType
...,...,...,...
116882,TVEpisode_zazangels.com_September2020_CTA.json.gz,3,CreativeWorkSeries
116883,TVEpisode_zebrahead.org_September2020_CTA.json.gz,0,TVEpisode/name
116884,TVEpisode_zebrahead.org_September2020_CTA.json.gz,3,CreativeWorkSeries
116885,TVEpisode_zoids-col.net_September2020_CTA.json.gz,1,TVEpisode/name


In [6]:
cta_test_gt

Unnamed: 0,table_name,column_index,label
0,Event_blugrottonj.com_September2020_CTA.json.gz,4,Place/name
1,Event_blugrottonj.com_September2020_CTA.json.gz,5,telephone
2,LocalBusiness_basellive.ch_September2020_CTA.j...,2,openingHours
3,LocalBusiness_worldcement.com_September2020_CT...,5,Country
4,Movie_321movies.org_September2020_CTA.json.gz,10,Rating
...,...,...,...
313,Product_babyartikelcheck.de_September2020_CTA....,0,Product/name
314,Product_divas-club.de_September2020_CTA.json.gz,0,Product/name
315,Product_divas-club.de_September2020_CTA.json.gz,1,Product/description
316,Person_crsdenver.com_September2020_CTA.json.gz,3,faxNumber


In [8]:
# The table class is the part of the file name before the first underscore,
# e.g. "Event_blugrottonj.com_..." -> "Event".
cta_test_gt["class"] = cta_test_gt["table_name"].map(
    lambda table_name: table_name.split("_", 1)[0]
)

In [9]:
cta_test_gt

Unnamed: 0,table_name,column_index,label,class
0,Event_blugrottonj.com_September2020_CTA.json.gz,4,Place/name,Event
1,Event_blugrottonj.com_September2020_CTA.json.gz,5,telephone,Event
2,LocalBusiness_basellive.ch_September2020_CTA.j...,2,openingHours,LocalBusiness
3,LocalBusiness_worldcement.com_September2020_CT...,5,Country,LocalBusiness
4,Movie_321movies.org_September2020_CTA.json.gz,10,Rating,Movie
...,...,...,...,...
313,Product_babyartikelcheck.de_September2020_CTA....,0,Product/name,Product
314,Product_divas-club.de_September2020_CTA.json.gz,0,Product/name,Product
315,Product_divas-club.de_September2020_CTA.json.gz,1,Product/description,Product
316,Person_crsdenver.com_September2020_CTA.json.gz,3,faxNumber,Person


In [4]:
def _index_labels(annotations):
    """Group ground-truth rows as ``{table_name: {column_index: label}}``.

    The three columns are iterated directly instead of going through
    ``DataFrame.iterrows``, which constructs a Series object for every row.
    If a (table_name, column_index) pair occurs more than once, the label of
    the last occurrence is kept.

    Parameters
    ----------
    annotations : pandas.DataFrame
        Frame with ``table_name``, ``column_index`` and ``label`` columns.

    Returns
    -------
    dict
        Nested mapping of table name -> column index -> label, with tables in
        order of first appearance.
    """
    by_table = {}
    for table_name, column_index, label in zip(
        annotations["table_name"],
        annotations["column_index"],
        annotations["label"],
    ):
        by_table.setdefault(table_name, {})[column_index] = label
    return by_table


# Ground truth per split: gt[split][table_name][column_index] -> label
gt = {
    'train': _index_labels(cta_train_gt),
    'test': _index_labels(cta_test_gt),
}

In [5]:
#Simple Preprocessing

def clean_text(text):
    """Flatten a cell value into a single space-normalised string.

    * ``dict``: every value is cleaned recursively and the results are joined
      with single spaces (keys are ignored).
    * ``list``: every item is cleaned recursively and joined the same way.
    * Values that ``pd.isnull`` reports as missing yield ``''``.
    * Anything else is converted with ``str()``.

    Runs of space characters are collapsed to one space and the result is
    stripped. Only the space character is collapsed by the regex; tabs and
    newlines inside the text are left as they are.
    """
    if isinstance(text, dict):
        text = ' '.join(clean_text(value) for value in text.values())
    elif isinstance(text, list):
        text = ' '.join(clean_text(item) for item in text)

    if pd.isnull(text):
        return ''

    # Remove excess whitespace
    return re.sub(' +', ' ', str(text)).strip()

In [None]:
# Prepare format of input datasets for LM models: table_id, [labels], data, label_ids

#ORIGINAL
""" 
def get_all_table_columns(file_name, index):
    
    #By column
    if file_name in cpa_train_gt["table_name"].tolist():
        path = 'SOTAB CPA/Train/'+file_name
    else:
        path = 'SOTAB CPA/Test/'+file_name
    
    df = pd.read_json(path, compression='gzip', lines=True)
        
    cleaned_rows = []
    
    #Main column

    cleaned_main = " ".join([" ".join(clean_text(row).split()[:20]) for row in df[0].tolist()[:5]]) #select 20 words
    
    cleaned_rows.append(cleaned_main)
    
    for row in df.iloc[:, index].tolist():
        cleaned = " ".join(clean_text(row).split()[:20]) #select 20 words
        if cleaned != "":
            cleaned_rows.append(cleaned)
    
    return " ".join(cleaned_rows[:5]) #select rows """

In [6]:
# Prepare format of input datasets for LM models: table_id, [labels], data, label_ids


#COLUMN TYPE MODIFIED
def get_all_table_columns(file_name, index, max_words=20, max_rows=5):
    """Serialise one column of a SOTAB CTA table into a single string.

    The table is read from ``'SOTAB CTA/Train/'`` when ``file_name`` appears in
    the ``table_name`` column of the global ``cta_train_gt`` frame, otherwise
    from ``'SOTAB CTA/Test/'``. Each cell of the selected column is passed
    through ``clean_text`` and truncated to ``max_words`` whitespace-separated
    words; cells that end up empty are skipped. The first ``max_rows``
    non-empty cells are joined with single spaces.

    Parameters
    ----------
    file_name : str
        Name of the gzip-compressed JSON-lines table file.
    index : int
        Positional index of the column to serialise (used with ``iloc``).
    max_words : int, optional
        Maximum number of words kept per cell. Defaults to 20, the value that
        was previously hard-coded.
    max_rows : int, optional
        Maximum number of non-empty cells kept (expected to be non-negative).
        Defaults to 5, the value that was previously hard-coded.

    Returns
    -------
    str
        The serialised column, or ``''`` if no cell has any content.
    """
    # NOTE(review): this rebuilds a Python list of every training table name
    # on each call; precomputing a set once outside the function would make
    # the check O(1) when this is called in a loop over all columns.
    if file_name in cta_train_gt["table_name"].tolist():
        path = 'SOTAB CTA/Train/' + file_name
    else:
        path = 'SOTAB CTA/Test/' + file_name

    df = pd.read_json(path, compression='gzip', lines=True)

    cleaned_rows = []
    for cell in df.iloc[:, index].tolist():
        # Stop once enough rows are collected; the remaining cells would be
        # discarded anyway, so there is no need to clean them.
        if len(cleaned_rows) >= max_rows:
            break

        cleaned = " ".join(clean_text(cell).split()[:max_words])
        if cleaned != "":
            cleaned_rows.append(cleaned)

    return " ".join(cleaned_rows)

In [19]:
# One record per annotated test column:
# [table name, column index, serialised column text, label, table class]
test_examples = []
for table, columns in gt['test'].items():
    table_class = table.split("_")[0]
    for column, label in columns.items():
        col_str = get_all_table_columns(table, column)
        test_examples.append([table, column, col_str, label, table_class])

In [20]:
test_examples

[['Event_blugrottonj.com_September2020_CTA.json.gz',
  4,
  'Blu Grotto Blu Grotto',
  'Place/name',
  'Event'],
 ['Event_blugrottonj.com_September2020_CTA.json.gz',
  5,
  '(732) 571-7900 (732) 571-7900',
  'telephone',
  'Event'],
 ['LocalBusiness_basellive.ch_September2020_CTA.json.gz',
  2,
  'Th 12:00-18:30, Fr 12:00-18:30, Sa 11:00-17:00 Mo 09:00-19:00, Tu 09:00-19:00, We 09:00-19:00, Th 09:00-19:00, Fr 09:00-19:00, Sa 09:00-18:00 We 14:00-19:00, Th 14:00-19:00, Fr 14:00-19:00, Sa 11:00-16:00 Mo 11:00-23:00, Tu 11:00-23:00, We 11:00-23:00, Th 11:00-23:00, Fr 11:00-00:00, Sa 09:00-00:00, Su 09:00-23:00 Mo 09:00-19:00, Tu 09:00-19:00, We 09:00-19:00, Th 09:00-20:00, Fr 09:00-20:00, Sa 09:00-18:00',
  'openingHours',
  'LocalBusiness'],
 ['LocalBusiness_worldcement.com_September2020_CTA.json.gz',
  5,
  'United Kingdom United Kingdom United Kingdom Germany United Kingdom',
  'Country',
  'LocalBusiness'],
 ['LocalBusiness_worldcement.com_September2020_CTA.json.gz',
  1,
  'Independe

In [1]:
# Look up the ground-truth rows of a single table by name. A raw string keeps
# the literal backslash without relying on an invalid escape sequence.
# NOTE(review): the table_name values displayed earlier in this notebook carry
# no "Test" prefix, so this filter probably matches nothing -- confirm the
# intended table name.
cta_test_gt[cta_test_gt["table_name"] == r"Test\Book_1jour-1jeu.com_September2020_CTA.json.gz"]

NameError: name 'cta_test_gt' is not defined

In [7]:
# One record per annotated training column:
# [table name, column index, serialised column text, label, table class]
train_examples = []
for table, columns in gt['train'].items():
    table_class = table.split("_")[0]
    for column, label in columns.items():
        col_str = get_all_table_columns(table, column)
        train_examples.append([table, column, col_str, label, table_class])

In [None]:
len(train_examples)

116887

In [None]:
train_examples

In [16]:
import pickle

In [18]:
""" file_name='SOTAB CTA/sotabv2-cta-train-column.pkl'
f = open(file_name,'wb')
pickle.dump(train_examples,f)
f.close() """

In [17]:
# Persist test_examples to disk. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
file_name='SOTAB CTA/sotabv2-cta-sample-test-column.pkl'
with open(file_name, 'wb') as f:
    pickle.dump(test_examples, f)