In [2]:
import os
import re
import pickle
import numpy as np
from dotenv import dotenv_values
from langchain import PromptTemplate, LLMChain, OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage, AIMessage

In [3]:
config = dotenv_values("../.env")
os.environ['OPENAI_API_KEY'] = config["OPENAI_API_KEY"]
OPENAI_API_KEY = config["OPENAI_API_KEY"]

In [4]:
labels_to_text = {
    "Date": "date",
    "Book/name": "name of book",
    "Language": "language",
    "Person/name": "name of person",
    "BookFormatType": "book format type",
    "OfferItemCondition": "offer item condition",
    "ItemAvailability": "item availability",
    "price": "price",
    "currency": "currency",
    "Review": "review",
    "Number": "number",
    "IdentifierAT": "identifier",
    "URL": "url",
    "Place/name": "name of place",
    "Event/name": "name of event",
    "EventStatusType": "event status",
    "EventAttendanceModeEnumeration": "event attendance mode",
    "telephone": "telephone",
    "email": "email",
    "category": "category",
    "Duration": "duration",
    "streetAddress": "street address",
    "addressLocality": "locality of address",
    "LocalBusiness/name": "name of local business",
    "priceRange": "price range",
    "openingHours": "opening hours",
    "faxNumber": "fax number",
    "Country": "country",
    "postalCode": "postal code",
    "addressRegion": "region of address",
    "Photograph": "photograph",
    "Movie/name": "name of movie",
    "Rating": "rating",
    "MusicArtistAT": "music artist",
    "MusicAlbum/name": "name of music album",
    "MusicRecording/name": "name of music recording",
    "weight": "weight",
    "GenderType": "gender type",
    "Product/name": "name of product",
    "DeliveryMethod": "delivery method",
    "Organization": "organization",
    "Book/description": "description of book",
    "CreativeWork": "creative work",
    "Boolean": "boolean",
    "DateTime": "date and time",
    "CreativeWork/name": "name of creative work",
    "Event/description": "description of event",
    "PostalAddress": "postal address",
    "Time": "time",
    "Hotel/name": "name of hotel",
    "CoordinateAT": "coordinate",
    "Hotel/description": "description of hotel",
    "LocationFeatureSpecification": "location feature",
    "paymentAccepted": "payment accepted",
    "Brand": "brand",
    "MonetaryAmount": "monetary amount",
    "JobPosting/name": "name of job posting",
    "OccupationalExperienceRequirements": "occupational experience requirements",
    "EducationalOccupationalCredential": "educational occupational credential",
    "workHours": "work hours",
    "CategoryCode": "category code",
    "JobPosting/description": "description of job posting",
    "DayOfWeek": "day of week",
    "Movie/description": "description of movie",
    "Museum/name": "name of museum",
    "ItemList": "item list",
    "Distance": "distance",
    "unitCode": "unit code",
    "ProductModel": "product model",
    "unitText": "unit text",
    "QuantitativeValue": "quantitative value",
    "Product/description": "description of product",
    "Recipe/name": "name of recipe",
    "Mass": "mass",
    "Energy": "energy",
    "RestrictedDiet": "restricted diet",
    "Recipe/description": "description of recipe",
    "Restaurant/name": "name of restaurant",
    "SportsEvent/name": "name of sports event",
    "SportsTeam": "sports team",
    "TVEpisode/name": "name of TV episode",
    "CreativeWorkSeries": "creative work series"
}

In [5]:
text_to_label = {
    "date": "Date",
    "name of book": "Book/name",
    "language": "Language",
    "name of person": "Person/name",
    "book format type": "BookFormatType",
    "offer item condition": "OfferItemCondition",
    "item availability": "ItemAvailability",
    "price": "price",
    "currency": "currency",
    "review": "Review",
    "number": "Number",
    "identifier": "IdentifierAT",
    "url": "URL",
    "name of place": "Place/name",
    "name of event": "Event/name",
    "event status": "EventStatusType",
    "event attendance mode": "EventAttendanceModeEnumeration",
    "telephone": "telephone",
    "email": "email",
    "category": "category",
    "duration": "Duration",
    "street address": "streetAddress",
    "locality of address": "addressLocality",
    "name of local business": "LocalBusiness/name",
    "price range": "priceRange",
    "opening hours": "openingHours",
    "fax number": "faxNumber",
    "country": "Country",
    "postal code": "postalCode",
    "region of address": "addressRegion",
    "photograph": "Photograph",
    "name of movie": "Movie/name",
    "rating": "Rating",
    "music artist": "MusicArtistAT",
    "name of music album": "MusicAlbum/name",
    "name of music recording": "MusicRecording/name",
    "weight": "weight",
    "gender type": "GenderType",
    "name of product": "Product/name",
    "delivery method": "DeliveryMethod",
    "organization": "Organization",
    "description of book": "Book/description",
    "creative work": "CreativeWork",
    "boolean": "Boolean",
    "date and time": "DateTime",
    "name of creative work": "CreativeWork/name",
    "description of event": "Event/description",
    "postal address": "PostalAddress",
    "time": "Time",
    "name of hotel": "Hotel/name",
    "coordinate": "CoordinateAT",
    "description of hotel": "Hotel/description",
    "location feature": "LocationFeatureSpecification",
    "payment accepted": "paymentAccepted",
    "brand": "Brand",
    "monetary amount": "MonetaryAmount",
    "name of job posting": "JobPosting/name",
    "occupational experience requirements": "OccupationalExperienceRequirements",
    "educational occupational credential": "EducationalOccupationalCredential",
    "work hours": "workHours",
    "category code": "CategoryCode",
    "description of job posting": "JobPosting/description",
    "day of week": "DayOfWeek",
    "description of movie": "Movie/description",
    "name of museum": "Museum/name",
    "item list": "ItemList",
    "distance": "Distance",
    "unit code": "unitCode",
    "product model": "ProductModel",
    "unit text": "unitText",
    "quantitative value": "QuantitativeValue",
    "description of product": "Product/description",
    "name of recipe": "Recipe/name",
    "mass": "Mass",
    "energy": "Energy",
    "restricted diet": "RestrictedDiet",
    "description of recipe": "Recipe/description",
    "name of restaurant": "Restaurant/name",
    "name of sports event": "SportsEvent/name",
    "sports team": "SportsTeam",
    "name of TV episode": "TVEpisode/name",
    "creative work series": "CreativeWorkSeries"
}

## Load test (and training) set

In [7]:
with open('sotabv2-cta-train-table.pkl', "rb") as f:
    train = pickle.load(f)
with open('sotabv2-cta-test-table.pkl', "rb") as f:
    test = pickle.load(f)

examples = [example[1] for example in test ]
labels = [l for example in test for l in example[2]]

train_examples = [ example[1] for example in train ]
train_example_labels = []
for table in train:
    col_labels = """"""
    for i, l in enumerate(table[2]):
        col_labels += f"""Column {i+1}: {labels_to_text[l]}\n"""
    train_example_labels.append(col_labels.strip())

In [8]:
train_examples

[['Column 1||\n2020-07-10||\n2016-04-08||\n2013-09-13||\n2016-08-05||\n2019-05-10||\n'],
 ['Column 1 || Column 2||\nThe Sleep Revolution || en-US||\nViva, Ame, Lidere || pt-BR||\nThe Sleep Revolution: Transforming Your Life, One Night at a Time || pt-BR||\nA Arte da Autoconfiança || pt-BR||\nO Poder da Ação || pt-BR||\n'],
 ['Column 1 || Column 2 || Column 3 || Column 4 || Column 5||\nHomer William Bedell Stanford W. Stanford || Paperback || Unlimited [Historical Fiction Book] ↠ The Odyssey, Book 1-12 - by Homer William Bedell Stanford W. Stanford ↠ || English || Homer William Bedell Stanford W. Stanford||\nDave Barry Ridley Pearson || Hardcover || ↠ The Bridge to Never Land || ☆ PDF Read by ☆ Dave Barry Ridley Pearson || English || Dave Barry Ridley Pearson||\nDonita K. Paul || Paperback || à DragonFire || ✓ PDF Download by ↠ Donita K. Paul || English || Donita K. Paul||\nArthur C. Clarke Gentry Lee || None || è Rama Revealed || ✓ PDF Download by ê Arthur C. Clarke Gentry Lee || Engli

In [9]:
train_example_labels

['Column 1: date',
 'Column 1: name of book\nColumn 2: language',
 'Column 1: name of person\nColumn 2: book format type\nColumn 3: name of book\nColumn 4: language\nColumn 5: organization',
 'Column 1: name of book\nColumn 2: offer item condition\nColumn 3: item availability\nColumn 4: price',
 'Column 1: language\nColumn 2: book format type',
 'Column 1: name of book\nColumn 2: book format type',
 'Column 1: name of book\nColumn 2: price',
 'Column 1: name of book\nColumn 2: price',
 'Column 1: name of person\nColumn 2: language\nColumn 3: book format type\nColumn 4: name of book\nColumn 5: organization',
 'Column 1: name of book\nColumn 2: price',
 'Column 1: book format type\nColumn 2: name of book',
 'Column 1: book format type\nColumn 2: name of book',
 'Column 1: name of book',
 'Column 1: name of book\nColumn 2: book format type',
 'Column 1: name of book',
 'Column 1: name of book\nColumn 2: description of book',
 'Column 1: name of book',
 'Column 1: name of book\nColumn 2: b

In [22]:
len(train)

44769

In [23]:
len(test)

110

In [7]:
labels_joined = ", ".join([labels_to_text[label] for label in list(set(labels))])
labels_joined

'identifier, description of book, time, work hours, email, day of week, name of local business, street address, description of event, description of movie, coordinate, event status, item availability, description of product, name of music album, category code, language, number, distance, name of person, description of hotel, url, name of museum, item list, energy, quantitative value, telephone, description of recipe, monetary amount, boolean, mass, name of creative work, currency, name of product, educational occupational credential, occupational experience requirements, sports team, unit code, price range, category, photograph, name of hotel, restricted diet, weight, rating, name of restaurant, name of recipe, country, description of job posting, opening hours, offer item condition, fax number, name of sports event, name of movie, name of job posting, name of place, locality of address, payment accepted, postal address, name of event, region of address, creative work, book format type

In [8]:
model_name = 'gpt-3.5-turbo-1106'
chat = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0, model=model_name)

## Choose setup: zero-shot, one-shot or five-shot

CTA TABLE

ZERO-SHOT

In [43]:
#role
nr="zero"
prompt_name = "r"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great data scientist. The best at what you do and your task is to annotate a given table with only one of the following labels that are separated with comma: {labels_joined}. Answer only with labels from the provided label set!"))
  
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))

    res = chat(messages)
    preds.append(res.content)

In [198]:
#role 
nr="zero"
prompt_name = "r4"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with only one of the following labels that are separated with comma: {labels_joined}. Answer with one of the labels from the provided label set!"))
  
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))

    res = chat(messages)
    preds.append(res.content)
    

In [None]:
#role 
nr="zero"
prompt_name = "r4.1"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}. Answer with ONLY ONE label from the provided label set for each Column!"))

    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))

    print(messages)
    res = chat(messages)
    preds.append(res.content)
    

In [253]:
#role 
nr="zero"
prompt_name = "r4.2"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))

    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))

    res = chat(messages)
    preds.append(res.content)
    

In [270]:
#role 
nr="zero"
prompt_name = "r4.1.DHM"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))

    messages.append(HumanMessage(content=f"Classify these table columns: {example}. Answer with ONLY ONE label from the provided label set for each Column!"))

    res = chat(messages)
    preds.append(res.content)
    

In [None]:
#role + instructions
nr = "zero"
prompt_name = "r+i"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"Your task is to classify the columns of a given table with only one of the following classes that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1. Look at the input given to you and make a table out of it. 2. Look at the cell values in detail. 3. For each column, select a class that best represents the meaning of all cells in the column. 4. Answer with the selected class for each columns with the format Column1: class."))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    
    print("Messages:", messages)
    res = chat(messages)
    
    print("Model Response:", res.content)
    preds.append(res.content)


In [318]:
#role + instructions
nr="zero"
#prompt_name = "r+i2"
prompt_name = "r+i2.2"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas."))

    #messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    messages.append(HumanMessage(content=f"Classify these table columns: {example}. Answer with ONLY ONE label from the provided label set for each Column!"))

    res = chat(messages)
    preds.append(res.content)
    

In [307]:
#role + instructions + step by step 
nr="zero"
#prompt_name = "r+i+s_b_s"
prompt_name = "r+i+s_b_s.2"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas."))
    messages.append(SystemMessage(content="Let's think step by step."))
    
    #messages.append(HumanMessage(content=f"Classify these table columns: {example}. Answer with ONLY ONE label from the provided label set for each Column!"))
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))

    res = chat(messages)
    preds.append(res.content)
    

In [11]:
#role + step by step + instructions 
nr="zero"
prompt_name = "r+s_b_s+i"


preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Let's think step by step."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas."))
   
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))

    res = chat(messages)
    preds.append(res.content)
    

In [355]:
#role + instructions
nr="zero"
#prompt_name = "r+i3"
prompt_name = "r+i3.1"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5.Ensure that you use only one label from the provided set for each column. If multiple labels seem applicable, prioritize the one that best represents the overall content of the column.")) 
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))

    res = chat(messages)
    preds.append(res.content)
    

In [389]:
#role + instructions + step by step + motivation 
nr="zero"
#prompt_name = "r+i2+m"
#prompt_name = "r+i2+s_b_s+m"
prompt_name = "r+i2+s_b_s+m.1"


preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    #messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))

    #messages.append(SystemMessage(content="Let's think step by step."))
    #messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))

    messages.append(SystemMessage(content="Let's think step by step. Your answer is very important. Take your time and think well before answering!"))
    
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))

    res = chat(messages)
    preds.append(res.content)
    

In [889]:
#CONTEXT

nr="zero"
prompt_name = "r+c"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content."))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    
    res = chat(messages)
    preds.append(res.content)

In [922]:
#CONTEXT
nr="zero"
prompt_name = "r+c1"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content. Furthermore, classification involves assigning predefined categories or labels to data based on its features or attributes.  Your task is the same, to predict the column types of a given table with only one label per column from the provided label-set!"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    
    res = chat(messages)
    preds.append(res.content)

In [979]:
#CONTEXT

nr="zero"
prompt_name = "r+c1.1.1"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content.  Your task is the same, to predict the column types of a given table with only one label per column from the provided label-set!")) 
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    
    res = chat(messages)
    preds.append(res.content)

In [1021]:
#CONTEXT

nr="zero"
#prompt_name = "r+i2+c1+m"
prompt_name = "r+i2+c1.1.1+m"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    #messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content. Furthermore, classification involves assigning predefined categories or labels to data based on its features or attributes."))
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content.  Your task is the same, to predict the column types of a given table with only one label per column from the provided label-set!"))
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))

    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    
    res = chat(messages)
    preds.append(res.content)

In [950]:
#CONTEXT

nr="zero"
prompt_name = "r+i2+c1"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content. Furthermore, classification involves assigning predefined categories or labels to data based on its features or attributes."))

    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    
    res = chat(messages)
    preds.append(res.content)

In [1179]:
#CONTEXT (example context) 

nr="zero"
prompt_name = "r+c.example"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"CONTEXT: Here is an example of how Column Type Annotation task is solved: 'Classify these table columns: Columm1: Dog, Cat, Dog.  Column2: bread, pasta, meat, beef.'"
                                  "First we check Columm1: Dog, Cat, Dog. Analyze, and predict: pets."   
                                  "Now we check Column2: bread, pasta, meat, beef. Analyze, and predict: food"
                                  "Answer: Column 1 Name : Pets, Column 2 Name: Food"))

    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    
    res = chat(messages)
    preds.append(res.content)

In [54]:
#role + step by step
nr="zero"
prompt_name = "r+s_b_s"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Let's think step by step."))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)
    

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


In [12]:
# Generating tables to use for "Generated Knowledge Prompt"  
import openai
import random

generated_examples = []
generated_examples_labels = []

for label in labels:

    for table_number in range(1, 6):
        messages = []
        
        prompt = f"Generate 1 table with 4 random columns and 5 rows. Include one column about {label}. Please return only the values, no need to explain."

        messages.append({"role": "system", "content": prompt})
       
        response = openai.ChatCompletion.create(
            model='gpt-3.5-turbo-0301',
            messages=messages
        )
        
        generated_content = response['choices'][0]['message']['content'].strip()
        generated_examples.append(generated_content)
        generated_examples_labels.append(label)


In [13]:
""" file_name=f'GKP-Input/Generated-Tables.pkl'
f = open(file_name,'wb')
pickle.dump(generated_examples, f)
f.close() """

In [14]:
""" file_name=f'GKP-Input/Generated-Table-Labels.pkl'
f = open(file_name,'wb')
pickle.dump(generated_examples_labels, f)
f.close() """

In [None]:
with open(f'GKP-Input/Generated-Tables.pkl', "rb") as f:
    tables = pickle.load(f)
tables    

In [None]:
with open(f'GKP-Input/Generated-Table-Labels.pkl', "rb") as f:
    labels = pickle.load(f)
labels    

ONE SHOT

In [529]:
#role
import random

nr="one"
#prompt_name = "r4.1.DHM"
prompt_name = "r4.1.DHM.1"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    
    index = random.randint(0, len(train_examples)-1)
    #messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}.Answer with ONLY ONE label from the provided label set for each Column!"))
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}. Answer with ONLY ONE label from the provided label set for each Column!"))

    res = chat(messages)
    preds.append(res.content)
    

In [489]:
#role + instructions 
import random

nr="one"
prompt_name = "r+i"

preds = []
for example in examples:
    messages = []
    

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1. Look at the input given to you and make a table out of it. 2. Look at the cell values in detail. 3. For each column, select a class that best represents the meaning of all cells in the column. 4. Answer with the selected class for each columns with the format Column1: class."))
    
    
    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [502]:
#role + instructions
import random

nr="one"
prompt_name = "r+i2"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas."))
    

    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [78]:
#role + step by step
import random 

nr="one"
prompt_name = "r+s_b_s"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Let's think step by step."))
    
    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [569]:
#role + instructions + step by step
import random

nr="one"
prompt_name = "r+i+s_b_s.2"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas."))
    messages.append(SystemMessage(content="Let's think step by step."))
    
    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)
    

In [600]:
#role + instructions + motivation
import random

nr="one"
prompt_name = "r+i2+m"


preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))

    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)
    

In [861]:
#CONTEXT
import random

nr="one"
prompt_name = "r+c"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content."))
    
    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [1032]:
#CONTEXT (example context)
import random

nr="one"
prompt_name = "r+ctest"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    index = random.randint(0, len(train_examples)-1)
    messages.append(SystemMessage(content=f"CONTEXT: Classify these table columns: {train_examples[index]}"))
    
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [11]:
#CONTEXT (example context)  2.
import random

nr="one"
prompt_name = "r+ctest.2"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    index = random.randint(0, len(train_examples)-1)
    messages.append(SystemMessage(content=f"Classify these table columns: {train_examples[index]}"))
    
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [1048]:
#CONTEXT + instructions + motivation 
import random

nr="one"
prompt_name = "r+i2+c1+m"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}.")) 
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content. Furthermore, classification involves assigning predefined categories or labels to data based on its features or attributes.")) 
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))
    
    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [31]:
#CONTEXT + instructions + motivation    .2
import random

nr="one"
prompt_name = "r+i2+c1+m..2"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}.")) 
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content=f"Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content. Furthermore, classification involves assigning predefined categories or labels to data based on its features or attributes.")) 
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))
    
    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


In [None]:
#role + step by step test
import random

nr="one"
prompt_name = "r+s_b_s.test"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Let's think step by step."))

    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    middle_res = chat(messages)
    messages.append(middle_res.content)
    print(messages)
    

In [None]:
#role + step by step test
import random

nr="one"
prompt_name = "r+s_b_s.test"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Let's think step by step."))

    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    middle_res = chat(messages)
    messages.append(middle_res.content)
   
    messages.append(HumanMessage(content=f"The actual labels were: {train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}."))
    
    print(messages)
    res = chat(messages)
    print(res)
    preds.append(res.content)
    print(preds)

In [9]:
with open('GKP-Input/Generated-Tables.pkl', "rb") as f:
    train_tables = pickle.load(f)
with open('GKP-Input/Generated-Table-Labels.pkl', "rb") as f:
    train_labels = pickle.load(f)

In [17]:
#Role + instructions + context + motivation + Generated Knowledge (GKP)
import random

nr="one"
prompt_name = "r+i2+c1+m(GKP)"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}.")) 
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content. Furthermore, classification involves assigning predefined categories or labels to data based on its features or attributes.")) 
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))
    
    index = random.randint(0, len(train_tables)-1)
    messages.append(HumanMessage(content=f"This is an example of a table that includes a column with the label '{train_labels[index]}':\n{train_tables[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [18]:
messages

[SystemMessage(content='You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: name of recipe, coordinate, name of place, opening hours, time, description of book, postal address, description of product, name of local business, gender type, name of hotel, product model, name of TV episode, date, boolean, item list, rating, name of event, organization, brand, description of movie, creative work series, review, energy, occupational experience requirements, educational occupational credential, unit text, locality of address, location feature, language, fax number, email, delivery method, work hours, quantitative value, weight, street address, name of movie, photograph, book format type, offer item condition, name of book, name of music album, description of event, distance, name of sports event, name of product, date and time, name of museum, description of recipe, restricte

In [19]:
#Role + instructions + context + motivation + Generated Knowledge (GKP)
import random

nr="one"
prompt_name = "r+i2+c1+m+GKP"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}.")) 
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content. Furthermore, classification involves assigning predefined categories or labels to data based on its features or attributes.")) 
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))
    
    index_gen = random.randint(0, len(train_tables)-1)
    messages.append(HumanMessage(content=f"This is an example of a table that includes a column with the label '{train_labels[index_gen]}':\n{train_tables[index_gen]}"))
   
    index = random.randint(0, len(train_examples)-1)
    messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
    messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    #res = chat(messages)
    #preds.append(res.content)

In [20]:
messages

[SystemMessage(content='You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: identifier, description of book, time, work hours, email, day of week, name of local business, street address, description of event, description of movie, coordinate, event status, item availability, description of product, name of music album, category code, language, number, distance, name of person, description of hotel, url, name of museum, item list, energy, quantitative value, telephone, description of recipe, monetary amount, boolean, mass, name of creative work, currency, name of product, educational occupational credential, occupational experience requirements, sports team, unit code, price range, category, photograph, name of hotel, restricted diet, weight, rating, name of restaurant, name of recipe, country, description of job posting, opening hours, offer item condition, fax number,

FIVE SHOT

In [644]:
#role
import random

nr="five"
#prompt_name = "r4.1.DHM"
prompt_name = "r4.1.DHM.1"

preds = []
for example in examples:
    messages = []
    
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    
    
    for i in range(0,5):
        index = random.randint(0, len(train_examples)-1)
        messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    #messages.append(HumanMessage(content=f"Classify these table columns: {example}. Answer with ONLY ONE label from the provided label set for each Column!"))
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [792]:
import random

nr="five"
#prompt_name = "r5"
prompt_name = "r6"

preds = []
for example in examples:
    messages = []
    
    #messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"You are THE BEST Data Scientist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
   
    for i in range(0,5):
        index = random.randint(0, len(train_examples)-1)
        messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [None]:
import random

nr="five"
#prompt_name = "r+i2"
#prompt_name = "r5+i2"
prompt_name = "r6+i2"

preds = []
for example in examples:
    messages = []
    
    #messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    #messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"You are THE BEST Data Scientist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas."))
    

    for i in range(0,5):
        index = random.randint(0, len(train_examples)-1)
        messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [None]:
#role + instructions + step by step 
import random

nr="five"
#prompt_name = "r+i+s_b_s.2"
prompt_name = "r5+i+s_b_s.2"

preds = []
for example in examples:
    messages = []
    
    #messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas."))
    messages.append(SystemMessage(content="Let's think step by step."))
    
    for i in range(0,5):
        index = random.randint(0, len(train_examples)-1)
        messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)
    

In [None]:
#role + instructions + motivation
import random

nr="five"
#prompt_name = "r+i2+m"
prompt_name = "r5+i2+m"
#prompt_name = "r6+i2+m"

preds = []
for example in examples:
    messages = []
    
    #messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to annotate a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    #messages.append(SystemMessage(content=f"You are THE BEST Data Scientist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))

    for i in range(0,5):
        index = random.randint(0, len(train_examples)-1)
        messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)
    

In [None]:
#CONTEXT
import random

nr="five"
prompt_name = "r+c"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content."))
    
    for i in range(0,5):
        index = random.randint(0, len(train_examples)-1)
        messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [None]:
#CONTEXT (example context)
import random

nr="five"
prompt_name = "r+ctest"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}."))
   
    for i in range(0,5):
        index = random.randint(0, len(train_examples)-1)
        messages.append(SystemMessage(content=f"CONTEXT: Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [None]:
#CONTEXT + instructions + motivation 
import random

nr="five"
prompt_name = "r+i2+c1+m"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}.")) 
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content. Furthermore, classification involves assigning predefined categories or labels to data based on its features or attributes.")) 
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))
    
    for i in range(0,5):
        index = random.randint(0, len(train_examples)-1)
        messages.append(HumanMessage(content=f"Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [None]:
#CONTEXT + instructions + motivation 
import random

nr="five"
prompt_name = "r+i2+ctest+m"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}.")) 
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))
    
    for i in range(0,5):
        index = random.randint(0, len(train_examples)-1)
        messages.append(SystemMessage(content=f"CONTEXT: Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [None]:
#Role + instructions + step by step + Generated Knowledge (GKP)
import random

nr="five"
prompt_name = "r5+i+s_b_s.2+GKP"

preds = []
for example in examples:
    messages = []
    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}.")) 
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas."))
    messages.append(SystemMessage(content="Let's think step by step."))

    for i in range(0,5):
        index_gen = random.randint(0, len(train_tables)-1)
        messages.append(HumanMessage(content=f"This is an example of a table that includes a column with the label '{train_labels[index_gen]}':\n{train_tables[index_gen]}"))
        index = random.randint(0, len(train_examples)-1)
        messages.append(SystemMessage(content=f"Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

In [26]:
#Role + instructions + context + motivation + Generated Knowledge (GKP)
import random

nr="five"
prompt_name = "r+i2+c1+m+GKP"

preds = []
for example in examples:
    messages = []

    messages.append(SystemMessage(content=f"You are a great Table Annotation Specialist and your task is to classify the columns of a given table with ONLY ONE of the following labels that are separated with comma: {labels_joined}.")) 
    messages.append(SystemMessage(content="Your instructions are: 1.Review the provided input and organize it into a table format. 2.Carefully examine the values within each cell of the table. 3.For every column in the table, choose a single label that best captures the overall meaning of the cells in that column. 4.Respond with your selected label for each column, following the format: 'Column1: SelectedLabel, Column2: SelectedLabel, ...'. 5. Ensure that you use only one label from the provided set, and separate your responses with commas.")) 
    messages.append(SystemMessage(content=f"CONTEXT: Column Type Annotation is a sub-task of Table Annotation and involves categorizing each column in a table based on its content. Furthermore, classification involves assigning predefined categories or labels to data based on its features or attributes.")) 
    messages.append(SystemMessage(content="Your answer is very important. Take your time and think well before answering!"))

    for i in range(0,5):
        index_gen = random.randint(0, len(train_tables)-1)
        messages.append(HumanMessage(content=f"This is an example of a table that includes a column with the label '{train_labels[index_gen]}':\n{train_tables[index_gen]}"))
        index = random.randint(0, len(train_examples)-1)
        messages.append(SystemMessage(content=f"Classify these table columns: {train_examples[index]}"))
        messages.append(AIMessage(content=f"{train_example_labels[index]}"))
    
    messages.append(HumanMessage(content=f"Classify these table columns: {example}"))
    res = chat(messages)
    preds.append(res.content)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-3.5-turbo-1106 in organization org-JnRe4IF9kM1kFkJzhKlurHyV on tokens per min (TPM): Limit 80000, Used 78321, Requested 3284. Please try again in 1.203s. Visit https://platform.openai.com/account/rate-limits to learn more..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-3.5-turbo-1106 in organization org-JnRe4IF9kM1kFkJzhKlurHyV on tokens per min (TPM): Limit 80000, Used 78919, Requested 1653. Please try again in 429ms. Visit https://platform.openai.com/account/rate-limits to learn more..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for gpt-3.5-turbo-1106 in organization 

In [15]:
#zero-shot
file_name=f'Predictions/{model_name}/chat-table-{prompt_name}-{nr}-shot.pkl'
f = open(file_name,'wb')
pickle.dump(preds,f)
f.close()

In [27]:
file_name=f'Predictions/{model_name}/{nr}-shot/chat-table-{prompt_name}-{nr}-shot.pkl'
f = open(file_name,'wb')
pickle.dump(preds,f)
f.close()

In [28]:
preds

['Column 1: name of restaurant\nColumn 2: telephone',
 'Column 1: opening hours',
 'Column 1: country\nColumn 2: name of company',
 'Column 1: description of movie\nColumn 2: duration\nColumn 3: rating\nColumn 4: name of movie',
 "I'm sorry, but I cannot provide a classification for the given input as it does not appear to be in a tabular format. If you have a table that needs classification, please provide it in a tabular format and I'd be happy to assist with the classification.",
 'Column 1: name of creative work',
 'Column 1: quantitative value\nColumn 2: name of product\nColumn 3: offer item condition',
 'Column 1: name of place',
 'Column 1: description of event\nColumn 2: name of person\nColumn 3: description of place',
 'Column 1: name of place\nColumn 2: postal address\nColumn 3: name of place\nColumn 4: description of hotel',
 'Column 1: name of recipe\nColumn 2: quantitative value\nColumn 3: quantitative value\nColumn 4: quantitative value\nColumn 5: quantitative value\nColu

## Evaluation

In [31]:
predictions = []
i=0
for j, table_preds in enumerate(preds):
   
    table_number = len(test[j][2])
    
    if "Class:" in table_preds:
        table_preds = table_preds.split("Class:")[1]
      

    if ":" in table_preds or "-" in table_preds:
        if ":" in table_preds:
            separator = ":"
            start = 1
            end = table_number+1
        else:
            separator = "-"  
            start = 1
            end = table_number+1
    else:
        separator = ","
        start = 0
        end = table_number
        
    col_preds = table_preds.split(separator)[start:end]
    
    for pred in col_preds:
        i+=1
        
        if "\n" in pred:
            pred = pred.split('\n')[0].strip()
        if "," in pred:
            pred = pred.split(",")[0].strip()
        if '(' in pred:
            pred = pred.split("(")[0].strip()
        if '.' in pred:
            pred = pred.split(".")[0].strip()
        pred = pred.strip().lower()
        
        if pred in text_to_label:
            predictions.append(text_to_label[pred])
        else:
            print(f"For test example {i} out of label space prediction: {pred}")
            predictions.append('-')
        
    if len(col_preds) < table_number:
        for m in range(0, table_number-len(col_preds)):
            predictions.append('-')
            i+=1

For test example 5 out of label space prediction: name of company
For test example 10 out of label space prediction: i'm sorry
For test example 11 out of label space prediction: but i cannot provide a classification for the given input as it does not appear to be in a tabular format
For test example 19 out of label space prediction: description of place
For test example 109 out of label space prediction: i'm sorry
For test example 115 out of label space prediction: i'm sorry
For test example 175 out of label space prediction: name of tv episode
For test example 176 out of label space prediction: name of tv series
For test example 177 out of label space prediction: name of author
For test example 282 out of label space prediction: name of creative work series
For test example 293 out of label space prediction: name of service
For test example 299 out of label space prediction: name of music artist


In [32]:
predictions[:15]

['Restaurant/name',
 'telephone',
 'openingHours',
 'Country',
 '-',
 'Movie/description',
 'Duration',
 'Rating',
 'Movie/name',
 '-',
 '-',
 'CreativeWork/name',
 'QuantitativeValue',
 'Product/name',
 'OfferItemCondition']

### Calculate Precision, Recall, Macro-F1 and Micro-F1

In [35]:
def calculate_f1_scores(y_tests, y_preds, num_classes, types):

    y_tests = [types.index(y) for y in y_tests]
    y_preds = [types.index(y) for y in y_preds]
    
    cm = np.zeros(shape=(num_classes,num_classes))
    
    for i in range(len(y_tests)):
        cm[y_preds[i]][y_tests[i]] += 1
        
    report = {}
    
    for j in range(len(cm[0])):
        report[j] = {}
        report[j]['FN'] = 0
        report[j]['FP'] = 0
        report[j]['TP'] = cm[j][j]

        for i in range(len(cm)):
            if i != j:
                report[j]['FN'] += cm[i][j]
        for k in range(len(cm[0])):
            if k != j:
                report[j]['FP'] += cm[j][k]

        precision = report[j]['TP'] / (report[j]['TP'] + report[j]['FP'])
        recall = report[j]['TP'] / (report[j]['TP'] + report[j]['FN'])
        f1 = 2*precision*recall / (precision + recall)
        
        if np.isnan(f1):
            f1 = 0
        if np.isnan(precision):
            f1 = 0
        if np.isnan(recall):
            f1 = 0

        report[j]['p'] =  precision
        report[j]['r'] =  recall
        report[j]['f1'] = f1
    
    all_fn = 0
    all_tp = 0
    all_fp = 0

    for r in report:
        if r != num_classes-1:
            all_fn += report[r]['FN']
            all_tp += report[r]['TP']
            all_fp += report[r]['FP']
        
    class_f1s = [ report[class_]['f1'] for class_ in report]
    class_p = [ 0 if np.isnan(report[class_]['p']) else report[class_]['p'] for class_ in report]
    class_r = [ 0 if np.isnan(report[class_]['r']) else report[class_]['r'] for class_ in report]
    macro_f1 = sum(class_f1s[:-1]) / (num_classes-1)
    
    p =  sum(class_p[:-1]) / (num_classes-1)
    r =  sum(class_r[:-1]) / (num_classes-1)
    micro_f1 = all_tp / ( all_tp + (1/2 * (all_fp + all_fn) )) 
    
    per_class_eval = {}
    for index, t in enumerate(types[:-1]):
        per_class_eval[t] = {"Precision":class_p[index], "Recall": class_r[index], "F1": class_f1s[index]}
    
    evaluation = {
        "Micro-F1": micro_f1,
        "Macro-F1": macro_f1,
        "Precision": p,
        "Recall": r
    }
    
    return [ evaluation, per_class_eval]


In [36]:
types = list(set(labels))
types = types + ["-"] if "-" in predictions else types
evaluation, per_class_eval = calculate_f1_scores(labels, predictions, len(types), types)

  precision = report[j]['TP'] / (report[j]['TP'] + report[j]['FP'])
  f1 = 2*precision*recall / (precision + recall)
  recall = report[j]['TP'] / (report[j]['TP'] + report[j]['FN'])


In [38]:
evaluation

{'Micro-F1': 0.5783521809369951,
 'Macro-F1': 0.5418086280806206,
 'Precision': 0.6053826130366565,
 'Recall': 0.5642566782810684}

In [39]:
per_class_eval

{'PostalAddress': {'Precision': 0.6666666666666666,
  'Recall': 0.5,
  'F1': 0.5714285714285715},
 'LocationFeatureSpecification': {'Precision': 0, 'Recall': 0.0, 'F1': 0},
 'Product/description': {'Precision': 0.2,
  'Recall': 0.5,
  'F1': 0.28571428571428575},
 'URL': {'Precision': 0.3333333333333333,
  'Recall': 0.75,
  'F1': 0.46153846153846156},
 'addressRegion': {'Precision': 1.0, 'Recall': 0.6666666666666666, 'F1': 0.8},
 'streetAddress': {'Precision': 0.5, 'Recall': 0.5, 'F1': 0.5},
 'OccupationalExperienceRequirements': {'Precision': 1.0,
  'Recall': 0.6666666666666666,
  'F1': 0.8},
 'Mass': {'Precision': 0.0, 'Recall': 0.0, 'F1': 0},
 'ItemList': {'Precision': 0, 'Recall': 0.0, 'F1': 0},
 'Date': {'Precision': 0.6666666666666666,
  'Recall': 0.8571428571428571,
  'F1': 0.75},
 'Brand': {'Precision': 0.0, 'Recall': 0.0, 'F1': 0},
 'Country': {'Precision': 0.8571428571428571,
  'Recall': 1.0,
  'F1': 0.923076923076923},
 'Place/name': {'Precision': 0.21428571428571427,
  'Reca

## Error Analysis

In [43]:
errors = 0
for i in range(len(predictions)):
    if predictions[i] != labels[i]:
        errors += 1
        print(f"Predicted as {predictions[i]} when it was {labels[i]}")
errors

Predicted as Restaurant/name when it was Place/name
Predicted as - when it was LocalBusiness/name
Predicted as Movie/description when it was Rating
Predicted as Rating when it was Number
Predicted as - when it was Person/name
Predicted as - when it was Person/name
Predicted as CreativeWork/name when it was MusicArtistAT
Predicted as QuantitativeValue when it was weight
Predicted as Product/name when it was category
Predicted as Place/name when it was CreativeWork
Predicted as Event/description when it was CreativeWork/name
Predicted as - when it was CreativeWork
Predicted as Place/name when it was priceRange
Predicted as Place/name when it was Hotel/name
Predicted as Recipe/name when it was RestrictedDiet
Predicted as QuantitativeValue when it was Mass
Predicted as QuantitativeValue when it was Mass
Predicted as QuantitativeValue when it was Mass
Predicted as QuantitativeValue when it was Mass
Predicted as QuantitativeValue when it was Mass
Predicted as QuantitativeValue when it was Ma

139

### Re-load previous preds files

In [None]:
with open(f'Predictions/{model_name}/chat-table-{prompt_name}-{nr}-shot.pkl', "rb") as f:
    preds = pickle.load(f)

In [112]:
with open(f'Predictions/{model_name}/{nr}-shot/chat-table-{prompt_name}-{nr}-shot.pkl', "rb") as f:
    preds = pickle.load(f)

In [None]:
preds