In [1]:
from datasets import load_dataset
import time
import pandas as pd
import re
import os
import json
import requests
import torch
import numpy as np

In [2]:
# Load every available split of the WikiTableQuestions dataset.
dataset = load_dataset(
    "Stanford/wikitablequestions",
    trust_remote_code=True,
)

# Show the splits, their feature names and their row counts.
print(dataset)

Using the latest cached version of the module from C:\Users\ripx0\.cache\huggingface\modules\datasets_modules\datasets\Stanford--wikitablequestions\4845d83f334ed5e4a8e0420731e64d57f696feb62e7d3a47b84037864fb8317c (last modified on Sun Nov  3 18:33:20 2024) since it couldn't be found locally at Stanford/wikitablequestions, or remotely on the Hugging Face Hub.


DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answers', 'table'],
        num_rows: 11321
    })
    test: Dataset({
        features: ['id', 'question', 'answers', 'table'],
        num_rows: 4344
    })
    validation: Dataset({
        features: ['id', 'question', 'answers', 'table'],
        num_rows: 2831
    })
})


In [3]:
class WikiTQ:
    """A single WikiTableQuestions example: table, question and answers.

    Args:
        dataset: Mapping from split name to an indexable collection of
            examples. Each example is read through the keys
            ``entry['table']['header']``, ``entry['table']['rows']``,
            ``entry['question']`` and ``entry['answers']``.
        index: Position of the example inside the selected split.
        split: Name of the split to read from. Defaults to ``'train'``,
            the split that was previously hard-coded, so existing
            two-argument calls behave exactly as before.

    Raises:
        KeyError / IndexError: Propagated from ``dataset[split][index]``
            when the split or the index does not exist.
    """

    def __init__(self, dataset, index, split='train'):
        """Store the dataset, index and split, then extract the example."""
        self.dataset = dataset
        self.index = index
        self.split = split
        self._extract_data()  # populate table / question / answers

    def _extract_data(self):
        """Read the selected example and cache its table, question and answers."""
        entry = self.dataset[self.split][self.index]

        # Table: header list + list of rows -> DataFrame.
        self.table_header = entry['table']['header']
        self.table_rows = entry['table']['rows']
        self.df = pd.DataFrame(self.table_rows, columns=self.table_header)

        # Remaining fields of the example.
        self.question = entry['question']
        self.answers = entry['answers']

    # Getter methods
    def get_table(self):
        """Return the table as a DataFrame."""
        return self.df

    def get_question(self):
        """Return the question."""
        return self.question

    def get_answers(self):
        """Return the answers exactly as stored in the example."""
        return self.answers

    def display_all(self):
        """Print the table, the question and the answers."""
        print("Table DataFrame:")
        print(self.df)
        print("\nQuestion:", self.question)
        print("\nAnswers:", self.answers)

In [11]:
import openai
import pandas as pd
from io import StringIO


# Set OpenAI client.
# The key is read from the OPENAI_API_KEY environment variable so that no
# credential is ever stored in the notebook source or its saved outputs.
client = openai.Client(api_key=os.environ.get("OPENAI_API_KEY"))
# Chat model used for every restructuring request.
model = "gpt-4o-mini-2024-07-18"
# Text file that receives the original and restructured tables.
output_file = 'table_comparison.txt'


In [None]:

# Dataset positions of the tables to restructure (1 through 10).
index_list = list(range(1, 11))

# The file is opened as UTF-8 explicitly: the tables and the model output
# contain non-ASCII characters, and the platform default encoding (e.g. a
# Windows code page) can fail to encode them when writing.
with open(output_file, 'w', encoding='utf-8') as f:
    for index in index_list:
        wiki_tq = WikiTQ(dataset, index)

        # Pull the pieces of this example out of the wrapper object.
        # `question` and `answer` are not used further in this cell.
        df = wiki_tq.get_table()
        question = wiki_tq.get_question()
        answer = wiki_tq.get_answers()

        # Plain-text rendering of the table, without the DataFrame index.
        table_string = df.to_string(index=False)

        prompt = f"""
        I have a table extracted from a dataset. The column headers are misaligned and do not clearly match the values in the rows. Here is an example of the table:\n\n
        Original Table:\n
        col: Rank | Cyclist | Team | Time | UCI ProTour; Points\n
        row1: 1 | Alejandro Valverde (ESP) | Caisse d’Epargne | 5h 29’ 10" | 40\n
        row 2 : 2 | Alexandr Kolobnev (RUS) | Team CSC Saxo Bank | s.t. | 30\n
        row 3 : 3 | Davide Rebellin (ITA) | Gerolsteiner | s.t. | 25\n\n
        Desired Restructured Table:\n
        Rank | Cyclist (Country) | Team | Time | UCI ProTour; Points\n
        row1: 1 | Alejandro Valverde (ESP) | Caisse d’Epargne | 5h 29’ 10" | 40\n
        row 2 : 2 | Alexandr Kolobnev (RUS) | Team CSC Saxo Bank | s.t. | 30\n
        row 3 : 3 | Davide Rebellin (ITA) | Gerolsteiner | s.t. | 25\n\n
        Your task is to analyze the table, identify misaligned headers, and adjust them to match the structure of the data. Maintain the integrity of the original values and ensure that the output table is clean and readable.\n
        Now, please restructure the following table:\n\n

        {table_string}\n\n
        Output the corrected table only, without additional explanations.
        """

        # Call the OpenAI model
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful data analyst."},
                {"role": "user", "content": prompt}
            ]
        )

        # Extract the reply text. The content field can be None; fall back to
        # an empty string so the file write below cannot fail with TypeError.
        structured_table = response.choices[0].message.content or ""

        try:
            # Convert the structured table to DataFrame
            modified_df = pd.read_csv(StringIO(structured_table), sep="|")
            modified_df.columns = [col.strip() for col in modified_df.columns]
        except Exception as e:
            # Best effort: report the failure and carry on with the next table.
            print(f"Failed to parse the restructured table for index {index}. Error:", e)
            modified_df = pd.DataFrame()  # Placeholder if parsing fails

        # Write the original and restructured tables to the text file
        f.write(f"\n--- Table {index} ---\n")
        f.write("Original Table:\n")
        f.write(table_string)
        f.write("\n\nRestructured Table:\n")
        f.write(structured_table)
        f.write("\n\n")
    