# In [None]:
import pandas as pd
from anthropic import Anthropic
import openai
import os

# Initialize the API clients (the key strings below are placeholders to fill in)
anthropic = Anthropic(api_key='api key 입력')
openai.api_key = 'api key 입력'

# Initialize the global results DataFrame; one column per field of the final table
columns = [
    'section',
    'chemical composition',
    'source',
    'exp. calc',
    'temperature',
    'temp_unit',
    'conductivity',
    'unit',
    'activation e',
    'structure type',
    'chemical family',
    'mobile ion',
    'Match Level',
    'Reliability',
    'Notes',
]
df = pd.DataFrame(columns=columns)
property_name = 'Ionic Conductivity'

# Property definition that is interpolated into the LLM prompts.
# Raw string on purpose: the LaTeX markup contains backslashes, and in a normal
# string literal "\f" (as in \frac) would be converted to a form-feed character
# while "\(", "\s" and "\m" are invalid escape sequences.
definition = r"""
Ionic conductivity is a measure of a material's ability to conduct electric current through the movement of ions. It is a critical property in the study of electrolytes, solid-state ionic conductors, and other materials used in batteries, fuel cells, and other electrochemical devices.

### Definition:
Ionic conductivity (\(\sigma\)) quantifies how well ions can move through a material under the influence of an electric field. It is defined as the electric current density (current per unit area) divided by the electric field strength. The higher the ionic conductivity, the more efficiently the material can transport ions.

### Formula:
\[ \sigma = \frac{J}{E} \]

Where:
- \(\sigma\) is the ionic conductivity.
- \(J\) is the current density (the current per unit area).
- \(E\) is the electric field strength.

### Unit:
The unit of ionic conductivity is siemens per meter (S/m).

In some contexts, especially for materials with low ionic conductivities, it might also be expressed in microsiemens per centimeter (\(\mu S/cm\)) or millisiemens per centimeter (mS/cm). 

1 S/m = 10,000 \(\mu S/cm\) = 10 mS/cm
"""

def extract_data(model, text, property_name, definition, doi, max_tokens=4096):
    """Ask one LLM backend to extract a property table from a paper section.

    Args:
        model: Backend selector, either 'claude' or 'gpt'.
        text: Section text of the paper; interpolated into the prompt.
        property_name: Name of the target property; interpolated into the prompt.
        definition: Definition of the property; interpolated into the prompt.
        doi: Value the model is told to use for the 'source' column.
        max_tokens: Upper bound on generated tokens, forwarded to the API call.
            The default is only a starting point; adjust it to the output
            limit of the model that is configured below.

    Returns:
        The reply text of the selected model as a string.

    Raises:
        ValueError: If ``model`` is neither 'claude' nor 'gpt'.
    """
    # Fail loudly instead of silently returning None for an unknown backend.
    if model not in ('claude', 'gpt'):
        raise ValueError(f"Unsupported model {model!r}; expected 'claude' or 'gpt'")

    # The column list includes 'temp_unit' so that it agrees with the
    # temperature guideline below, which asks for that column.
    prompt = f"""Analyze the uploaded section of scientific paper {text} on {property_name} with {definition} and create a table with the following columns, suitable for use in a deep learning regression model:
    section | chemical composition | source | exp. calc | temperature | temp_unit | conductivity | unit | activation e | structure type | chemical family | mobile ion
    Follow these guidelines:
    
    In the 'section' column, specify where the data comes from (e.g., 'Results and Discussion (Fig. 4)', 'Results and Discussion (text)', or both if applicable).
    For 'source', use {doi}.
    Indicate whether the data is experimental ('exp') or calculated ('calc').
    Provide temperature values and specify the unit in 'temp_unit' (e.g., °C or K).
    Give conductivity values in scientific notation when appropriate. 
    Specify the unit for conductivity.
    Provide activation energy values.
    For 'structure type', categorize according to the following options:
    Rocksalt, Anti-Perovskite, Argyrodite, LISICON, Zircon, Garnet, Thio-LISICON, Perovskite, NASICON, Glass, Lysonite, Olivine, Phenakite, Glass-Ceramic, Other
    If the structure doesn't fit these categories or if more detail is available, provide additional information (e.g., crystal system and space group).
    Specify the chemical family (e.g., 'thiophosphate').
    Identify the mobile ion (e.g., 'Li+').
    Do not generate any content that cannot be verified from the text.
    
    When extracting data:
    
    Ensure all numerical values are provided in a consistent format suitable for machine learning models.
    If a range of values is given, provide the mean value and note the range in a separate column or comment.
    For categorical data (like structure type or chemical family), ensure consistent terminology is used across all entries.
    If certain data points are missing, mark them as 'N/A' rather than leaving the cell empty.
    If the paper provides multiple data points for the same material under different conditions, include all of them as separate entries.
    
    Include all relevant data points from the paper, even if they represent different experimental conditions for the same material. If any information is missing or unclear, indicate this in the table.
    After creating the table, provide a brief summary of any key findings or notable aspects of the ionic conductivity data presented in the paper. Also, note any missing or unclear information in the paper regarding the requested data points.
    Finally, suggest any additional features or data transformations that might be useful for a deep learning regression model focusing on predicting ionic conductivity based on material properties."""

    if model == 'claude':
        response = anthropic.messages.create(
            model="claude 모델 입력",  # placeholder: fill in the Claude model id
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": prompt}]
        )
        # `content` is a list of content blocks; return the text of the first
        # one so that both backends hand back a plain string.
        return response.content[0].text

    # model == 'gpt'
    # NOTE(review): `openai.ChatCompletion` is the pre-1.0 interface of the
    # openai package - confirm the installed version still provides it.
    response = openai.ChatCompletion.create(
        model="gpt 모델 입력",  # placeholder: fill in the GPT model id
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens
    )
    return response.choices[0].message['content']

def compare_results(original_text, claude_result, gpt_result, property_name, definition, doi, preferred_model='gpt', max_tokens=4096):
    """Have Claude reconcile the two extractions into one final markdown table.

    Args:
        original_text: Section text both extractions were produced from.
        claude_result: Extraction output attributed to Claude.
        gpt_result: Extraction output attributed to GPT.
        property_name: Name of the target property; interpolated into the prompt.
        definition: Definition of the property; interpolated into the prompt.
        doi: Value written into the 'source' cells of the table template.
        preferred_model: Model whose result the prompt tells the reviewer to
            use on a mismatch ('gpt' or 'claude').
        max_tokens: Upper bound on generated tokens, forwarded to the API call.
            The default is only a starting point; adjust it to the output
            limit of the model that is configured below.

    Returns:
        The text of the first content block of the reply, expected to contain
        the 15-column comparison table.
    """
    # Use the same spelling as the "GPT's Result" / "Claude's Result" labels in
    # the prompt; str.capitalize() alone would render 'gpt' as 'Gpt'.
    display_names = {'gpt': 'GPT', 'claude': 'Claude'}
    preferred_label = display_names.get(preferred_model.lower(), preferred_model.capitalize())

    prompt = f""" Compare the data extracted by Claude and GPT from the same text. Present the final data table in the following format:
    
    | section | chemical composition | source | exp. calc | temperature | temp_unit | conductivity | unit | activation e | structure type | chemical family | mobile ion | Match Level | Reliability | Notes |
    |---------|----------------------|--------|-----------|-------------|-----------|--------------|------|--------------|----------------|-----------------|------------|-------------|-------------|-------|
    | {{section1}} | {{composition1}} | {doi} | {{exp_calc1}} | {{temperature1}} | {{temp_unit1}} | {{conductivity1}} | {{unit1}} | {{activation_e1}} | {{structure_type1}} | {{chemical_family1}} | {{mobile_ion1}} | {{Match Level}} | {{High/Medium/Low}} | {{Additional notes if needed}} |
    | {{section2}} | {{composition2}} | {doi} | {{exp_calc2}} | {{temperature2}} | {{temp_unit2}} | {{conductivity2}} | {{unit2}} | {{activation_e2}} | {{structure_type2}} | {{chemical_family2}} | {{mobile_ion2}} | {{Match Level}} | {{High/Medium/Low}} | {{Additional notes if needed}} |
    ...
    
    Original Text:
    {original_text}
    
    Claude's Result:
    {claude_result}
    
    GPT's Result:
    {gpt_result}
    
    Instructions:
    1. Carefully compare the original text with the results from both Claude and GPT to impove results by identifing missing data of {property_name} with {definition} and correcting inaccurate information.
    2. If you find any data in the original text that neither model extracted, add it to the table and note this in the 'Notes' column.
    3. Evaluate the match level using these criteria:
       - Exact Match: The data from both models are identical in words and numbers
       - Semantic Match: The expressions differ but convey the same information
       - Partial Match: Some information matches, but there are differences
       - Mismatch: The data from the two models are different or contradictory
       - Missing: Data present in the original text but not extracted by either model
    4. Determine the Final Value as follows:
       - For Exact or Semantic Match: Choose either model's result
       - For Partial Match: Select the more accurate or complete information
       - For Mismatch: Use the result from {preferred_label}
       - For Missing data: Use the information from the original text
    5. Assess Reliability:
       - High: Exact or Semantic Match, consistent with the original text
       - Medium: Partial Match or only one model matches the original text
       - Low: Mismatch, but {preferred_label}'s result is chosen
       - Low: Missing data that had to be manually added from the original text
    6. In the Notes column:
       - Briefly mention any discrepancies, additional context, or reasoning behind the decision
       - For missing data, indicate that it was not extracted by either model and had to be manually added
    7. Create a row for each unique composition, including any that were missed by both models but present in the original text. Use 'No data' if information is genuinely missing from the original text.
    8. Ensure that all numerical data extracted is accurate and correctly formatted for use in deep learning models.
    9. For the 'structure type' column, categorize according to the following options:
       Rocksalt, Anti-Perovskite, Argyrodite, LISICON, Zircon, Garnet, Thio-LISICON, Perovskite, NASICON, Glass, Lysonite, Olivine, Phenakite, Glass-Ceramic, Other
       If the structure doesn't fit these categories or if more detail is available, provide additional information (e.g., crystal system and space group).
    """

    response = anthropic.messages.create(
        model='claude 모델 입력',  # placeholder: fill in the Claude model id
        max_tokens=max_tokens,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.content[0].text

def parse_table(table_string, source):
    """Extract the data rows of a 15-column markdown table from LLM output.

    Lines without a pipe, and rows that cannot supply 15 cells, are ignored.
    The header row and the dashed separator row are recognised by their
    content rather than by their position, so a reply without a header does
    not lose its first two data rows and a repeated header is not returned as
    data.

    Args:
        table_string: Text that contains a pipe-delimited table.
        source: Value written into the third cell ('source') of every row.

    Returns:
        A list of rows; each row is a list of exactly 15 stripped strings.
    """
    expected_cells = 15
    rows = []
    for raw_line in table_string.strip().split('\n'):
        line = raw_line.strip()
        if '|' not in line:
            continue
        # Drop only the leading border pipe. The empty piece after a trailing
        # pipe is kept on purpose: it stands in as an empty last cell when the
        # row left out its final (Notes) cell.
        if line.startswith('|'):
            line = line[1:]
        cells = [cell.strip() for cell in line.split('|')][:expected_cells]
        if len(cells) < expected_cells:
            # A short row would not line up with the 15 column names.
            continue
        if cells[0].lower() == 'section' and cells[1].lower() == 'chemical composition':
            continue  # header row
        filled = [cell for cell in cells if cell]
        if filled and all(set(cell) <= {'-', ':'} and '-' in cell for cell in filled):
            continue  # markdown separator row such as |---|:--:|
        cells[2] = source  # overwrite whatever the model wrote as source
        rows.append(cells)
    return rows

def update_dataframe(new_data):
    """Append parsed rows to the global table, filter it, and save it as CSV.

    The rows are added to the module-level ``df``; rows whose 'conductivity'
    cell is exactly "No data" or "N/A" are then removed and the whole table is
    written to the CSV output path.

    Args:
        new_data: Rows to append, each with one value per entry of ``columns``.
    """
    global df
    incoming = pd.DataFrame(new_data, columns=columns)
    df = pd.concat([df, incoming], ignore_index=True)

    # Keep only rows that carry an actual conductivity value.
    conductivity = df['conductivity']
    has_value = (conductivity != "No data") & (conductivity != "N/A")
    df = df[has_value]

    df.to_csv('추출된 파일(csv형태)', index=False)

def process_section(section_text, section_name, property_name, definition, doi, preferred_model='gpt'):
    """Run the full pipeline for one section: extract, compare, parse, store.

    The section is sent to Claude and then to GPT, the two answers are
    reconciled by ``compare_results``, the resulting table is parsed and the
    rows are appended through ``update_dataframe``.

    Args:
        section_text: Text of the section to analyse.
        section_name: Accepted but not used by this function.
        property_name: Name of the target property.
        definition: Definition of the target property.
        doi: Identifier used as the 'source' of every extracted row.
        preferred_model: Model favoured by the comparison step on a mismatch.
    """
    extraction_args = (section_text, property_name, definition, doi)
    # Tuple order keeps the original call order: Claude first, then GPT.
    answers = {backend: extract_data(backend, *extraction_args) for backend in ('claude', 'gpt')}
    merged_table = compare_results(
        section_text,
        answers['claude'],
        answers['gpt'],
        property_name,
        definition,
        doi,
        preferred_model,
    )
    update_dataframe(parse_table(merged_table, doi))

# Process every text file in a folder, descending into sub-folders
# (original note: date folders added)
def process_all_files_in_folder(folder_path, property_name, definition, preferred_model='gpt'):
    """Walk ``folder_path`` recursively and process each '.txt' file found.

    Args:
        folder_path: Root directory to scan.
        property_name: Passed through to the per-file processing function.
        definition: Passed through to the per-file processing function.
        preferred_model: Passed through to the per-file processing function.

    NOTE(review): ``process_section_from_file`` is not defined in the visible
    part of this file - confirm it is provided elsewhere.
    """
    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            if not name.endswith('.txt'):
                continue
            # Handle one file at a time
            print(f"Processing file: {name}")
            process_section_from_file(os.path.join(current_dir, name), property_name, definition, preferred_model)

# Folder to scan for input text files (the string is a placeholder to fill in)
folder_path = "경로 설정"

# Run the pipeline over every file found under that folder
process_all_files_in_folder(
    folder_path=folder_path,
    property_name=property_name,
    definition=definition,
)