In [1]:
import pandas as pd

In [2]:
def process_files(gpt_sample_path, hf_sample_path, output_path):
    # Load the files
    gpt_df = pd.read_csv(gpt_sample_path)
    hf_df = pd.read_csv(hf_sample_path)

    # Filter out "Missing description" and "Other" in "1963 Item" column
    gpt_df_filtered = gpt_df[~gpt_df['1963 Item'].str.contains("Missing description|Other", case=False, na=False)]
    hf_df_filtered = hf_df[~hf_df['1963 Item'].str.contains("Missing description|Other", case=False, na=False)]

    # Sort the filtered dataframes
    gpt_df_filtered_sorted = gpt_df_filtered.sort_values(by="1963 Item")
    hf_df_filtered_sorted = hf_df_filtered.sort_values(by="1963 Item")

    # Merge the filtered and sorted dataframes on "1963 Item"
    merged_filtered_df = pd.merge(gpt_df_filtered_sorted, hf_df_filtered_sorted, on="1963 Item", suffixes=('_GPT', '_HF'))

    # Determine the higher confidence score for each item
    merged_filtered_df['Higher Confidence Source'] = merged_filtered_df.apply(
        lambda x: 'GPT' if x['Confidence Score_GPT'] > x['Confidence Score_HF'] else 'HF', axis=1
    )

    # Include a column for the final confidence level
    merged_filtered_df['Final Confidence Level'] = merged_filtered_df.apply(
        lambda x: x['Confidence Score_GPT'] if x['Higher Confidence Source'] == 'GPT' else x['Confidence Score_HF'], axis=1
    )

    # Select data based on the higher confidence score
    final_filtered_df = merged_filtered_df[['1963 Item', 'Predicted HS Code_GPT', 'Predicted HS Code_HF', 'Higher Confidence Source', 'Final Confidence Level']].copy()
    final_filtered_df['Final Predicted HS Code'] = merged_filtered_df.apply(
        lambda x: x['Predicted HS Code_GPT'] if x['Higher Confidence Source'] == 'GPT' else x['Predicted HS Code_HF'], axis=1
    )

    # Drop the separate HS Code columns
    final_filtered_df = final_filtered_df.drop(['Predicted HS Code_GPT', 'Predicted HS Code_HF'], axis=1)

    # Save to the specified output path
    final_filtered_df.to_csv(output_path, index=False)

    return output_path

In [None]:
tariff_db_2022 = pd.read_excel('/home/samirk08/UROP_SPRING_2024/UROP IAP 2024/Original Databases/tariff database_202305.xlsx')


In [3]:
def get_2022_description(hs_code):
    # Find the description for the given HS code in the 2022 tariff database
    matched_row = tariff_db_2022[tariff_db_2022['HS Code'] == hs_code]
    if not matched_row.empty:
        # Return the description if found
        return matched_row.iloc[0]['Description']
    else:
        # Return None or a default description if not found
        return None

# Function to process files and map to 2022 descriptions
def process_files_with_description(gpt_sample_path, hf_sample_path, tariff_db_path, output_path):
    # Call the existing function to process the files
    final_output_path = process_files(gpt_sample_path, hf_sample_path, output_path)
    
    # Load the result
    final_df = pd.read_csv(final_output_path)

    # Load the 2022 tariff database
    tariff_db_2022 = pd.read_excel(tariff_db_path)

    # Map each HS code to its 2022 description
    final_df['2022 Description'] = final_df['Final Predicted HS Code'].apply(get_2022_description)

    # Save the updated DataFrame to a new CSV file
    final_df.to_csv(final_output_path, index=False)
    return final_output_path

In [4]:
# Paths to your files
gpt_sample_path = '/home/samirk08/UROP_SPRING_2024/1963/1963_GPT_Samlpe.csv'
hf_sample_path = '/home/samirk08/UROP_SPRING_2024/1963/HF_1963_Sample.csv'
tariff_db_path = '/home/samirk08/UROP_SPRING_2024/UROP IAP 2024/Original Databases/tariff database_202305.xlsx'
output_path = '1963_SAMPLE_HYBRID.csv'

In [5]:
final_output_path = process_files_with_description(gpt_sample_path, hf_sample_path, tariff_db_path, output_path)
print(f"Output with descriptions saved to: {final_output_path}")


NameError: name 'tariff_db_2022' is not defined