In [4]:
import pandas as pd
from sklearn.metrics import f1_score

In [5]:
import pandas as pd

def load_and_prepare_data(excel_path, csv_path):
    """Read both input tables and give the CSV the key column name 'Article'.

    Parameters
    ----------
    excel_path : path or file-like
        Passed straight to ``pd.read_excel``.
    csv_path : path or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The Excel frame exactly as read, and the CSV frame with its
        '1963 Item' column renamed to 'Article'.
    """
    excel_frame = pd.read_excel(excel_path)
    # rename() silently does nothing if '1963 Item' is not among the columns.
    csv_frame = pd.read_csv(csv_path).rename(columns={'1963 Item': 'Article'})
    return excel_frame, csv_frame

def filter_sort_and_remove_duplicates(df_excel, df_csv):
    """Inner-join the two frames on 'Article', keeping one row per article.

    Rows of ``df_excel`` whose 'Article' is 'Missing description' or 'Other'
    are dropped before the join. Both frames are sorted by 'Article', merged
    with an inner join on that column, and only the first row for each
    'Article' value is kept.

    Parameters
    ----------
    df_excel, df_csv : DataFrame
        Each must contain an 'Article' column.

    Returns
    -------
    DataFrame
        The merged frame with unique 'Article' values.
    """
    placeholder_labels = ['Missing description', 'Other']
    is_real_article = ~df_excel['Article'].isin(placeholder_labels)

    left = df_excel[is_real_article].sort_values(by='Article')
    right = df_csv.sort_values(by='Article')

    # De-duplication is keyed on 'Article' (the join key), first match wins.
    return (
        left.merge(right, on='Article', how='inner')
        .drop_duplicates(subset=['Article'], keep='first')
    )

def calculate_f1_score(row, digit):
    """Return 1 if the actual and predicted codes share their first ``digit`` characters, else 0.

    Despite the name, this is a per-row exact-prefix match indicator, not an
    F1 statistic. Each code is rendered as text, left-padded with zeros to
    eight characters, and truncated to ``digit`` characters before comparing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'HTS Code' and 'Final Predicted HS Code'.
    digit : int
        Number of leading characters to compare.

    Returns
    -------
    int
        1 on a prefix match; 0 on a mismatch, when either code is missing
        (None / NaN / pd.NA), or when a value cannot be interpreted.
    """
    def _prefix(value):
        # A missing code has nothing to compare; without this guard 'nan' would
        # be padded to '00000nan' and two missing codes would count as a match.
        if pd.isna(value):
            return None
        # Integer-valued floats (e.g. from a numeric column that also holds
        # NaN) would stringify as '1234567.0', which defeats the zero-padding
        # and shifts the prefix. Drop the fractional part first.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).zfill(8)[:digit]

    try:
        actual_code = _prefix(row['HTS Code'])
        predicted_code = _prefix(row['Final Predicted HS Code'])
    except (ValueError, TypeError):
        # e.g. a list-like cell makes pd.isna() ambiguous in a boolean context.
        return 0

    if actual_code is None or predicted_code is None:
        return 0
    return 1 if actual_code == predicted_code else 0

def apply_f1_scores_and_finalize(merged_df_unique):
    """Add 2-, 4- and 6-digit match columns and return the report columns.

    For each prefix length, ``calculate_f1_score`` is evaluated on every row
    and stored in a column named ``F1_<n>digit``.

    The input frame is not modified: scoring happens on a copy. Assigning the
    new columns directly onto a frame that was derived from another one (for
    instance the result of ``drop_duplicates``) can raise pandas'
    SettingWithCopyWarning and makes re-running the cell change its input.

    Parameters
    ----------
    merged_df_unique : DataFrame
        Must contain 'Article', 'HTS Code', 'Final Predicted HS Code' and
        'Final Confidence Level'.

    Returns
    -------
    DataFrame
        A new frame with exactly the columns 'Article', 'HTS Code',
        'Final Predicted HS Code', 'Final Confidence Level', 'F1_2digit',
        'F1_4digit', 'F1_6digit'.

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    scored = merged_df_unique.copy()

    for digits in (2, 4, 6):
        # A plain list is assigned positionally and is well defined even when
        # the frame has zero rows.
        scored[f'F1_{digits}digit'] = [
            calculate_f1_score(row, digits) for _, row in scored.iterrows()
        ]

    final_columns = ['Article', 'HTS Code', 'Final Predicted HS Code', 'Final Confidence Level',
                     'F1_2digit', 'F1_4digit', 'F1_6digit']
    return scored[final_columns]

In [6]:
if __name__ == '__main__':
    # All file locations used by this cell. The inputs are absolute paths on
    # the author's machine; the output is written to the working directory.
    output_csv_path = '1963_HS_Comparison.csv'
    csv_path = '/home/samirk08/UROP_SPRING_2024/1963/1963_SAMPLE_HYBRID.csv'
    excel_path = '/home/samirk08/UROP_SPRING_2024/1963/updated_1963Sample_with_hts.xlsx'

    # Pipeline: load -> filter / merge / de-duplicate -> score and trim columns.
    df_excel, df_csv = load_and_prepare_data(excel_path=excel_path, csv_path=csv_path)
    merged_df_unique = filter_sort_and_remove_duplicates(df_excel=df_excel, df_csv=df_csv)
    final_output_df = apply_f1_scores_and_finalize(merged_df_unique=merged_df_unique)

    # Write the report without the DataFrame index, then say where it went.
    final_output_df.to_csv(output_csv_path, index=False)
    print(f'Final output saved to {output_csv_path}')

Final output saved to 1963_HS_Comparison.csv
