In [None]:
import pandas as pd
from google.colab import drive

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Load Excel A and Excel B (replace with your actual file paths)
excel_a_path = '/content/drive/MyDrive/data_obat/dataset/disease_20241201.xlsx'
excel_b_path = '/content/drive/MyDrive/data_obat/dataset/drug/update_drug_1219_15.xlsx'

excel_a = pd.read_excel(excel_a_path)
excel_b = pd.read_excel(excel_b_path)

# Step 3: Create a dictionary mapping diseases to ids from Excel A.
# Keys are normalised the same way lookups are (stripped + lower-cased) so a
# name with stray surrounding whitespace in Excel A can still be matched.
# Rows without a disease name are skipped so no NaN key ends up in the mapping.
# If the same normalised name appears more than once, the last row wins.
has_disease_name = excel_a['disease_name'].notna()
normalised_names = (
    excel_a.loc[has_disease_name, 'disease_name']
    .astype(str)
    .str.strip()
    .str.lower()
)
disease_to_id = dict(zip(normalised_names, excel_a.loc[has_disease_name, 'id']))

# Step 4: Define the function to map diseases in Excel B to their corresponding ids
def map_disease_to_id(disease_str, mapping=None):
    """Replace each comma-separated disease name with its id.

    Args:
        disease_str: Cell value from the 'disease' column. Only strings are
            processed; any other value (e.g. NaN) is returned untouched.
        mapping: Optional dict of lower-cased disease name -> id. When omitted,
            the module-level ``disease_to_id`` dictionary is used.

    Returns:
        A ', '-joined string in which every name found in the mapping is
        replaced by ``str(id)``. Names that are not found are kept as written
        (only surrounding whitespace is removed). Non-string input is returned
        as-is.
    """
    if not isinstance(disease_str, str):
        return disease_str  # Return the original value if it's not a string

    lookup = disease_to_id if mapping is None else mapping

    mapped_tokens = []
    for raw_token in disease_str.split(','):
        name = raw_token.strip()
        key = name.lower()  # matching is case-insensitive
        if key in lookup:
            # str() so that non-string ids (e.g. integers) cannot break join()
            mapped_tokens.append(str(lookup[key]))
        else:
            # Keep unmatched names in their original casing instead of
            # silently lower-casing data that could not be converted.
            mapped_tokens.append(name)
    return ', '.join(mapped_tokens)

# Initialize the counter for updated rows
updated_rows_count = 0

# Step 5: Apply the function to the 'disease' column of Excel B, and update the counter
def update_and_count(row):
    """Map one row's 'disease' value to ids and count the row if it changed.

    Args:
        row: A row of Excel B (anything indexable by the key 'disease').

    Returns:
        The value produced by ``map_disease_to_id`` for this row.

    Side effects:
        Increments the module-level ``updated_rows_count`` when the mapped
        value differs from the original string.
    """
    global updated_rows_count
    original_disease = row['disease']
    updated_disease = map_disease_to_id(original_disease)
    # Only string cells can actually be rewritten. Without this guard an empty
    # cell (NaN) is counted as "updated" because NaN != NaN evaluates to True.
    if isinstance(original_disease, str) and original_disease != updated_disease:
        updated_rows_count += 1
    return updated_disease

# Run the row-wise mapper over Excel B and write the result back into 'disease'
mapped_disease_column = excel_b.apply(update_and_count, axis=1)
excel_b['disease'] = mapped_disease_column

# Step 6: Write the converted sheet to a new file on Google Drive
updated_excel_b_path = '/content/drive/MyDrive/data_obat/dataset/drug/uuid_update_drug_1219.xlsx'
excel_b.to_excel(updated_excel_b_path, index=False)

# Report how many rows changed and where the result was written
print(
    f"Number of rows updated: {updated_rows_count}",
    f"Updated Excel B saved to: {updated_excel_b_path}",
    sep="\n",
)


Mounted at /content/drive
Number of rows updated: 18125
Updated Excel B saved to: /content/drive/MyDrive/data_obat/dataset/drug/uuid_update_drug_1219.xlsx


In [None]:
import pandas as pd
from google.colab import drive

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Load Excel A and Excel B (replace with your actual file paths)
excel_a_path = '/content/drive/MyDrive/data_obat/dataset/disease_20241201.xlsx'
# NOTE(review): this validates 'update_drug_1215_2.xlsx', which is not the
# 'uuid_update_drug_1219.xlsx' file written by the conversion cell above;
# confirm this is the file that is meant to be checked.
excel_b_path = '/content/drive/MyDrive/data_obat/dataset/drug/update_drug_1215_2.xlsx'

excel_a = pd.read_excel(excel_a_path)
excel_b = pd.read_excel(excel_b_path)

# Step 3: Create a set of valid UUIDs from Excel A for validation.
# The disease cells are split into stripped *string* tokens, so the ids are
# normalised to stripped strings as well; missing ids are dropped first so the
# literal text 'nan' never becomes a valid id.
valid_uuids = set(excel_a['id'].dropna().astype(str).str.strip())

# Initialize the counter for failed rows
failed_rows_count = 0

# Step 4: Function to check if conversion failed or if the cell is empty
def is_conversion_failed(disease_str, valid_ids=None):
    """Tell whether a 'disease' cell is empty or contains an unknown id.

    Args:
        disease_str: Cell value; expected to be a comma-separated string of ids.
            Missing values (NaN/None) and blank strings count as failed.
            Any other non-string value is converted with ``str()`` before it
            is checked, instead of raising ``AttributeError`` on ``.strip()``.
        valid_ids: Optional container of accepted ids. When omitted, the
            module-level ``valid_uuids`` set is used.

    Returns:
        True if the cell is missing, blank, or has at least one token that is
        not an accepted id; False otherwise.

    Side effects:
        Increments the module-level ``failed_rows_count`` for every call that
        returns True.
    """
    global failed_rows_count
    accepted = valid_uuids if valid_ids is None else valid_ids

    if pd.isna(disease_str):
        failed = True
    else:
        text = disease_str if isinstance(disease_str, str) else str(disease_str)
        tokens = [token.strip() for token in text.split(',')]
        # Blank cell, or any token (including an empty one left by a stray
        # comma) that is not an accepted id.
        failed = (not text.strip()) or any(token not in accepted for token in tokens)

    if failed:
        failed_rows_count += 1
    return failed

# Step 5: Build a per-row flag and keep only the rows that failed validation
failed_mask = excel_b['disease'].apply(is_conversion_failed)
failed_conversions = excel_b[failed_mask]

# Step 6: Write the failing rows to their own Excel file
failed_conversions_path = '/content/drive/MyDrive/data_obat/dataset/drug/failed_conversions.xlsx'
failed_conversions.to_excel(failed_conversions_path, index=False)

print(f"Failed conversions saved to: {failed_conversions_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Failed conversions saved to: /content/drive/MyDrive/data_obat/dataset/drug/failed_conversions.xlsx
