In [None]:
import pandas as pd

# Mount Google Drive so the Excel workbooks referenced below are reachable
from google.colab import drive
drive.mount('/content/drive')

# Define paths to your files in Google Drive
data_file_path = '/content/drive/MyDrive/data_obat/dataset/drug/update_drug_1219_14.xlsx'
condition_mapping_path = '/content/drive/MyDrive/data_obat/dataset/disease/condition_3.xlsx'
output_file_path = '/content/drive/MyDrive/data_obat/dataset/drug/update_drug_1219_15.xlsx'

# Load your Excel files
df = pd.read_excel(data_file_path)
condition_df = pd.read_excel(condition_mapping_path)

# Raw mapping: condition name -> comma-separated keyword string, one entry per sheet row
raw_condition_mapping = condition_df.set_index('Disease Name')['Mapped Disease Names from Symptoms'].to_dict()

# Build the final mapping: condition name -> list of trimmed, lower-cased keywords.
# Blank cells are read by pandas as NaN (a float), which has no .split(), so rows with
# a missing condition or missing keywords are skipped instead of crashing the cell.
# Empty tokens (from trailing or doubled commas) are dropped: an empty keyword would
# otherwise be able to "match" rows whose disease cell is empty.
condition_mapping = {}
for condition, keywords in raw_condition_mapping.items():
    if pd.isna(condition) or pd.isna(keywords):
        continue
    parsed_keywords = [keyword.strip().lower() for keyword in str(keywords).split(',')]
    parsed_keywords = [keyword for keyword in parsed_keywords if keyword]
    if parsed_keywords:
        condition_mapping[condition] = parsed_keywords

# Initialize a counter for the number of updated rows
updated_rows_count = 0

# Define a function to check conditions and update disease
def add_disease(row):
    """Append the first applicable mapped condition to a row's 'disease' value.

    The 'disease' cell is treated as a comma-separated list of terms. For each
    entry of the module-level ``condition_mapping`` (condition -> list of
    lower-cased keywords), the condition is appended when at least one keyword
    equals one of the row's terms and the condition is not already one of the
    terms. Only the first such condition is appended; the function then returns.

    Parameters:
        row: a DataFrame row (as passed by ``df.apply(..., axis=1)``) exposing
            ``row['disease']`` and ``row.name``.

    Returns:
        The lower-cased disease string with the condition appended when an
        update happened; otherwise the original, untouched ``row['disease']``
        value (which may be NaN).

    Side effects:
        Increments the module-level ``updated_rows_count`` and prints a debug
        line for every updated row.
    """
    global updated_rows_count
    original_value = row['disease']

    # Get the current disease value and handle NaN values
    disease_value = str(original_value).lower() if pd.notnull(original_value) else ""

    # Split on commas and trim each entry so "a,b" and "a , b" are handled like
    # "a, b"; empty entries are dropped so an empty cell yields no terms at all.
    disease_words = {term.strip() for term in disease_value.split(',') if term.strip()}

    # Iterate through each condition and its associated keywords
    for condition, keywords in condition_mapping.items():
        # A blank condition name in the mapping sheet cannot be appended meaningfully
        if pd.isna(condition):
            continue
        # Check if any of the keywords match exactly with the terms in disease
        if not any(keyword in disease_words for keyword in keywords):
            continue
        # Presence is tested per term and case-insensitively: disease_value is
        # lower-cased, so comparing the original-case condition against it would
        # re-add e.g. 'Diarrhea' to a value that already lists 'diarrhea'.
        if str(condition).strip().lower() in disease_words:
            continue
        # Combine the condition with the current disease value, handling NaN or empty cases
        new_disease = f"{disease_value}, {condition}".strip(", ")
        # Increment the counter for updated rows
        updated_rows_count += 1
        # Print debug information
        print(f"Row {row.name}: Added '{condition}' to disease. Updated disease: '{new_disease}'")
        return new_disease
    return original_value

# Apply the function to each row with a clean 0..n-1 index so the row numbers
# printed by add_disease (row.name) are positional.
# drop=True discards the old index instead of inserting it as a data column:
# without it an extra 'index' column is written to the output workbook, and
# because each output is the next pass's input, later passes would add
# 'level_0' and then fail with "cannot insert level_0, already exists".
df = df.reset_index(drop=True)
df['disease'] = df.apply(add_disease, axis=1)

# Save the modified DataFrame back to an Excel file in Google Drive
df.to_excel(output_file_path, index=False)

# Print the total number of updated rows
print(f"Total number of rows updated: {updated_rows_count}")
print(f"Output saved to {output_file_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Row 2: Added 'Diarrhea' to disease. Updated disease: 'diarrhea, gastroenteritis, hypermagnesemia, ankylosing spondylitis, kidney infection, dysentry, Diarrhea'
Row 12: Added 'urinary tract infection' to disease. Updated disease: 'muscle pain (myalgia), sprain, bruises, fever, parotitis, dysentery, typhoid fever, lymphadenitis, tonsillopharyngitis acute, brucellosis, diarrhea, pneumonia, hepatitis a, acute upper, immunization, varicella, singapore flu, hypermagnesemia, influenza, brucellosis, diphtheria, tuberculosis, hepatitis a, acute kidney failure, immunization, rhinosinusitis, rhinosinusitis acute, bacterial, urinary tract infection'
Row 13: Added 'urinary tract infection' to disease. Updated disease: 'muscle pain (myalgia), sprain, bruises, fever, parotitis, dysentery, typhoid fever, lymphadenitis, tonsillopharyngitis acute, brucellosis, diarrhea, pneumo

In [None]:
import pandas as pd

# Mount Google Drive so the Excel workbooks referenced below are reachable
from google.colab import drive
drive.mount('/content/drive')

# Define paths to your files in Google Drive
data_file_path = '/content/drive/MyDrive/data_obat/dataset/drugs_20241013_updated.xlsx'
condition_mapping_path = '/content/drive/MyDrive/data_obat/dataset/condition_mapping.xlsx'
output_file_path = '/content/drive/MyDrive/data_obat/dataset/drugs_20241013_updated2.xlsx'

# Load your Excel files
df = pd.read_excel(data_file_path)
condition_df = pd.read_excel(condition_mapping_path)

# Raw mapping: condition name -> comma-separated keyword string, one entry per sheet row
raw_condition_mapping = condition_df.set_index('condition')['keywords'].to_dict()

# Build the final mapping: condition name -> list of trimmed, lower-cased keywords.
# Blank cells are read by pandas as NaN (a float), which has no .split(), so rows with
# a missing condition or missing keywords are skipped instead of crashing the cell.
# Empty tokens (from trailing or doubled commas) are dropped so that an empty string
# never ends up being used as a keyword.
condition_mapping = {}
for condition, keywords in raw_condition_mapping.items():
    if pd.isna(condition) or pd.isna(keywords):
        continue
    parsed_keywords = [keyword.strip().lower() for keyword in str(keywords).split(',')]
    parsed_keywords = [keyword for keyword in parsed_keywords if keyword]
    if parsed_keywords:
        condition_mapping[condition] = parsed_keywords

# Initialize a counter for the number of updated rows
updated_rows_count = 0

# Define a function to check conditions and update disease
def add_disease(row):
    """Append every applicable mapped condition to a row's 'disease' value.

    The 'disease' cell is treated as a comma-separated list of terms. For each
    entry of the module-level ``condition_mapping`` (condition -> list of
    lower-cased keywords), the condition is appended when at least one keyword
    equals one of the row's original terms and the condition is not already
    listed. Conditions appended during this call do not trigger further keyword
    matches.

    Parameters:
        row: a DataFrame row (as passed by ``df.apply(..., axis=1)``) exposing
            ``row['disease']`` and ``row.name``.

    Returns:
        The stripped, lower-cased disease string with the new conditions
        appended when at least one was added; otherwise the original, untouched
        ``row['disease']`` value (NaN and non-string cells are passed through).

    Side effects:
        Prints the original value of every row, prints one debug line per
        appended condition, and increments the module-level
        ``updated_rows_count`` once per updated row.
    """
    global updated_rows_count

    # Check the disease value
    original_value = row['disease']

    # Print the original disease value and its type for debugging
    print(f"Original disease value (Row {row.name}): {original_value} (Type: {type(original_value)})")

    # NaN and other non-string cells have no terms to match against
    if isinstance(original_value, str):
        disease_value = original_value.strip().lower()
    else:
        disease_value = ""

    # Split on commas and trim each entry so "a,b" and "a , b" are handled like
    # "a, b"; empty entries are dropped so an empty cell yields no terms at all.
    disease_words = {term.strip() for term in disease_value.split(',') if term.strip()}

    # Terms considered "already present": starts as the row's own terms and grows
    # as conditions are appended. Kept separate from disease_words so that newly
    # appended conditions are not themselves used for keyword matching.
    present_terms = set(disease_words)

    # A flag to track if any updates are made
    updated = False

    # Iterate through each condition and its associated keywords
    for condition, keywords in condition_mapping.items():
        # A blank condition name in the mapping sheet cannot be appended meaningfully
        if pd.isna(condition):
            continue
        # Check if any of the keywords match exactly with the terms in disease
        if not any(keyword in disease_words for keyword in keywords):
            continue
        # Presence is tested per term and case-insensitively: disease_value is
        # lower-cased, so comparing the original-case condition against it would
        # re-add e.g. 'Diarrhea' to a value that already lists 'diarrhea'.
        condition_key = str(condition).strip().lower()
        if condition_key in present_terms:
            continue
        # Combine the condition with the current disease value, handling NaN or empty cases
        disease_value = f"{disease_value}, {condition}".strip(", ")
        present_terms.add(condition_key)
        updated = True
        # Print debug information
        print(f"Row {row.name}: Added '{condition}' to disease. Updated disease: '{disease_value}'")

    if not updated:
        # Leave untouched rows exactly as they were (no lower-casing, NaN stays NaN)
        return original_value

    # Count rows, not appended conditions, so the final total matches its label
    updated_rows_count += 1
    return disease_value

# Apply the function to each row with a clean 0..n-1 index so the row numbers
# printed by add_disease (row.name) are positional.
# drop=True discards the old index instead of inserting it as a data column:
# without it an extra 'index' column is written to the output workbook, and
# feeding that workbook back in would add 'level_0' on the next pass and then
# fail with "cannot insert level_0, already exists" on the one after.
df = df.reset_index(drop=True)
df['disease'] = df.apply(add_disease, axis=1)

# Save the modified DataFrame back to an Excel file in Google Drive
df.to_excel(output_file_path, index=False)

# Print the total number of updated rows
print(f"Total number of rows updated: {updated_rows_count}")
print(f"Output saved to {output_file_path}")

