In [1]:
import pandas as pd
import numpy as np
import random
import warnings
from datetime import datetime, timedelta
from fuzzywuzzy import fuzz, process


# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells are hidden as well — confirm that is intended.
warnings.filterwarnings('ignore')
# Seed handed to the random generators in the example cells further down.
rseed = 151

In [2]:
# Main table (the file name suggests village geometries for Garut — confirm).
df = pd.read_csv(r'G:/My Drive/Spatial data/Superset/geom_desa_garut_20240610.csv')

# Administrative reference table: keep only the three naming columns and
# discard rows where any of them is missing.
adm = (
    pd.read_excel(r'G:/My Drive/Spatial data/Nama Lokasi/Garut - Daerah Administratif.xlsx')
    [['Desa/Village', 'Puskesmas', 'Kecamatan/District']]
    .dropna()
)
# Rows belonging to the 'Mekarmukti' Puskesmas are excluded.
adm = adm.loc[adm['Puskesmas'].ne('Mekarmukti')]

In [3]:
def fuzzy_match(row, choices, scorer=fuzz.ratio, cutoff=0):
    """
    Return the best fuzzy match for ``row`` among ``choices`` and its score.

    :param row: The value to look up (passed as the query to ``process.extractOne``)
    :param choices: Candidates to match against
    :param scorer: Scoring function from fuzzywuzzy
    :param cutoff: Minimum score to be considered a match
    :return: ``pd.Series([match, score])``; ``pd.Series([None, 0])`` when no
             candidate reaches ``cutoff`` (or ``choices`` is empty)
    """
    best = process.extractOne(row, choices, scorer=scorer, score_cutoff=cutoff)

    # extractOne yields None when nothing reaches the cutoff; unpacking that
    # directly would raise TypeError, so report "no match" explicitly instead.
    if best is None:
        return pd.Series([None, 0])

    # Index rather than unpack: dict-like choices produce (match, score, key)
    # triples, and only the first two items are wanted here.
    return pd.Series([best[0], best[1]])

In [4]:
def random_sum_array_with_range(xsum, xlen, xmin, xmax, seed):
    """
    Build a list of ``xlen`` random integers in ``[xmin, xmax]`` that add up to ``xsum``.

    The numeric arguments are coerced with ``int()``. The module-level ``random``
    generator is re-seeded with ``seed`` on every call, so identical arguments
    always give the identical list.

    :param xsum: Required total of the returned list
    :param xlen: Number of elements to generate
    :param xmin: Smallest value an element may take
    :param xmax: Largest value an element may take
    :param seed: Seed passed to ``random.seed``
    :return: List of ``xlen`` ints summing to ``xsum`` (empty list when ``xlen <= 0``)
    :raises ValueError: If ``xsum`` lies outside ``[xlen * xmin, xlen * xmax]``
    """
    xsum, xlen, xmin, xmax = int(xsum), int(xlen), int(xmin), int(xmax)

    # Set the seed for reproducibility of random numbers
    random.seed(seed)

    # Check if the desired sum can be achieved given the constraints
    if xsum > xlen * xmax:
        raise ValueError("Error: xsum is greater than xlen * xmax")

    if xsum < xlen * xmin:
        raise ValueError("Error: xsum is less than xlen * xmin")

    # Nothing to generate
    if xlen <= 0:
        return []

    # A single element has to carry the whole sum; the range checks above
    # already guarantee xmin <= xsum <= xmax. (A random draw here would
    # almost never add up to xsum.)
    if xlen == 1:
        return [xsum]

    # Start from independent uniform draws, then nudge them towards the target
    numbers = [random.randint(xmin, xmax) for _ in range(xlen)]

    diff = xsum - sum(numbers)
    while diff != 0:
        if diff > 0:
            # Too small: add 1 to the `diff` smallest elements (capped at xmax).
            # At most `diff` is added, so this branch never overshoots.
            smallest_indices = sorted(range(xlen), key=numbers.__getitem__)[:diff]
            for i in smallest_indices:
                numbers[i] = min(numbers[i] + 1, xmax)
        else:
            # Too large: take 1 off every element (floored at xmin). This may
            # undershoot; the branch above then fills the gap on the next pass.
            numbers = [max(value - 1, xmin) for value in numbers]
        diff = xsum - sum(numbers)

    # The adjustment works on sorted positions, so shuffle to remove any ordering
    random.shuffle(numbers)
    return numbers

In [5]:
def random_daily_array(df, value_col, start_date, end_date, seed):
    """
    Expand every row of ``df`` into one row per day, splitting ``value_col`` randomly.

    For each input row the value in ``value_col`` is distributed over the days
    from ``start_date`` (inclusive) to ``end_date`` (exclusive) so that the daily
    values add up to the original value. The input frame is not modified.

    :param df: Input DataFrame; ``value_col`` must hold non-negative numbers
    :param value_col: Name of the column to split into daily values
    :param start_date: First day, formatted ``YYYY-MM-DD``
    :param end_date: Day after the last day, formatted ``YYYY-MM-DD``
    :param seed: Base seed; each row uses ``seed * <row index label>``
    :return: New DataFrame with a leading ``Date`` column, the original columns
             (``value_col`` now holding daily values) and ``<value_col>_Sum``,
             the total of ``value_col`` over all rows for that date
    :raises ValueError: If the date range is empty or a value is negative
    """
    total_column_name = value_col + "_Sum"
    date_format = "%Y-%m-%d"

    first_day = datetime.strptime(start_date, date_format)
    total_days = (datetime.strptime(end_date, date_format) - first_day).days
    if total_days <= 0:
        # Fail early with a clear message instead of a division by zero or a
        # missing-column error further down.
        raise ValueError("end_date must be later than start_date")

    dates = [first_day + timedelta(days=offset) for offset in range(total_days)]

    # Collect the daily values for all rows in one flat list (row-major order)
    daily_values = []
    for i, row_total in zip(df.index, df[value_col].tolist()):
        if row_total > 0:
            daily_avg = row_total // total_days
            # Allow up to twice the daily average, or 0/1 when the total is
            # smaller than the number of days.
            upper = daily_avg * 2 if daily_avg >= 1 else 1
            # NOTE(review): seed * i is 0 for index label 0 whatever `seed` is;
            # kept as-is so previously generated data stays reproducible.
            daily_cases = random_sum_array_with_range(row_total, total_days, 0, upper, seed * i)
        elif row_total == 0:
            daily_cases = [0] * total_days
        else:
            # Negative (or NaN) values cannot be distributed
            raise ValueError("Value is negative!")
        daily_values.extend(daily_cases)

    # Repeat every input row total_days times in a single step rather than
    # concatenating one-row frames inside a loop (which is quadratic).
    positions = [pos for pos in range(len(df)) for _ in range(total_days)]
    df_out = df.iloc[positions].reset_index(drop=True)
    df_out.insert(0, 'Date', dates * len(df))
    df_out[value_col] = daily_values

    # Add a column with the total sum for each date
    df_out[total_column_name] = df_out.groupby('Date')[value_col].transform('sum')

    return df_out

In [6]:
# Merge Dataframes by text-matching
# Build a comparable text key on both tables: NAMOBJ with underscores turned
# into spaces on one side, "<district> <village>" on the other.
df['id'] = df['NAMOBJ'].str.replace('_', ' ')
adm['id'] = adm['Kecamatan/District'] + ' ' + adm['Desa/Village']

# For every reference row, look up the closest key in df and its score.
candidate_ids = df['id'].unique()
adm[['best_match', 'score']] = adm['id'].apply(fuzzy_match, choices=candidate_ids)

# When several reference rows point at the same key, keep the best-scoring one
# (ascending sort + keep='last').
adm = adm.sort_values('score').drop_duplicates('best_match', keep='last')

# Left join so that every df row survives; rows without a match get NaN.
puskesmas_lookup = adm[['best_match', 'Puskesmas']]
df = df.merge(puskesmas_lookup, how='left', left_on='id', right_on='best_match')

In [7]:
# Example A
df_a_cols = ['NAMOBJ', 'Puskesmas', 'WADMKD', 'WADMKC', 'WADMKK',
            'Antenatal_care', 'Basic_immunizations','Adolescent_screening',
            'Elderly_NCD_screening', 'geometry']

# Per indicator: (target mean per row, minimum value, maximum value)
df_a_indicators = {
    'Antenatal_care': (69, 50, 100),
    'Basic_immunizations': (87, 75, 100),
    'Adolescent_screening': (64, 33, 81),
    'Elderly_NCD_screening': (46, 25, 72),
}

# Start with empty indicator columns and keep only the columns of interest
df_a = df.copy()
for _indicator in df_a_indicators:
    df_a[_indicator] = np.nan
df_a = df_a[df_a_cols]

# Rows without a Puskesmas keep NaN indicators; only matched rows get values
_has_puskesmas = df_a['Puskesmas'].notna()
df_a1 = df_a[~_has_puskesmas]
# Explicit copy so the assignments below write to an independent frame
# rather than to a slice of df_a.
df_a2 = df_a[_has_puskesmas].copy()

# NOTE(review): every indicator is generated with the same seed, so all four
# columns start from the same random stream — confirm that is intended.
_n_rows = len(df_a2)
for _indicator, (_mean, _low, _high) in df_a_indicators.items():
    df_a2[_indicator] = random_sum_array_with_range(_mean * _n_rows, _n_rows, _low, _high, rseed)

# Put both parts back together in the original row order
df_a = pd.concat([df_a1, df_a2]).sort_index()

In [8]:
# Examples B
# Give every row a 'Value' between 0 and 100 such that all values total 30000,
# then split each value into daily rows from 2024-02-01 up to (not including)
# 2024-03-01.
df_b = random_daily_array(
    df.assign(Value=random_sum_array_with_range(30000, len(df), 0, 100, rseed)),
    'Value',
    '2024-02-01',
    '2024-03-01',
    rseed,
)