This notebook cleans DB23 (`ncdd_admin_database_25provinces__2023.xlsx`, received from Lok on 2023-12-12) and processes it into a ready-for-analysis format.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from itables import show

In [2]:
# Reload imported modules automatically before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from utils.clean_DB23 import *

In [4]:
# Project directories, all resolved relative to the current working directory
# (the notebook is expected to be launched from its own folder).
code_path = Path.cwd()
project_root = code_path.parent.parent  # two levels above the notebook folder
datafd_path = project_root / 'data'
intmd_outfd_path = project_root / 'output_intmd' / 'match_CF_DB23'

## Initial processing of DB23

### Read in DB23 and stack all sheets vertically into `db_df`

In [5]:
# Load every sheet of the DB23 workbook in one call.
#   sheet_name=None -> dict of DataFrames keyed by sheet name
#   header=2        -> the third row holds the column names; the two rows above it are skipped
DB23_path = datafd_path / 'other' / 'ncdd_admin_database_25provinces__2023.xlsx'
DB23_df_dic = pd.read_excel(
    DB23_path,
    sheet_name=None,
    header=2,
)

In [6]:
# Sheet names and their DataFrames as two parallel lists
# (position i in one list corresponds to position i in the other).
sht_name_lst = list(DB23_df_dic)
db_prov_df_lst = list(DB23_df_dic.values())

In [7]:
# Disabled sanity check: asserts that every sheet has the same columns as the
# first sheet, prints each sheet's shape, and tallies the total row count.
# prov1_df = db_prov_df_lst[0]
# prov1_columns = prov1_df.columns
# nrow = prov1_df.shape[0]
# for df in db_prov_df_lst[1:]:
#     assert (df.columns == prov1_columns).all()
#     print(df.shape)
#     nrow = nrow + df.shape[0]
# nrow

In [8]:
# Insert province info as the first row of every sheet.
#
# The list is rebuilt from the raw sheets in `DB23_df_dic` on every run, so
# re-running this cell never stacks a second 'Province' row on top of the
# first (the previous in-place update of `db_prov_df_lst` did exactly that).
#
# NOTE(review): assumes sheet names look like "<2-digit code>??<name>" (code in
# the first two characters, name from the fifth character on) - confirm
# against the workbook.
colnames = DB23_df_dic[sht_name_lst[0]].columns  # column layout of the first sheet
db_prov_df_lst = []
for sht_name in sht_name_lst:
    prov_code = int(sht_name[:2])
    prov_name = sht_name[4:]
    # Values are matched to columns by position; zip stops at the shorter
    # sequence, so any column beyond the sixth is left for concat to fill with NaN.
    prov_row = dict(zip(
        colnames, ['Province', prov_code, np.nan, prov_name, np.nan, np.nan]
    ))
    prov_row_df = pd.DataFrame([prov_row])
    db_prov_df_lst.append(
        pd.concat([prov_row_df, DB23_df_dic[sht_name]]).reset_index(drop=True)
    )

In [9]:
# Preview the first sheet: row 0 should be the inserted 'Province' row.
db_prov_df_lst[0].head(2)

Unnamed: 0,Type,Code,Name (Khmer),Name (Latin),Reference,Official Note,Note (by Checker)
0,Province,1,,Banteay Meanchey,,,
1,ស្រុក,102,មង្គលបូរី,Mongkol Borei,ប្រកាសលេខ ៤៩៣ប្រ.ក,,


In [10]:
# Stack all province sheets vertically.
# ignore_index=True gives the stacked table one unique 0..N-1 index; without
# it each sheet keeps its own 0..n labels, so the same label repeats once per
# province and cannot identify a row.
db_df = pd.concat(db_prov_df_lst, ignore_index=True)

In [11]:
# Save the stacked table next to the source workbook.
# NOTE(review): the recorded run of this cell ended in a PermissionError -
# presumably the target file was open in another program; confirm and re-run.
stacked_csv_path = datafd_path / 'other' / 'ncdd_admin_database_25provinces__2023_stacked.csv'
db_df.to_csv(stacked_csv_path)

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\tianc\\OneDrive\\Documents\\SIG\\DISES\\data\\other\\ncdd_admin_database_25provinces__2023_stacked.csv'

In [44]:
# Flatten the stacked hierarchy into one record per village.
#
# `db_df` lists province, district, commune and village rows in sequence, each
# parent row followed by its children. The loop remembers the most recent
# parent of each level and attaches those values to every village row.

# Lists holding the transformed data (one entry per village row)
province_names = []
province_codes = []
district_names = []
district_codes = []
commune_names = []
commune_codes = []
village_rows = []

# Start from a clean state on every run: without this, a village row that
# precedes its parents raises NameError on a fresh kernel and silently reuses
# values left over from the previous run otherwise.
current_province = current_province_code = None
current_district = current_district_code = None
current_commune = current_commune_code = None

# NOTE(review): only the three Khmer type labels below are recognised; any
# other administrative row type present in the workbook lands in the final
# `else` branch and its villages keep the previous commune - confirm that no
# such rows exist.
for index, row in db_df.iterrows():
    row_type = row['Type']
    if row_type == 'Province':
        current_province = row['Name (Latin)']
        current_province_code = row['Code']
        # A new province starts: forget the previous province's children.
        current_district = current_district_code = None
        current_commune = current_commune_code = None
    elif row_type == 'ស្រុក':  # srok: district
        current_district = row['Name (Latin)']
        current_district_code = row['Code']
        # A new district starts: forget the previous district's commune.
        current_commune = current_commune_code = None
    elif row_type == 'ឃុំ':  # khum: commune
        current_commune = row['Name (Latin)']
        current_commune_code = row['Code']
    elif row_type == 'ភូមិ':  # phum: village
        if current_commune_code is None:
            # Make incomplete parent information visible instead of silent.
            print(f'row {index}: village (Code={row["Code"]!r}) has no preceding commune row')
        # Append current province, district, and commune information
        province_names.append(current_province)
        province_codes.append(current_province_code)
        district_names.append(current_district)
        district_codes.append(current_district_code)
        commune_names.append(current_commune)
        commune_codes.append(current_commune_code)
        village_rows.append(row)
    else:
        # Report the offending value so the row can be found and fixed.
        print(f'row {index} has unexpected Type {row_type!r} (Code={row["Code"]!r})')

In [45]:
# Assemble the village-level table: the village rows themselves plus the
# province / district / commune each one belongs to.
parent_cols = {
    'Province': province_names,
    'Province Code': province_codes,
    'District': district_names,
    'District Code': district_codes,
    'Commune': commune_names,
    'Commune Code': commune_codes,
}

village_data = (
    pd.DataFrame(village_rows)
    .assign(**parent_cols)      # lists are attached positionally, one value per village row
    .reset_index(drop=True)     # replace the inherited row labels with 0..N-1
)

# Show the first few rows of the result
village_data.head()

Unnamed: 0,Type,Code,Name (Khmer),Name (Latin),Reference,Official Note,Note (by Checker),Province,Province Code,District,District Code,Commune,Commune Code
0,ភូមិ,1020101,អូរធំ,Ou Thum,ប្រកាសលេខ ៤៩៣ ប្រ.ក,,,Banteay Meanchey,1,Mongkol Borei,102,Banteay Neang,10201
1,ភូមិ,1020102,ភ្នំ,Phnum,ប្រកាសលេខ ៤៩៣ ប្រ.ក,,,Banteay Meanchey,1,Mongkol Borei,102,Banteay Neang,10201
2,ភូមិ,1020103,បន្ទាយនាង,Banteay Neang,ប្រកាសលេខ ៤៩៣ ប្រ.ក,,,Banteay Meanchey,1,Mongkol Borei,102,Banteay Neang,10201
3,ភូមិ,1020104,គោកព្នៅ,Kouk Pnov,ប្រកាសលេខ ៤៩៣ ប្រ.ក,,,Banteay Meanchey,1,Mongkol Borei,102,Banteay Neang,10201
4,ភូមិ,1020105,ត្រាង,Trang,ប្រកាសលេខ ៤៩៣ ប្រ.ក,,,Banteay Meanchey,1,Mongkol Borei,102,Banteay Neang,10201
