In [2]:
import pandas as pd
import numpy as np
import os
import re
import datetime

In [3]:
def getDate(filename):
    """Extract the first ``YYYY-MM-DD`` date embedded in a file name.

    Parameters
    ----------
    filename : str
        File name (or path) expected to contain a date such as ``2022-11-10``.

    Returns
    -------
    datetime.date or None
        The parsed date, or ``None`` when the name contains no
        ``YYYY-MM-DD`` pattern or the matched digits are not a valid
        calendar date (e.g. ``2022-13-40``).
    """
    match_str = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    # re.search returns None when nothing matches; calling .group() on it
    # would raise AttributeError, which the ValueError handler below would
    # not catch.
    if match_str is None:
        return None
    try:
        return datetime.datetime.strptime(match_str.group(), '%Y-%m-%d').date()
    except ValueError:
        # Digits matched the pattern but do not form a real date.
        return None

In [4]:
# Normalised header -> canonical column name used in the rest of the notebook.
_COLUMN_ALIASES = {
    'SHELTERNFI': 'SHELTER',
    'PROTECTIONCP': 'PC_CP',
    'PROTECTIONGBV': 'PC_GBV',
    'PROTECTIONMA': 'PC_MA',
    'TOTALREACHED': 'PEOPLEREACHED',
}


def cleanColumn(name):
    """Normalise a raw worksheet header into a canonical column name.

    The header is upper-cased, every character other than ASCII letters,
    digits and underscore is removed (this includes spaces and newlines),
    and a handful of known variants are mapped to their canonical name.

    Parameters
    ----------
    name : object
        Raw column header. Non-string headers (for example an integer or a
        date read from a spreadsheet) are converted with ``str()`` first
        instead of raising ``AttributeError``.

    Returns
    -------
    str
        The cleaned column name.
    """
    clean_name = re.sub(r'[^a-zA-Z0-9_]', '', str(name).upper())
    # Fall back to the cleaned name itself when there is no alias for it.
    return _COLUMN_ALIASES.get(clean_name, clean_name)

In [5]:
# Directory that holds the source workbooks. Despite the variable name, the
# loader loop in this notebook only picks up files ending in '.xlsx'.
csv_dir = 'ukraine_data_excel'


In [6]:
# Per-worksheet accumulators, filled by the .xlsx loader loop with one
# DataFrame per source workbook:
#   sheet1_df -> 'Num_of_Orgs_by_Oblast'
#   sheet2_df -> 'People_Reached_by_Oblast'
#   sheet3_df -> 'UDE_Inputs'
sheet1_df, sheet2_df, sheet3_df = [], [], []

In [7]:
# Load every .xlsx workbook in csv_dir and collect the three worksheets of
# interest, one DataFrame per workbook, into their accumulator lists.
sheet_targets = {
    'Num_of_Orgs_by_Oblast': sheet1_df,
    'People_Reached_by_Oblast': sheet2_df,
    'UDE_Inputs': sheet3_df,
}

# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore the concatenation order downstream) reproducible.
for filename in sorted(os.listdir(csv_dir)):
    if not filename.endswith('.xlsx'):
        continue
    file_path = os.path.join(csv_dir, filename)
    # Report date is taken from the file name, not from the sheet contents.
    files_date = getDate(filename)
    # The context manager closes the workbook handle once its sheets are read.
    with pd.ExcelFile(file_path) as excel_df:
        for sheet in excel_df.sheet_names:
            target = sheet_targets.get(sheet)
            if target is None:
                # Worksheets that are not collected are not parsed at all.
                continue
            df = pd.read_excel(excel_df, sheet_name=sheet)
            df.columns = [cleanColumn(col) for col in df.columns]
            df['DATE'] = files_date
            target.append(df)

In [8]:
# Stack all 'Num_of_Orgs_by_Oblast' frames; the outer index level is the
# report date (as a string) of the workbook each frame came from.
combined_sheet1 = pd.concat(
    sheet1_df,
    keys=[str(frame['DATE'].iloc[0]) for frame in sheet1_df],
)

# Sum the numeric columns per oblast and report date. numeric_only=True keeps
# the text OBLAST column out of the sum: pandas >= 2.0 no longer drops it
# silently, and a summed OBLAST would collide with the one merged in below
# (yielding OBLAST_x / OBLAST_y).
res_sheet1 = (
    combined_sheet1
    .groupby(['ADMIN1_ID', 'DATE'])
    .sum(numeric_only=True)
    .reset_index()
)

# Re-attach the oblast name to each aggregated row.
# NOTE(review): the right-hand frame is not de-duplicated, so a key that
# occurs in more than one source row produces one output row per occurrence
# (each carrying the summed values). Confirm whether that is intended.
merged_sheet1 = pd.merge(
    res_sheet1,
    combined_sheet1[['ADMIN1_ID', 'DATE', 'OBLAST']],
    on=['ADMIN1_ID', 'DATE'],
    how='left',
)

In [9]:
# Stack all 'People_Reached_by_Oblast' frames; the outer index level is the
# report date (as a string) of the workbook each frame came from.
combined_sheet2 = pd.concat(
    sheet2_df,
    keys=[str(frame['DATE'].iloc[0]) for frame in sheet2_df],
)

# Sum the numeric columns per oblast and report date. numeric_only=True keeps
# the text OBLAST column out of the sum: pandas >= 2.0 no longer drops it
# silently, and a summed OBLAST would collide with the one merged in below
# (yielding OBLAST_x / OBLAST_y).
res_sheet2 = (
    combined_sheet2
    .groupby(['ADMIN1_ID', 'DATE'])
    .sum(numeric_only=True)
    .reset_index()
)

# Re-attach the oblast name to each aggregated row.
# NOTE(review): the right-hand frame is not de-duplicated, so a key that
# occurs in more than one source row produces one output row per occurrence
# (each carrying the summed values). Confirm whether that is intended.
merged_sheet2 = pd.merge(
    res_sheet2,
    combined_sheet2[['ADMIN1_ID', 'DATE', 'OBLAST']],
    on=['ADMIN1_ID', 'DATE'],
    how='left',
)

In [10]:
# Stack all 'UDE_Inputs' frames; the outer index level is the report date
# (as a string) of the workbook each frame came from.
combined_sheet3 = pd.concat(
    sheet3_df,
    keys=[str(frame['DATE'].iloc[0]) for frame in sheet3_df],
)

# Sum the numeric columns per oblast and report date. numeric_only=True keeps
# the text OBLAST column out of the sum: pandas >= 2.0 no longer drops it
# silently, and a summed OBLAST would collide with the one merged in below
# (yielding OBLAST_x / OBLAST_y).
res_sheet3 = (
    combined_sheet3
    .groupby(['ADMIN1_ID', 'DATE'])
    .sum(numeric_only=True)
    .reset_index()
)

# Re-attach the oblast name to each aggregated row.
# NOTE(review): the right-hand frame is not de-duplicated, so a key that
# occurs in more than one source row produces one output row per occurrence
# (each carrying the summed values). Confirm whether that is intended.
merged_sheet3 = pd.merge(
    res_sheet3,
    combined_sheet3[['ADMIN1_ID', 'DATE', 'OBLAST']],
    on=['ADMIN1_ID', 'DATE'],
    how='left',
)

In [11]:
# Attach PEOPLE_REACHED from the UDE_Inputs table by (ADMIN1_ID, DATE) key
# instead of by row index, so the values stay correct even if the two tables
# differ in row count or order.
people_reached_lookup = (
    merged_sheet3[['ADMIN1_ID', 'DATE', 'PEOPLEREACHED']]
    # One lookup row per key, so the left merge cannot add rows.
    .drop_duplicates(subset=['ADMIN1_ID', 'DATE'])
    .rename(columns={'PEOPLEREACHED': 'PEOPLE_REACHED'})
)
merged_sheet1 = (
    merged_sheet1
    # Dropping a previous PEOPLE_REACHED keeps this cell safe to re-run.
    .drop(columns='PEOPLE_REACHED', errors='ignore')
    .merge(
        people_reached_lookup,
        on=['ADMIN1_ID', 'DATE'],
        how='left',
        validate='many_to_one',
    )
)

In [12]:
# Preview of the aggregated 'Num_of_Orgs_by_Oblast' table, including the
# PEOPLE_REACHED column taken from the 'UDE_Inputs' table.
merged_sheet1.head()


Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,...,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST,PEOPLE_REACHED
0,UA05,2022-04-28,2.0,1.0,1.0,5.0,17,16,10.0,0.0,...,2.0,2.0,2.0,11.0,6,7,42,0.0,Vinnytska,91200
1,UA05,2022-05-06,2.0,1.0,1.0,5.0,20,17,10.0,0.0,...,2.0,3.0,2.0,11.0,6,8,45,0.0,Vinnytska,113000
2,UA05,2022-05-12,2.0,1.0,1.0,6.0,20,18,10.0,0.0,...,4.0,4.0,2.0,14.0,6,9,49,0.0,Vinnytska,138800
3,UA05,2022-05-19,2.0,1.0,1.0,6.0,20,20,10.0,0.0,...,6.0,4.0,2.0,14.0,6,9,50,0.0,Vinnytska,140600
4,UA05,2022-05-26,0.0,1.0,1.0,6.0,20,20,9.0,0.0,...,6.0,4.0,4.0,12.0,7,9,48,0.0,Vinnytska,216600


In [13]:
# Preview of the aggregated 'People_Reached_by_Oblast' table.
merged_sheet2.head()

Unnamed: 0,ADMIN1_ID,DATE,FSL,GENERALPROTECTION,CP,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,SHELTER,WASH,PEOPLEREACHED,OBLAST
0,UA05,2022-04-28,7900,11200.0,100.0,500.0,11900,4600,91200.0,15800,2300.0,91200,Vinnytska
1,UA05,2022-05-06,14300,9600.0,100.0,500.0,14300,4600,113000.0,16200,2300.0,113000,Vinnytska
2,UA05,2022-05-12,15500,11400.0,200.0,500.0,16300,9900,138800.0,17500,3700.0,138800,Vinnytska
3,UA05,2022-05-19,17500,0.0,500.0,500.0,20300,23200,140600.0,17500,3700.0,140600,Vinnytska
4,UA05,2022-05-26,17600,0.0,4400.0,700.0,25600,23300,216600.0,21700,3800.0,216600,Vinnytska


In [14]:
# Preview of the aggregated 'UDE_Inputs' table.
merged_sheet3.head()

Unnamed: 0,ADMIN1_ID,DATE,PEOPLEREACHED,NUMBEROFORGANISATIONS,OBLAST
0,UA05,2022-04-28,91200,42,Vinnytska
1,UA05,2022-05-06,113000,45,Vinnytska
2,UA05,2022-05-12,138800,49,Vinnytska
3,UA05,2022-05-19,140600,50,Vinnytska
4,UA05,2022-05-26,216600,48,Vinnytska


In [15]:
# Blank template shaped like merged_sheet1: every cell starts as NaN, then
# only the DATE and OBLAST columns are filled in from merged_sheet1.
copy_df = pd.DataFrame(
    np.nan,
    index=merged_sheet1.index,
    columns=merged_sheet1.columns,
).assign(
    DATE=merged_sheet1['DATE'],
    OBLAST=merged_sheet1['OBLAST'],
)
copy_df.head()

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,...,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST,PEOPLE_REACHED
0,,2022-04-28,,,,,,,,,...,,,,,,,,,Vinnytska,
1,,2022-05-06,,,,,,,,,...,,,,,,,,,Vinnytska,
2,,2022-05-12,,,,,,,,,...,,,,,,,,,Vinnytska,
3,,2022-05-19,,,,,,,,,...,,,,,,,,,Vinnytska,
4,,2022-05-26,,,,,,,,,...,,,,,,,,,Vinnytska,


In [16]:
# Write the blank template to the 'Num_of_Orgs_by_Oblast' sheet of each
# round's workbook. Each writer is used as a context manager so the workbook
# is saved and its file handle closed when the block exits; any further
# sheets for a workbook must be written inside the same `with` block.
with pd.ExcelWriter('Data_ Round 33 - Ukraine 5W - 2022-11-10.xlsx', engine='xlsxwriter') as excel_writer1:
    copy_df.to_excel(excel_writer1, sheet_name='Num_of_Orgs_by_Oblast', index=False)

with pd.ExcelWriter('Data_ Round 34 - Ukraine 5W - 2022-11-25.xlsx', engine='xlsxwriter') as excel_writer2:
    copy_df.to_excel(excel_writer2, sheet_name='Num_of_Orgs_by_Oblast', index=False)

In [17]:
# Column inventory of the organisations table after header cleaning.
merged_sheet1.columns

Index(['ADMIN1_ID', 'DATE', 'CCCM', 'CCS', 'ETC', 'EDUCATION', 'FSL', 'HEALTH',
       'MPC', 'NUTRITION', 'PROTECTIONTOTAL', 'PC_CP', 'PC_GBV', 'PC_MA',
       'PC_PC', 'SHELTER', 'WASH', 'TOTAL', 'LOGISTICS', 'OBLAST',
       'PEOPLE_REACHED'],
      dtype='object')

In [18]:
# (rows, columns) of the organisations table.
merged_sheet1.shape

(625, 21)

In [19]:
# (rows, columns) of the UDE_Inputs table, the source of PEOPLE_REACHED.
merged_sheet3.shape

(625, 5)

In [20]:
# Rows per report date.
# NOTE(review): a date with more rows than the others likely means several
# source rows shared the same (ADMIN1_ID, DATE) key before aggregation; verify.
merged_sheet1.DATE.value_counts()

2022-09-29    50
2022-04-28    25
2022-05-06    25
2022-10-13    25
2022-09-22    25
2022-09-15    25
2022-09-08    25
2022-08-25    25
2022-08-18    25
2022-08-11    25
2022-08-04    25
2022-07-28    25
2022-07-21    25
2022-07-14    25
2022-07-07    25
2022-06-30    25
2022-06-23    25
2022-06-16    25
2022-06-09    25
2022-06-02    25
2022-05-26    25
2022-05-19    25
2022-05-12    25
2022-10-27    25
Name: DATE, dtype: int64