In [4]:
import pandas as pd
import os
import re
import datetime

In [5]:
def getDate(filename):
    """Extract the first ``YYYY-MM-DD`` date embedded in a file name.

    Parameters
    ----------
    filename : str
        File name (or path) that may contain a date such as ``2022-04-28``.

    Returns
    -------
    datetime.date or None
        The parsed date, or ``None`` when the name holds no ``YYYY-MM-DD``
        token or the token is not a valid calendar date (e.g. ``2022-13-45``).
    """
    match_str = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    # re.search returns None when nothing matches; calling .group() on it
    # raises AttributeError, which the ValueError handler below would not catch.
    if match_str is None:
        return None
    try:
        return datetime.datetime.strptime(match_str.group(), '%Y-%m-%d').date()
    except ValueError:
        # Digits matched the pattern but do not form a real date.
        return None

In [6]:
def cleanColumn(name):
    """Normalise a raw column header to a canonical upper-case identifier.

    The header is converted to text, upper-cased, and stripped of every
    character that is not an ASCII letter, digit or underscore (spaces,
    slashes, parentheses and newlines are all removed). A few cleaned
    headers are then renamed to shorter canonical names.

    Parameters
    ----------
    name : object
        Raw column label. Non-string labels are converted with ``str()``
        before cleaning.

    Returns
    -------
    str
        The cleaned (and possibly aliased) column name.
    """
    # Cleaned header -> canonical name.
    # NOTE(review): the recorded People_Reached output shows both ADMIN1_ID
    # and ADM1_ID columns; possibly the same field under two headers --
    # confirm against the workbooks before adding an alias here.
    aliases = {
        'SHELTERNFI': 'SHELTER',
        'PROTECTIONCP': 'PC_CP',
        'PROTECTIONGBV': 'PC_GBV',
        'PROTECTIONMA': 'PC_MA',
    }
    # str() keeps non-string labels from crashing on .upper().
    # The substitution already removes newlines, so no separate newline
    # handling is needed afterwards.
    clean_name = re.sub(r'[^a-zA-Z0-9_]', '', str(name).upper())
    return aliases.get(clean_name, clean_name)

In [7]:
# Directory scanned for the source workbooks. Despite the 'csv' in the name,
# the loading loop in this notebook only reads files ending in '.xlsx'.
csv_dir = 'ukraine_data_excel'


In [8]:
# Accumulators for the xlsx loader: one independent list per collected sheet,
# each meant to hold one DataFrame per workbook.
sheet1_df, sheet2_df, sheet3_df = [], [], []

In [16]:
# Load the three report sheets from every .xlsx workbook in csv_dir, tagging
# each frame with the date parsed from the workbook's file name.
#
# The accumulators are rebuilt here so that re-running this cell does not
# append the same workbooks a second time.
sheet1_df, sheet2_df, sheet3_df = [], [], []
sheet_targets = {
    'Num_of_Orgs_by_Oblast': sheet1_df,
    'People_Reached_by_Oblast': sheet2_df,
    'UDE_Inputs': sheet3_df,
}

# sorted() because os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(csv_dir)):
    # A leading '~$' marks Excel's temporary lock files, which are not
    # readable workbooks even though they end in '.xlsx'.
    if not filename.endswith('.xlsx') or filename.startswith('~$'):
        continue
    file_path = os.path.join(csv_dir, filename)
    files_date = getDate(filename)
    # The context manager closes the workbook handle once its sheets are read.
    with pd.ExcelFile(file_path) as excel_df:
        for sheet in excel_df.sheet_names:
            target = sheet_targets.get(sheet)
            if target is None:
                # Not one of the three collected sheets; skip without parsing.
                continue
            df = pd.read_excel(excel_df, sheet_name=sheet)
            df.columns = [cleanColumn(col) for col in df.columns]
            df['DATE'] = files_date
            target.append(df)

In [17]:
# Num_of_Orgs_by_Oblast: stack the per-workbook frames (outer index level =
# the workbook's date as text), then total the columns per oblast id and date.
combined_sheet1 = pd.concat(sheet1_df, keys=[str(df['DATE'].iloc[0]) for df in sheet1_df])
res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
# Keep one OBLAST name per (ADMIN1_ID, DATE). Without this, the left merge
# emits one copy of each aggregated row for every source row sharing the key.
oblast_lookup1 = combined_sheet1[['ADMIN1_ID', 'DATE', 'OBLAST']].drop_duplicates(subset=['ADMIN1_ID', 'DATE'])
# When the aggregation keeps OBLAST (as in the recorded output), the merge
# suffixes the two copies to OBLAST_x / OBLAST_y; those names are preserved.
# validate= makes pandas raise if either side ever has repeated keys.
merged_sheet1 = pd.merge(res_sheet1, oblast_lookup1, on=['ADMIN1_ID', 'DATE'], how='left', validate='one_to_one')

In [18]:
# People_Reached_by_Oblast: stack the per-workbook frames (outer index level =
# the workbook's date as text), then total the columns per oblast id and date.
combined_sheet2 = pd.concat(sheet2_df, keys=[str(df['DATE'].iloc[0]) for df in sheet2_df])
res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
# Keep one OBLAST name per (ADMIN1_ID, DATE). Without this, the left merge
# emits one copy of each aggregated row for every source row sharing the key.
oblast_lookup2 = combined_sheet2[['ADMIN1_ID', 'DATE', 'OBLAST']].drop_duplicates(subset=['ADMIN1_ID', 'DATE'])
# When the aggregation keeps OBLAST (as in the recorded output), the merge
# suffixes the two copies to OBLAST_x / OBLAST_y; those names are preserved.
# validate= makes pandas raise if either side ever has repeated keys.
merged_sheet2 = pd.merge(res_sheet2, oblast_lookup2, on=['ADMIN1_ID', 'DATE'], how='left', validate='one_to_one')

In [19]:
# UDE_Inputs: stack the per-workbook frames (outer index level = the
# workbook's date as text), then total the columns per oblast id and date.
combined_sheet3 = pd.concat(sheet3_df, keys=[str(df['DATE'].iloc[0]) for df in sheet3_df])
res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()
# Keep one OBLAST name per (ADMIN1_ID, DATE). Without this, the left merge
# emits one copy of each aggregated row for every source row sharing the key.
oblast_lookup3 = combined_sheet3[['ADMIN1_ID', 'DATE', 'OBLAST']].drop_duplicates(subset=['ADMIN1_ID', 'DATE'])
# When the aggregation keeps OBLAST (as in the recorded output), the merge
# suffixes the two copies to OBLAST_x / OBLAST_y; those names are preserved.
# validate= makes pandas raise if either side ever has repeated keys.
merged_sheet3 = pd.merge(res_sheet3, oblast_lookup3, on=['ADMIN1_ID', 'DATE'], how='left', validate='one_to_one')

In [20]:
# Preview the first rows of the merged Num_of_Orgs_by_Oblast table.
merged_sheet1.head()


Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,LOGISTICS,...,SHELTER,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,WASH,TOTAL,CLUSTERLIST,PC_PC,OBLAST_y
0,UA05,2022-04-28,Vinnytska,2.0,1.0,1.0,5.0,17,16,0.0,...,6,16.0,2.0,2.0,2.0,7,42,"Camp Coordination & Camp Management,Coordinati...",11.0,Vinnytska
1,UA05,2022-05-06,Vinnytska,2.0,1.0,1.0,5.0,20,17,0.0,...,6,0.0,2.0,3.0,2.0,8,45,"Camp Coordination & Camp Management,Coordinati...",11.0,Vinnytska
2,UA05,2022-05-12,Vinnytska,2.0,1.0,1.0,6.0,20,18,0.0,...,6,19.0,4.0,4.0,2.0,9,49,"Camp Coordination & Camp Management,Coordinati...",14.0,Vinnytska
3,UA05,2022-05-19,Vinnytska,2.0,1.0,1.0,6.0,20,20,0.0,...,6,21.0,6.0,4.0,2.0,9,50,"Camp Coordination & Camp Management,Coordinati...",14.0,Vinnytska
4,UA05,2022-05-26,Vinnytska,0.0,1.0,1.0,6.0,20,20,0.0,...,7,21.0,6.0,4.0,4.0,9,48,"Coordination and Common Services,Emergency Tel...",12.0,Vinnytska


In [21]:
# Preview the first rows of the merged People_Reached_by_Oblast table.
merged_sheet2.head()

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,EDUCATION,FSL,CP,GBV,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,NUTRITION,SHELTER,WASH,TOTALREACHED,ADM1_ID,PEOPLEREACHED,GENERALPROTECTION,OBLAST_y
0,UA05,2022-04-28,Vinnytska,2800.0,7900,100.0,100.0,500.0,11900,4600,91200.0,0,15800,2300.0,91200.0,0,0.0,11200.0,Vinnytska
1,UA05,2022-05-06,Vinnytska,2800.0,14300,100.0,4100.0,500.0,14300,4600,113000.0,0,16200,2300.0,113000.0,0,0.0,9600.0,Vinnytska
2,UA05,2022-05-12,Vinnytska,4900.0,15500,200.0,4100.0,500.0,16300,9900,138800.0,0,17500,3700.0,138800.0,0,0.0,11400.0,Vinnytska
3,UA05,2022-05-19,Vinnytska,9000.0,17500,500.0,6100.0,500.0,20300,23200,140600.0,0,17500,3700.0,140600.0,0,0.0,0.0,Vinnytska
4,UA05,2022-05-26,Vinnytska,11500.0,17600,4400.0,6200.0,700.0,25600,23300,216600.0,0,21700,3800.0,216600.0,0,0.0,0.0,Vinnytska


In [22]:
# Preview the first rows of the merged UDE_Inputs table.
merged_sheet3.head()

Unnamed: 0,ADMIN1_ID,DATE,OBLAST_x,PEOPLEREACHED,NUMBEROFORGANISATIONS,CLUSTERSPRESENT,OBLAST_y
0,UA05,2022-04-28,Vinnytska,91200,42,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
1,UA05,2022-05-06,Vinnytska,113000,45,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
2,UA05,2022-05-12,Vinnytska,138800,49,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
3,UA05,2022-05-19,Vinnytska,140600,50,"Camp Coordination & Camp Management,Coordinati...",Vinnytska
4,UA05,2022-05-26,Vinnytska,216600,48,"Coordination and Common Services,Emergency Tel...",Vinnytska


In [23]:
# Column labels of the merged Num_of_Orgs_by_Oblast table.
merged_sheet1.columns

Index(['ADMIN1_ID', 'DATE', 'OBLAST_x', 'CCCM', 'CCS', 'ETC', 'EDUCATION',
       'FSL', 'HEALTH', 'LOGISTICS', 'MPC', 'NUTRITION', 'SHELTER',
       'PROTECTIONTOTAL', 'PC_CP', 'PC_GBV', 'PC_MA', 'WASH', 'TOTAL',
       'CLUSTERLIST', 'PC_PC', 'OBLAST_y'],
      dtype='object')

In [24]:
# (rows, columns) of the merged Num_of_Orgs_by_Oblast table.
merged_sheet1.shape

(625, 22)