In [3]:
import pandas as pd
import os
import re
import datetime

In [4]:
def getDate(filename):
    """Extract the first ``YYYY-MM-DD`` date embedded in a file name.

    Parameters
    ----------
    filename : str
        File name (or path) that is searched for a ``YYYY-MM-DD`` substring.

    Returns
    -------
    datetime.date or None
        The parsed date, or ``None`` when the name contains no
        ``YYYY-MM-DD`` substring or the matched digits do not form a valid
        calendar date.
    """
    match_str = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    # re.search yields None when nothing matches; calling .group() on it would
    # raise AttributeError, which the ValueError handler below does not catch.
    if match_str is None:
        return None
    try:
        return datetime.datetime.strptime(match_str.group(), '%Y-%m-%d').date()
    except ValueError:
        # The digits fit the pattern but are not a real date (e.g. month 13).
        return None

In [5]:
def cleanColumn(name):
    """Normalise a raw spreadsheet column header to a canonical key.

    The header is upper-cased and every character other than ASCII letters,
    digits and underscore is removed (this also strips spaces and newlines).
    A few known header variants are then renamed so that the same column
    carries the same name across workbooks.

    Parameters
    ----------
    name : object
        Raw column label. Non-string labels (for example numeric headers) are
        converted with ``str`` first instead of failing on ``.upper()``.

    Returns
    -------
    str
        The cleaned, possibly renamed, column name.
    """
    # Cleaned header -> canonical name used by the rest of the notebook.
    aliases = {
        'SHELTERNFI': 'SHELTER',
        'PROTECTIONCP': 'PC_CP',
        'PROTECTIONGBV': 'PC_GBV',
        'PROTECTIONMA': 'PC_MA',
        'TOTALREACHED': 'PEOPLEREACHED',
    }
    # The character filter already removes '\n', so no separate newline
    # replacement is needed afterwards.
    clean_name = re.sub(r'[^a-zA-Z0-9_]', '', str(name).upper())
    return aliases.get(clean_name, clean_name)

In [6]:
# Directory that is scanned for the input workbooks.
# Despite the name, the loading loop only reads files ending in '.xlsx' from it.
csv_dir = 'ukraine_data_excel'


In [7]:
# Per-workbook DataFrame accumulators, one independent list per worksheet:
#   sheet1_df -> 'Num_of_Orgs_by_Oblast'
#   sheet2_df -> 'People_Reached_by_Oblast'
#   sheet3_df -> 'UDE_Inputs'
sheet1_df, sheet2_df, sheet3_df = [], [], []

In [8]:
# Load the three tracked worksheets from every .xlsx workbook in csv_dir,
# clean their column names and tag each frame with the date taken from the
# workbook's file name.
sheet_targets = {
    'Num_of_Orgs_by_Oblast': sheet1_df,
    'People_Reached_by_Oblast': sheet2_df,
    'UDE_Inputs': sheet3_df,
}

# Empty the accumulators first so that re-running this cell does not append a
# second copy of every workbook.
for frames in sheet_targets.values():
    frames.clear()

# os.listdir returns entries in arbitrary order; sort for a reproducible run.
for filename in sorted(os.listdir(csv_dir)):
    if not filename.endswith('.xlsx'):
        continue
    file_path = os.path.join(csv_dir, filename)
    files_date = getDate(filename)
    # The context manager closes the workbook handle once its sheets are read.
    with pd.ExcelFile(file_path) as excel_df:
        for sheet in excel_df.sheet_names:
            # Sheets that are not collected are skipped without being parsed.
            if sheet not in sheet_targets:
                continue
            df = pd.read_excel(excel_df, sheet_name=sheet)
            df.columns = [cleanColumn(col) for col in df.columns]
            df['DATE'] = files_date
            sheet_targets[sheet].append(df)

In [9]:
# Stack every Num_of_Orgs_by_Oblast snapshot; the outer index level is the
# snapshot's DATE rendered as a string.
snapshot_keys = [str(frame['DATE'].iloc[0]) for frame in sheet1_df]
combined_sheet1 = pd.concat(sheet1_df, keys=snapshot_keys)

key_cols = ['ADMIN1_ID', 'DATE']

# Sum the numeric columns per (ADMIN1_ID, DATE). numeric_only is passed
# explicitly: relying on the implicit default emits a FutureWarning, and where
# the default is False the text columns (e.g. OBLAST) would be aggregated too.
res_sheet1 = combined_sheet1.groupby(key_cols).sum(numeric_only=True).reset_index()

# Re-attach the OBLAST label that the numeric-only aggregation dropped.
# NOTE(review): the lookup on the right is not de-duplicated, so a
# (ADMIN1_ID, DATE) pair that occurs in more than one input row comes back as
# repeated rows, and merged_sheet1 can have more rows than res_sheet1.
# Confirm whether that is intended.
merged_sheet1 = pd.merge(
    res_sheet1,
    combined_sheet1[key_cols + ['OBLAST']],
    on=key_cols,
    how='left',
)

  res_sheet1 = combined_sheet1.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [10]:
# Stack every People_Reached_by_Oblast snapshot; the outer index level is the
# snapshot's DATE rendered as a string.
snapshot_keys = [str(frame['DATE'].iloc[0]) for frame in sheet2_df]
combined_sheet2 = pd.concat(sheet2_df, keys=snapshot_keys)

key_cols = ['ADMIN1_ID', 'DATE']

# Sum the numeric columns per (ADMIN1_ID, DATE). numeric_only is passed
# explicitly: relying on the implicit default emits a FutureWarning, and where
# the default is False the text columns (e.g. OBLAST) would be aggregated too.
res_sheet2 = combined_sheet2.groupby(key_cols).sum(numeric_only=True).reset_index()

# Re-attach the OBLAST label that the numeric-only aggregation dropped.
# NOTE(review): the lookup on the right is not de-duplicated, so a
# (ADMIN1_ID, DATE) pair that occurs in more than one input row comes back as
# repeated rows, and merged_sheet2 can have more rows than res_sheet2.
# Confirm whether that is intended.
merged_sheet2 = pd.merge(
    res_sheet2,
    combined_sheet2[key_cols + ['OBLAST']],
    on=key_cols,
    how='left',
)

  res_sheet2 = combined_sheet2.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [11]:
# Stack every UDE_Inputs snapshot; the outer index level is the snapshot's
# DATE rendered as a string.
snapshot_keys = [str(frame['DATE'].iloc[0]) for frame in sheet3_df]
combined_sheet3 = pd.concat(sheet3_df, keys=snapshot_keys)

key_cols = ['ADMIN1_ID', 'DATE']

# Sum the numeric columns per (ADMIN1_ID, DATE). numeric_only is passed
# explicitly: relying on the implicit default emits a FutureWarning, and where
# the default is False the text columns (e.g. OBLAST) would be aggregated too.
res_sheet3 = combined_sheet3.groupby(key_cols).sum(numeric_only=True).reset_index()

# Re-attach the OBLAST label that the numeric-only aggregation dropped.
# NOTE(review): the lookup on the right is not de-duplicated, so a
# (ADMIN1_ID, DATE) pair that occurs in more than one input row comes back as
# repeated rows, and merged_sheet3 can have more rows than res_sheet3.
# Confirm whether that is intended.
merged_sheet3 = pd.merge(
    res_sheet3,
    combined_sheet3[key_cols + ['OBLAST']],
    on=key_cols,
    how='left',
)

  res_sheet3 = combined_sheet3.groupby(['ADMIN1_ID', 'DATE']).sum().reset_index()


In [12]:
# Compare (rows, columns) across the three stages of the first sheet:
# stacked input, per-key aggregate, and aggregate with OBLAST merged back.
for stage_frame in (combined_sheet1, res_sheet1, merged_sheet1):
    print(stage_frame.shape)

(625, 21)
(600, 19)
(625, 20)


In [13]:
# Num_of_Orgs_by_Oblast: preview the first 14 rows of the merged frame.
merged_sheet1.head(14)


Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,PROTECTIONTOTAL,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST
0,UA05,2022-04-28,2.0,1.0,1.0,5.0,17,16,10.0,0.0,16.0,2.0,2.0,2.0,11.0,6,7,42,0.0,Vinnytska
1,UA05,2022-05-06,2.0,1.0,1.0,5.0,20,17,10.0,0.0,0.0,2.0,3.0,2.0,11.0,6,8,45,0.0,Vinnytska
2,UA05,2022-05-12,2.0,1.0,1.0,6.0,20,18,10.0,0.0,19.0,4.0,4.0,2.0,14.0,6,9,49,0.0,Vinnytska
3,UA05,2022-05-19,2.0,1.0,1.0,6.0,20,20,10.0,0.0,21.0,6.0,4.0,2.0,14.0,6,9,50,0.0,Vinnytska
4,UA05,2022-05-26,0.0,1.0,1.0,6.0,20,20,9.0,0.0,21.0,6.0,4.0,4.0,12.0,7,9,48,0.0,Vinnytska
5,UA05,2022-06-02,0.0,1.0,1.0,6.0,23,20,11.0,0.0,23.0,8.0,4.0,3.0,0.0,7,9,58,0.0,Vinnytska
6,UA05,2022-06-09,1.0,1.0,1.0,6.0,23,20,11.0,0.0,23.0,8.0,4.0,3.0,0.0,8,9,58,0.0,Vinnytska
7,UA05,2022-06-16,1.0,1.0,1.0,6.0,24,20,13.0,0.0,23.0,10.0,5.0,3.0,0.0,9,11,59,0.0,Vinnytska
8,UA05,2022-06-23,1.0,1.0,1.0,6.0,24,20,13.0,0.0,23.0,10.0,5.0,3.0,0.0,9,11,59,0.0,Vinnytska
9,UA05,2022-06-30,1.0,1.0,1.0,6.0,22,20,12.0,0.0,23.0,10.0,5.0,3.0,0.0,10,13,56,0.0,Vinnytska


In [14]:
# Number of rows per distinct OBLAST value in the merged first-sheet frame.
merged_sheet1['OBLAST'].value_counts()

Vinnytska           25
Odeska              25
Chernihivska        25
Chernivetska        25
Cherkaska           25
Khmelnytska         25
Khersonska          25
Kharkivska          25
Ternopilska         25
Sumska              25
Rivnenska           25
Poltavska           25
Mykolaivska         25
Volynska            25
Lvivska             25
Luhanska            25
Kirovohradska       25
Kyivska             25
Ivano-Frankivska    25
Zaporizka           25
Zakarpatska         25
Zhytomyrska         25
Donetska            25
Dnipropetrovska     25
Kyiv                25
Name: OBLAST, dtype: int64

In [15]:
# Number of non-null PEOPLEREACHED entries in the merged UDE_Inputs frame.
merged_sheet3['PEOPLEREACHED'].count()

625

In [16]:
# Attach PEOPLEREACHED from the UDE_Inputs frame to the first-sheet frame.
# A plain column assignment aligns on the row index, i.e. on row position, so
# it is only correct while both frames have the same rows in the same order.
# Looking the value up by (ADMIN1_ID, DATE) removes that hidden requirement.
join_keys = ['ADMIN1_ID', 'DATE']

# One PEOPLEREACHED value per key; reindex needs a unique index to look up in.
reached_by_key = (
    merged_sheet3
    .drop_duplicates(subset=join_keys)
    .set_index(join_keys)['PEOPLEREACHED']
)

# Keys missing from the UDE_Inputs frame yield NaN. to_numpy() drops the
# lookup index so the values are written back row by row.
merged_sheet1['PEOPLE_REACHED'] = reached_by_key.reindex(
    pd.MultiIndex.from_frame(merged_sheet1[join_keys])
).to_numpy()

In [17]:
# Preview the first-sheet frame now that the PEOPLE_REACHED column is attached.
merged_sheet1.head()

Unnamed: 0,ADMIN1_ID,DATE,CCCM,CCS,ETC,EDUCATION,FSL,HEALTH,MPC,NUTRITION,...,PC_CP,PC_GBV,PC_MA,PC_PC,SHELTER,WASH,TOTAL,LOGISTICS,OBLAST,PEOPLE_REACHED
0,UA05,2022-04-28,2.0,1.0,1.0,5.0,17,16,10.0,0.0,...,2.0,2.0,2.0,11.0,6,7,42,0.0,Vinnytska,91200
1,UA05,2022-05-06,2.0,1.0,1.0,5.0,20,17,10.0,0.0,...,2.0,3.0,2.0,11.0,6,8,45,0.0,Vinnytska,113000
2,UA05,2022-05-12,2.0,1.0,1.0,6.0,20,18,10.0,0.0,...,4.0,4.0,2.0,14.0,6,9,49,0.0,Vinnytska,138800
3,UA05,2022-05-19,2.0,1.0,1.0,6.0,20,20,10.0,0.0,...,6.0,4.0,2.0,14.0,6,9,50,0.0,Vinnytska,140600
4,UA05,2022-05-26,0.0,1.0,1.0,6.0,20,20,9.0,0.0,...,6.0,4.0,4.0,12.0,7,9,48,0.0,Vinnytska,216600


In [18]:
# (rows, columns) of the first-sheet frame after adding PEOPLE_REACHED.
merged_sheet1.shape

(625, 21)

In [19]:
# People_Reached_by_Oblast: preview the first 10 rows of the merged frame.
merged_sheet2.head(10)

Unnamed: 0,ADMIN1_ID,DATE,FSL,GENERALPROTECTION,CP,MINEACTION,PROTECTIONTOTAL,HEALTH,MPC,SHELTER,WASH,PEOPLEREACHED,OBLAST
0,UA05,2022-04-28,7900,11200.0,100.0,500.0,11900,4600,91200.0,15800,2300.0,91200,Vinnytska
1,UA05,2022-05-06,14300,9600.0,100.0,500.0,14300,4600,113000.0,16200,2300.0,113000,Vinnytska
2,UA05,2022-05-12,15500,11400.0,200.0,500.0,16300,9900,138800.0,17500,3700.0,138800,Vinnytska
3,UA05,2022-05-19,17500,0.0,500.0,500.0,20300,23200,140600.0,17500,3700.0,140600,Vinnytska
4,UA05,2022-05-26,17600,0.0,4400.0,700.0,25600,23300,216600.0,21700,3800.0,216600,Vinnytska
5,UA05,2022-06-02,19000,0.0,21500.0,700.0,48000,23300,236000.0,33300,3800.0,236000,Vinnytska
6,UA05,2022-06-09,19000,0.0,21500.0,700.0,21000,23300,240100.0,40500,23800.0,240100,Vinnytska
7,UA05,2022-07-07,39200,0.0,46500.0,11800.0,94900,27500,214600.0,59400,42700.0,214600,Vinnytska
8,UA05,2022-07-14,39200,0.0,35700.0,11800.0,75700,27500,216000.0,60700,42700.0,216000,Vinnytska
9,UA05,2022-07-21,39200,0.0,35700.0,99800.0,174700,39000,217200.0,62600,42700.0,217200,Vinnytska


In [20]:
# UDE_Inputs: preview the first rows of the merged frame.
merged_sheet3.head()

Unnamed: 0,ADMIN1_ID,DATE,PEOPLEREACHED,NUMBEROFORGANISATIONS,OBLAST
0,UA05,2022-04-28,91200,42,Vinnytska
1,UA05,2022-05-06,113000,45,Vinnytska
2,UA05,2022-05-12,138800,49,Vinnytska
3,UA05,2022-05-19,140600,50,Vinnytska
4,UA05,2022-05-26,216600,48,Vinnytska
