In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
from lxml import etree

In [42]:
def parse_large_xml(file_path, target_elements):
    """
    Parse a large XML file by iteratively processing elements
    
    Parameters:
    file_path (str): Path to the XML file
    target_elements (list): List of element tags to extract
    
    Returns:
    list: Extracted data
    """
    data = []
    
    # Create iterative parser
    context = etree.iterparse(file_path, events=('end',), tag=target_elements)
    
    for event, elem in context:
        # Extract data from the element
        item = {}
        for child in elem:
            item[child.tag] = child.text
        
        data.append(item)
        
        # Clear element to free memory
        elem.clear()
        
    return data

def convert_df_to_nested_dict(df):
    """
    Convert DataFrame to nested dictionary with proper chapter hierarchy
    """
    nested_dict = {}
    
    for _, row in df.iterrows():
        capitol = row['CAPITOL']
        subcapitol = row['SUBCAPITOL'] if pd.notna(row['SUBCAPITOL']) else ''
        
        # Initialize capitol if it doesn't exist
        if capitol not in nested_dict:
            nested_dict[capitol] = {}
            
        # Initialize subcapitol if it doesn't exist
        if subcapitol not in nested_dict[capitol]:
            nested_dict[capitol][subcapitol] = {
                'paragraf': row['PARAGRAF'] if pd.notna(row['PARAGRAF']) else '',
                'grupa': row['GRUPA'] if pd.notna(row['GRUPA']) else '',
                'denumire': row['DENUMIRE'],
                'program_2025': row['PROGRAM_2025'],
                'estimari2026': row['ESTIMARI2026'],
                'estimari2027': row['ESTIMARI2027'],
                'estimari2028': row['ESTIMARI2028']
            }
    
    return nested_dict



def get_romanian_budget(file_path, target_elements):
    df = pd.DataFrame(parse_large_xml(file_path, target_elements))
    df.drop(['TITLU_RAPORT', 'ANEXA', 'COD_ORDONATOR', 'ORDONATOR'], axis=1, inplace=True)

    return df


In [43]:
file_path = 'anexa1_bs_2025.xml'
target_elements = ['G_TITLU_RAPORT']  # Adjust based on your XML structure

df = get_romanian_budget(file_path, target_elements)
# Convert the DataFrame to nested dictionary
budget_dict = convert_df_to_nested_dict(df)

# Example usage:
# Get all subcapitols for a specific capitol
print("Subcapitols for capitol '6501':")
print(budget_dict['6501'].keys())

# Get details for a specific subcapitol
print("\nDetails for capitol '6501', subcapitol '01':")
print(budget_dict['6501']['01'])

Subcapitols for capitol '6501':
dict_keys(['', '  ', '01', '02', '03', '04', '05', '06', '07', '11', '13', '50'])

Details for capitol '6501', subcapitol '01':
{'paragraf': '', 'grupa': '', 'denumire': ' Administratie centrala', 'program_2025': None, 'estimari2026': None, 'estimari2027': None, 'estimari2028': None}


In [44]:
budget_dict.keys()

dict_keys(['0001', '0002', '0003', '0004', '0005', '0101', '0201', '0300', '0301', '0401', '0500', '0501', '0700', '0701', '1000', '1001', '1101', '1201', '1401', '1601', '1700', '1701', '2000', '2001', '2101', '2900', '3000', '3001', '3101', '3300', '3301', '3401', '3501', '3601', '3900', '3901', '4501', '4801', '4901', '5001', '5100', '5101', '5301', '5401', '5501', '5601', '6000', '6001', '6101', '6500', '6501', '6601', '6701', '6801', '7000', '7001', '7401', '8000', '8001', '8101', '8201', '8301', '8401', '8501', '8601', '8701', '9901'])

In [45]:
budget_dict

{'0001': {'01': {'paragraf': '',
   'grupa': '',
   'denumire': 'VENITURI - TOTAL',
   'program_2025': '357.353.033',
   'estimari2026': '349.169.360',
   'estimari2027': '349.399.128',
   'estimari2028': '367.798.364'}},
 '0002': {'01': {'paragraf': '  ',
   'grupa': '',
   'denumire': 'I.VENITURI CURENTE',
   'program_2025': '278.999.496',
   'estimari2026': '296.761.619',
   'estimari2027': '317.534.856',
   'estimari2028': '339.093.921'}},
 '0003': {'01': {'paragraf': '  ',
   'grupa': '',
   'denumire': 'A.VENITURI FISCALE',
   'program_2025': '234.103.618',
   'estimari2026': '251.711.677',
   'estimari2027': '270.005.482',
   'estimari2028': '289.075.837'}},
 '0004': {'01': {'paragraf': '  ',
   'grupa': '',
   'denumire': 'A1.IMPOZIT PE VENIT, PROFIT SI CASTIGURI DIN CAPITAL',
   'program_2025': '60.729.767',
   'estimari2026': '65.390.577',
   'estimari2027': '69.551.838',
   'estimari2028': '73.713.645'}},
 '0005': {'01': {'paragraf': '  ',
   'grupa': '',
   'denumire': 'A11

In [38]:
df[df['PARAGRAF'] == '01']


Unnamed: 0,CAPITOL,SUBCAPITOL,PARAGRAF,GRUPA,ARTICOL,ALINEAT,DENUMIRE,PROGRAM_2025,ESTIMARI2026,ESTIMARI2027,ESTIMARI2028
312,5101,1,1,,,,Administratia prezidentiala,,,,
313,5101,1,1,,,,II.Credite bugetare,89.915,,,
522,6001,50,1,,,,Structuri militare nationale,,,,
523,6001,50,1,,,,II.Credite bugetare,48.389,,,
575,6101,3,1,,,,Politie,,,,
576,6101,3,1,,,,II.Credite bugetare,8.762.874,,,
691,6501,3,1,,,,Invatamant prescolar,,,,
692,6501,3,1,,,,II.Credite bugetare,7.759.670,,,
697,6501,4,1,,,,Invatamant secundar inferior,,,,
698,6501,4,1,,,,II.Credite bugetare,9.933.827,,,


In [35]:
df[df['PARAGRAF'].isnull()]


Unnamed: 0,CAPITOL,SUBCAPITOL,PARAGRAF,GRUPA,ARTICOL,ALINEAT,DENUMIRE,PROGRAM_2025,ESTIMARI2026,ESTIMARI2027,ESTIMARI2028
0,0001,01,,,,,VENITURI - TOTAL,357.353.033,349.169.360,349.399.128,367.798.364
5,0101,,,,,,IMPOZIT PE PROFIT,41.160.812,46.228.568,49.629.181,53.036.140
6,0101,01,,,,,Impozit pe profit de la agentii economici,39.259.016,44.104.350,47.283.538,50.527.169
7,0101,02,,,,,Impozit pe profit de la bancile comerciale,3.254.672,3.517.681,3.775.336,4.038.214
8,0101,03,,,,,Sume redirectionate din impozitul pe profit (...,-1.352.876,-1.393.463,-1.429.693,-1.529.243
...,...,...,...,...,...,...,...,...,...,...,...
1446,8701,,,,,,I.Credite de angajament,26.803,26.803,26.803,26.803
1447,8701,,,,,,II.Credite bugetare,26.803,26.803,26.803,26.803
1457,8701,04,,,,,Turism,,,,
1458,8701,04,,,,,II.Credite bugetare,26.803,,,


In [5]:
context = etree.iterparse("anexa1_bs_2025.xml", events=('end',), tag="G_TITLU_RAPORT")


In [None]:
capitol: {
    subcapitol: {
        paragraf: 
        grupa: 
        articol: 
        alineat: 
        denumire: 
        program_2025: 
        estimari2026: 
        estimari2027: 
        estimari2028: 
    }
}

In [9]:
data = parse_large_xml("anexa1_bs_2025.xml", ["G_TITLU_RAPORT"])

In [16]:
list_of_chapters = []

for element in data:
    list_of_chapters.append([element['CAPITOL'], element['SUBCAPITOL']])
    print(element['CAPITOL'])
    print(element['SUBCAPITOL'])

0001
01
0002
01
0003
01
0004
01
0005
01
0101
None
0101
01
0101
02
0101
03
0201
None
0201
03
0201
06
0201
49
0300
01
0301
None
0301
02
0301
03
0301
04
0301
05
0301
06
0301
07
0301
09
0301
11
0301
16
0301
18
0301
19
0301
21
0301
22
0301
23
0301
24
0301
26
0301
27
0301
50
0301
51
0301
60
0401
None
0401
01
0500
01
0501
None
0501
01
0700
01
0701
None
0701
04
0701
06
1000
01
1001
None
1101
None
1101
01
1101
02
1101
05
1101
06
1101
09
1201
None
1201
11
1201
12
1201
13
1201
18
1201
21
1201
22
1401
None
1601
None
1601
01
1601
03
1601
04
1601
50
1700
01
1701
None
1701
01
2000
01
2001
None
2001
05
2001
08
2101
None
2101
50
2900
01
3000
01
3001
None
3001
01
3001
04
3001
05
3001
08
3001
11
3001
50
3101
None
3101
03
3300
01
3301
None
3301
02
3301
08
3301
09
3301
16
3301
17
3301
21
3301
25
3301
26
3301
29
3301
50
3401
None
3401
01
3401
50
3501
None
3501
01
3501
02
3501
03
3501
04
3501
05
3501
06
3501
07
3501
08
3501
50
3601
None
3601
01
3601
02
3601
03
3601
11
3601
14
3601
16
3601
19
3601
28
3601
32


In [29]:
# Example usage


In [34]:
df

Unnamed: 0,CAPITOL,SUBCAPITOL,PARAGRAF,GRUPA,ARTICOL,ALINEAT,DENUMIRE,PROGRAM_2025,ESTIMARI2026,ESTIMARI2027,ESTIMARI2028
0,0001,01,,,,,VENITURI - TOTAL,357.353.033,349.169.360,349.399.128,367.798.364
1,0002,01,,,,,I.VENITURI CURENTE,278.999.496,296.761.619,317.534.856,339.093.921
2,0003,01,,,,,A.VENITURI FISCALE,234.103.618,251.711.677,270.005.482,289.075.837
3,0004,01,,,,,"A1.IMPOZIT PE VENIT, PROFIT SI CASTIGURI DIN C...",60.729.767,65.390.577,69.551.838,73.713.645
4,0005,01,,,,,"A11.IMPOZIT PE VENIT, PROFIT SI CASTIGURI DIN ...",44.654.554,48.945.383,52.411.055,55.879.628
...,...,...,...,...,...,...,...,...,...,...,...
1455,8701,,,51,,,I.Credite de angajament,10.000,10.000,10.000,10.000
1456,8701,,,51,,,II.Credite bugetare,10.000,10.000,10.000,10.000
1457,8701,04,,,,,Turism,,,,
1458,8701,04,,,,,II.Credite bugetare,26.803,,,


In [None]:
df['CAPITOL'].value_counts()

CAPITOL
6501    82
8401    77
6801    73
8301    71
6101    69
        ..
0004     1
1000     1
4501     1
4801     1
9901     1
Name: count, Length: 67, dtype: int64

In [38]:
df[df['CAPITOL'] == '6501']

Unnamed: 0,CAPITOL,SUBCAPITOL,PARAGRAF,GRUPA,ARTICOL,ALINEAT,DENUMIRE,PROGRAM_2025,ESTIMARI2026,ESTIMARI2027,ESTIMARI2028
643,6501,,,,,,INVATAMANT,,,,
644,6501,,,,,,I.Credite de angajament,71.891.918,57.643.428,59.714.423,62.766.716
645,6501,,,,,,II.Credite bugetare,60.295.342,63.585.958,59.617.088,62.588.053
646,6501,,,01,,,CHELTUIELI CURENTE,,,,
647,6501,,,01,,,I.Credite de angajament,71.412.372,57.429.188,59.406.534,62.459.461
...,...,...,...,...,...,...,...,...,...,...,...
720,6501,11,30,,,,II.Credite bugetare,590.594,,,
721,6501,13,,,,,Invatamant anteprescolar,,,,
722,6501,13,,,,,II.Credite bugetare,576.345,,,
723,6501,50,,,,,Alte cheltuieli in domeniul invatamantului,,,,


In [40]:
temp = parse_large_xml(file_path, target_elements)

In [45]:
temp[0]

{'TITLU_RAPORT': 'BUGETUL DE STAT',
 'ANEXA': 'Anexa nr.1',
 'COD_ORDONATOR': '00',
 'ORDONATOR': 'Total',
 'CAPITOL': '0001',
 'SUBCAPITOL': '01',
 'PARAGRAF': None,
 'GRUPA': None,
 'ARTICOL': None,
 'ALINEAT': None,
 'DENUMIRE': 'VENITURI - TOTAL',
 'PROGRAM_2025': '357.353.033',
 'ESTIMARI2026': '349.169.360',
 'ESTIMARI2027': '349.399.128',
 'ESTIMARI2028': '367.798.364'}

In [51]:
temp[1]

{'TITLU_RAPORT': 'BUGETUL DE STAT',
 'ANEXA': 'Anexa nr.1',
 'COD_ORDONATOR': '00',
 'ORDONATOR': 'Total',
 'CAPITOL': '0002',
 'SUBCAPITOL': '01',
 'PARAGRAF': '  ',
 'GRUPA': None,
 'ARTICOL': None,
 'ALINEAT': None,
 'DENUMIRE': 'I.VENITURI CURENTE',
 'PROGRAM_2025': '278.999.496',
 'ESTIMARI2026': '296.761.619',
 'ESTIMARI2027': '317.534.856',
 'ESTIMARI2028': '339.093.921'}