# Data preparation

In [1]:
import xml.etree.ElementTree as ET
from os import walk
from src.vars import DATA_DIR, OUTPUT_DIR

import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

## Complementary info extraction

In [2]:
# getting filenames in folder

filenames_2022 = [f"{DATA_DIR}/2022/{f}" for f in next(walk(f"{DATA_DIR}/2022"), (None, None, []))[2]]

In [3]:
# extraction of the "complementary info" section

df = pd.DataFrame(columns=['case_id', 'complementary_info'])

for f in filenames_2022:
    tree = ET.parse(f)
    root = tree.getroot()

    case_id = 0
    complementary_info = ''

    for tag1 in root:
        tag = tag1.tag[tag1.tag.index('}')+1:]
        if tag == 'CODED_DATA_SECTION':
            for tag2 in tag1:
                if tag2.tag[tag2.tag.index('}')+1:] == 'NOTICE_DATA':
                    for tag3 in tag2:
                        if tag3.tag[tag3.tag.index('}')+1:] == 'NO_DOC_OJS':
                            case_id = tag3.text
                            case_id = case_id[case_id.index(' ')+1:]
        elif tag == 'FORM_SECTION':
            for form in tag1:
                for section in form:
                    if section.tag[section.tag.index('}')+1:] == 'COMPLEMENTARY_INFO':
                        for subsection in section:
                            if subsection.tag[subsection.tag.index('}')+1:] == 'INFO_ADD':
                                for p in subsection:
                                    complementary_info += f"\n{' '.join(p.text.split())}"

    if complementary_info == '':
        continue
    df.loc[len(df)] = [case_id, complementary_info[1:]]

df.to_csv(f'{OUTPUT_DIR}/df_complementary_info_raw.csv', index=False)

In [4]:
# data filtering: only instances with dates inside (forma dd/mm/yyyy)

date_pattern = r'.*\d{2}[./-]\d{2}[./-]\d{4}.*'
mask = df['complementary_info'].astype(str).str.contains(date_pattern, regex=True)
df_complementary_info_filter_dates = df[mask]
df_complementary_info_filter_dates.to_csv(f'{OUTPUT_DIR}/df_complementary_info_filter_dates.csv', index=False)

In [5]:
nltk.download('punkt_tab')
stemmer = SnowballStemmer('italian')

[nltk_data] Downloading package punkt_tab to /home/ivsnp/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
# keywords and acronyms to filter
KEYWORDS = [
    'delibera', 'deliberazione', 'deliberare',
    'provvedimento',
    'decreto',
    'determinare', 'determina', 'determinazione', 'det', 'determinativo',
    'atto', 'atti',
    'direttiva',
    'ordinanza',
    'comunicato',
    'disposizione'
]
KEYWORDS_STEMS = list(set([stemmer.stem(k) for k in KEYWORDS]))

ACRONYMS = ["DL", "DM", "DCM", "DPCM", "DLGS", "DAR", "DCR", "DGR", "DUPCR", "DCCR", "DPGR", "DDS", "DSGR", "DDUO", "DDG", "DCIPE", "CU", "Direttiva", "OCDPC", "Comunicato", "DSM", "DCMM", "DCoM", "DA", "DCC", "DCUC", "DGC", "DCS", "DD", "DRA", "DDP", "DPP", "DCP", "DGP", "CdG", "DComP"]
ACRONYMS = [a.lower() for a in ACRONYMS]
ACRONYMS = ['.'.join(a) for a in ACRONYMS]

def df_filtering(row):
    text = row['complementary_info'].lower()
    stemmed_text = [stemmer.stem(token) for token in word_tokenize(text)]

    if any(a in text for a in ACRONYMS) or any(s in stemmed_text for s in KEYWORDS_STEMS):
        return True
    else:
        return False

In [7]:
# filtering teds with dates, keywords (stems) and acronims
mask = df_complementary_info_filter_dates.apply(df_filtering, axis=1)
df_complementary_info_filter_dates_keywords_acronims = df_complementary_info_filter_dates[mask]
df_complementary_info_filter_dates_keywords_acronims.to_csv(f'{OUTPUT_DIR}/df_complementary_info_filter_dates_keywords_acronims.csv', index=False)

In [8]:
print(f"TOT Tender Electronic Daily Documents: ", len(filenames_2022))
print(f"Tender Electronic Daily Documents with complementary info: ", len(df.index))
print(f"Tender Electronic Daily Documents with dates (dd/mm/yyy) in complementary info: ", len(df_complementary_info_filter_dates.index))
print(f"Tender Electronic Daily Documents with dates (dd/mm/yyy), keywords and acronyms in complementary info: ", len(df_complementary_info_filter_dates_keywords_acronims.index))

TOT Tender Electronic Daily Documents:  27841
Tender Electronic Daily Documents with complementary info:  10781
Tender Electronic Daily Documents with dates (dd/mm/yyy) in complementary info:  3505
Tender Electronic Daily Documents with dates (dd/mm/yyy), keywords and acronyms in complementary info:  2839


In [9]:
df_complementary_info_filter_dates_keywords_acronims

Unnamed: 0,case_id,complementary_info
2,219-630869,Atto esito di gara: Provvedimento DG n. 36 del...
3,123-350412,RUP: dott.ssa Maria Lomboni. Delibera di aggiu...
5,153-437453,"1) il bando di gara, il fac-simile modulo di o..."
9,243-697432,Determinazione Agg.ne AD di Marche Multiserviz...
11,035-089020,1) La procedura è gestita con il Sistema telem...
...,...,...
10755,204-580451,Determinazione e contrattare e contestuale ind...
10759,248-722494,a) A pena di irregolarità non sanabile delle o...
10763,040-102463,Il disciplinare di gara (contenente le norme i...
10770,090-248021,1 Le disposizioni integrative del presente Ban...
