In [96]:
from opensearchpy import OpenSearch, helpers
import pandas as pd
import numpy as np
import os
import math
import warnings
from datetime import datetime
import wget
import zipfile
import shutil
import getpass
def transform_id(id_csv): #Function to swap the ID_NOTICE_CAN field so it aligns with the one used in the xml format
    year_part = str(id_csv)[:4]
    id_xml = str(id_csv)[4:].zfill(6) + '-' + year_part
    return id_xml

def flatten_csv(folder_path):

    
    columns_can_level = ['ID_NOTICE_CAN', 'TED_NOTICE_URL', 'YEAR', 'ID_TYPE', 'DT_DISPATCH', 'XSD_VERSION',
                         'CANCELLED',
                         'CORRECTIONS', 'B_MULTIPLE_CAE', 'CAE_NAME', 'CAE_NATIONALID', 'CAE_ADDRESS', 'CAE_TOWN',
                         'CAE_POSTAL_CODE', 'CAE_GPA_ANNEX', 'ISO_COUNTRY_CODE', 'ISO_COUNTRY_CODE_GPA',
                         'B_MULTIPLE_COUNTRY',
                         'ISO_COUNTRY_CODE_ALL', 'CAE_TYPE', 'EU_INST_CODE', 'MAIN_ACTIVITY',
                         'B_ON_BEHALF', 'B_INVOLVES_JOINT_PROCUREMENT', 'B_AWARDED_BY_CENTRAL_BODY', 'TYPE_OF_CONTRACT',
                         'B_FRA_AGREEMENT', 'FRA_ESTIMATED', 'B_DYN_PURCH_SYST', 'CPV', 'MAIN_CPV_CODE_GPA',
                         'B_GPA', 'GPA_COVERAGE', 'LOTS_NUMBER', 'VALUE_EURO', 'VALUE_EURO_FIN_1', 'VALUE_EURO_FIN_2',
                         'TOP_TYPE',
                         'B_ACCELERATED', 'OUT_OF_DIRECTIVES', 'B_ELECTRONIC_AUCTION', 'NUMBER_AWARDS']
    dtypes = {
        'ID_NOTICE_CAN': 'str',
        'TED_NOTICE_URL': 'str',
        'YEAR': 'int',
        'ID_TYPE': 'str',
        'DT_DISPATCH': 'str',  # Dates can be converted later if needed
        'XSD_VERSION': 'str',
        'CANCELLED': 'bool',
        'CORRECTIONS': 'int',
        'CAE_NAME': 'str',
        'CAE_NATIONALID': 'str',
        'CAE_ADDRESS': 'str',
        'CAE_TOWN': 'str',
        'CAE_POSTAL_CODE': 'str',
        'CAE_GPA_ANNEX': 'str',
        'ISO_COUNTRY_CODE': 'str',
        'ISO_COUNTRY_CODE_GPA': 'str',
        'ISO_COUNTRY_CODE_ALL': 'str',
        'CAE_TYPE': 'str',
        'EU_INST_CODE': 'str',
        'MAIN_ACTIVITY': 'str',
        'TYPE_OF_CONTRACT': 'str',
        'FRA_ESTIMATED': 'str',
        'CPV': 'str',
        'MAIN_CPV_CODE_GPA': 'str',
        'GPA_COVERAGE': 'str',
        'VALUE_EURO': 'float',
        'VALUE_EURO_FIN_1': 'float',
        'VALUE_EURO_FIN_2': 'float',
        'TOP_TYPE': 'str',
        'OUT_OF_DIRECTIVES': 'bool'
    }
    bool_cols = ['B_MULTIPLE_CAE', 'B_MULTIPLE_COUNTRY', "B_ON_BEHALF", "B_INVOLVES_JOINT_PROCUREMENT",
                 "B_AWARDED_BY_CENTRAL_BODY", "B_FRA_AGREEMENT", "B_DYN_PURCH_SYST", "B_GPA", "B_ACCELERATED",
                 "B_ELECTRONIC_AUCTION"]
    converters = {col: bool_converter for col in bool_cols}
    converters['LOTS_NUMBER'] = lots_converter
    dfs = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv') and file_name.startswith('export_CAN'):
            file_path = os.path.join(folder_path, file_name)
            current_df = pd.read_csv(file_path, usecols=columns_can_level, dtype=dtypes, converters=converters)
            dfs.append(current_df)
    df = pd.concat(dfs, ignore_index=True)

    df_flat = df.groupby('ID_NOTICE_CAN').first()
    df_flat.index = df_flat.index.to_series().apply(transform_id)
    return df_flat



In [97]:
df = flatten_csv("C:/Users/afont/OneDrive/Escritorio/PROCURE/Data/TED")

In [98]:
df.dtypes

TED_NOTICE_URL                   object
YEAR                              int32
ID_TYPE                          object
DT_DISPATCH                      object
XSD_VERSION                      object
CANCELLED                          bool
CORRECTIONS                       int32
B_MULTIPLE_CAE                     bool
CAE_NAME                         object
CAE_NATIONALID                   object
CAE_ADDRESS                      object
CAE_TOWN                         object
CAE_POSTAL_CODE                  object
CAE_GPA_ANNEX                    object
ISO_COUNTRY_CODE                 object
ISO_COUNTRY_CODE_GPA             object
B_MULTIPLE_COUNTRY                 bool
ISO_COUNTRY_CODE_ALL             object
CAE_TYPE                         object
EU_INST_CODE                     object
MAIN_ACTIVITY                    object
B_ON_BEHALF                        bool
B_INVOLVES_JOINT_PROCUREMENT       bool
B_AWARDED_BY_CENTRAL_BODY          bool
TYPE_OF_CONTRACT                 object


In [93]:

i = 33
print(df.columns[i])
print(df.iloc[:, i].unique())

na_values = df.iloc[:, i].isna()
print(df.loc[na_values, df.columns[i]])

ID_LOT
['2' nan '1' ... 72.0 76.0 77.0]
1          NaN
2          NaN
3          NaN
4          NaN
5          NaN
          ... 
6198052    NaN
6198053    NaN
6198060    NaN
6198061    NaN
6198062    NaN
Name: ID_LOT, Length: 1617794, dtype: object


In [71]:
def bool_converter(value):
    if pd.isnull(value) or value =='' or pd.isna(value):  # Handle NaN or None values
        return False  # or handle differently based on your case
    elif value == 'Y':
        return True
    elif value == 'N':
        return False
    else:
        raise ValueError(f"Unexpected value '{value}' found in the column.")
def lots_converter(value):
    if pd.isnull(value) or value == '' or pd.isna(value):
        return 0  # Handle NaN, None, or empty string values (assuming 0 is a default value)
    elif value.isdigit():
        return int(value) 
    else:
        raise ValueError(f"Unexpected value '{value}' found in the column.")

In [77]:
dtypes = {
        'ID_NOTICE_CAN': 'str',
        'TED_NOTICE_URL': 'str',
        'YEAR': 'int',
        'ID_TYPE': 'str',
        'DT_DISPATCH': 'str',  # Dates can be converted later if needed
        'XSD_VERSION': 'str',
        'CANCELLED': 'bool',
        'CORRECTIONS': 'int',
        'CAE_NAME': 'str',
        'CAE_NATIONALID': 'str',
        'CAE_ADDRESS': 'str',
        'CAE_TOWN': 'str',
        'CAE_POSTAL_CODE': 'str',
        'CAE_GPA_ANNEX': 'str',
        'ISO_COUNTRY_CODE': 'str',
        'ISO_COUNTRY_CODE_GPA': 'str',
        'ISO_COUNTRY_CODE_ALL': 'str',
        'CAE_TYPE': 'str',
        'EU_INST_CODE': 'str',
        'MAIN_ACTIVITY': 'str',
        'TYPE_OF_CONTRACT': 'str',
        'FRA_ESTIMATED': 'str',
        'CPV': 'str',
        'MAIN_CPV_CODE_GPA': 'str',
        'GPA_COVERAGE': 'str',
        'VALUE_EURO': 'float',
        'VALUE_EURO_FIN_1': 'float',
        'VALUE_EURO_FIN_2': 'float',
        'TOP_TYPE': 'str',
        'OUT_OF_DIRECTIVES': 'bool'
    }
bool_cols = ['B_MULTIPLE_CAE','B_MULTIPLE_COUNTRY',"B_ON_BEHALF","B_INVOLVES_JOINT_PROCUREMENT","B_AWARDED_BY_CENTRAL_BODY","B_FRA_AGREEMENT","B_DYN_PURCH_SYST","B_GPA","B_ACCELERATED","B_ELECTRONIC_AUCTION"]
converters = {col: bool_converter for col in bool_cols}
converters['LOTS_NUMBER'] = lots_converter
pd.read_csv("C:/Users/afont/OneDrive/Escritorio/PROCURE/Data/TED/export_CAN_2018.csv", dtype = dtypes, converters= converters)

Unnamed: 0,ID_NOTICE_CAN,TED_NOTICE_URL,YEAR,ID_TYPE,DT_DISPATCH,XSD_VERSION,CANCELLED,CORRECTIONS,B_MULTIPLE_CAE,CAE_NAME,...,NUMBER_OFFERS,NUMBER_TENDERS_SME,NUMBER_TENDERS_OTHER_EU,NUMBER_TENDERS_NON_EU,NUMBER_OFFERS_ELECTR,AWARD_EST_VALUE_EURO,AWARD_VALUE_EURO,AWARD_VALUE_EURO_FIN_1,B_SUBCONTRACTED,DT_AWARD
0,20184,ted.europa.eu/udl?uri=TED:NOTICE:4-2018:TEXT:E...,2018,3,22/12/17,R209.S2,False,0,False,European Insurance and Occupational Pensions A...,...,,,,,,,,0.00,,
1,20185,ted.europa.eu/udl?uri=TED:NOTICE:5-2018:TEXT:E...,2018,3,22/12/17,R209.S2,False,0,False,European Food Safety Authority (EFSA),...,2.0,2.0,0.0,0.0,,1500000.0,1500000.00,1500000.00,N,18/12/17
2,20185,ted.europa.eu/udl?uri=TED:NOTICE:5-2018:TEXT:E...,2018,3,22/12/17,R209.S2,False,0,False,European Food Safety Authority (EFSA),...,2.0,2.0,0.0,0.0,,1500000.0,1500000.00,1500000.00,N,18/12/17
3,201858,ted.europa.eu/udl?uri=TED:NOTICE:58-2018:TEXT:...,2018,3,22/12/17,R209.S2,False,0,False,European Commission-Directorate General for En...,...,1.0,,,,,2000000.0,1464400.00,1464400.00,N,13/11/17
4,201859,ted.europa.eu/udl?uri=TED:NOTICE:59-2018:TEXT:...,2018,3,22/12/17,R209.S2,False,0,False,"European Commission, Directorate-General for R...",...,3.0,,,,,275000.0,270750.00,270750.00,Y,20/12/17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
804035,2018578487,ted.europa.eu/udl?uri=TED:NOTICE:578487-2018:T...,2018,3,28/12/18,R209.S3,False,0,False,ECOPRO AS,...,1.0,,,,1.0,416775.2,568637.67,568637.67,N,19/12/18
804036,2018578488,ted.europa.eu/udl?uri=TED:NOTICE:578488-2018:T...,2018,3,28/12/18,R209.S3,False,0,False,Haugaland Vekst,...,5.0,,,,5.0,,343839.54,343839.54,N,01/07/18
804037,2018578494,ted.europa.eu/udl?uri=TED:NOTICE:578494-2018:T...,2018,21,28/12/18,R209.S3,False,0,False,Universitetet i Stavanger,...,3.0,,,,,,520969.00,520969.00,N,19/12/18
804038,2018578495,ted.europa.eu/udl?uri=TED:NOTICE:578495-2018:T...,2018,21,28/12/18,R209.S3,False,0,False,Skatteetaten,...,4.0,,,,,1041938.0,1041938.00,1041938.00,N,


In [84]:
print(df_nodtype.dtypes)
for i in df_nodtype.columns.len():
    print(df_nodtype.columns[i])
    print(df_nodtype.iloc[:, i].unique())
    
    na_values = df_nodtype.iloc[:, i].isna()
    print(df_nodtype.loc[na_values, df_nodtype.columns[i]])

TED_NOTICE_URL                   object
YEAR                              int32
ID_TYPE                          object
DT_DISPATCH                      object
XSD_VERSION                      object
CANCELLED                          bool
CORRECTIONS                       int32
B_MULTIPLE_CAE                     bool
CAE_NAME                         object
CAE_NATIONALID                   object
CAE_ADDRESS                      object
CAE_TOWN                         object
CAE_POSTAL_CODE                  object
CAE_GPA_ANNEX                    object
ISO_COUNTRY_CODE                 object
ISO_COUNTRY_CODE_GPA             object
B_MULTIPLE_COUNTRY                 bool
ISO_COUNTRY_CODE_ALL             object
CAE_TYPE                         object
EU_INST_CODE                     object
MAIN_ACTIVITY                    object
B_ON_BEHALF                        bool
B_INVOLVES_JOINT_PROCUREMENT       bool
B_AWARDED_BY_CENTRAL_BODY          bool
TYPE_OF_CONTRACT                 object


AttributeError: 'Index' object has no attribute 'len'