# Setup

In [None]:
import pandas as pd
import numpy as np
import os

import networkx as nx

from tqdm import tqdm_notebook
import tqdm

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
%matplotlib inline

# Show floats in displayed frames with thousands separators and three decimals
pd.options.display.float_format = '{:,.3f}'.format

# Load data for all competitive procurements

In [None]:
# Load the competitive-procurement records. Every column is read as text
# (dtype="str"); the numeric ones are converted explicitly below.
data_dir = "/kaggle/input/prozorro-public-procurement-dataset/"
data_competitive = "Competitive_procurements.csv"
data_comp = pd.read_csv(os.path.join(data_dir, data_competitive), index_col=0, dtype="str")

# Convert the columns that are used numerically later on
data_comp[["lot_initial_value", "lot_final_value"]] = data_comp[["lot_initial_value", "lot_final_value"]].astype(float)
data_comp.index = pd.to_datetime(data_comp.index)

# Assign whole columns instead of `data_comp.loc[:, col] = ...`: the `.loc`
# form writes into the existing (text) column, and recent pandas versions keep
# that column's original dtype, so the integer conversion would not stick.
data_comp['lot_announce_year'] = data_comp['lot_announce_year'].astype('int')
data_comp['supplier_dummy'] = data_comp['supplier_dummy'].astype('int')

In [None]:
# Checkpoint cell: restore the freshly loaded frame when the cells below are
# re-run, without reading the CSV again.
# On the very first run `data_copy` does not exist yet, so take the snapshot
# instead of failing with a NameError.
try:
    data_comp = data_copy.copy()
except NameError:
    data_copy = data_comp.copy()

### Filtering

In [None]:
# Step 1 - above threshold tenders only
# (they are always competitive, i.e. >=2 participants per auction)
obs_before = data_comp.shape[0]
is_above_threshold = data_comp["lot_procur_type"].isin(['Above Threshold UA', 'Above Threshold EU'])
data_comp = data_comp[is_above_threshold]
print(f"Keep only above threshold tenders")
print(f"Discarded {obs_before - len(data_comp):,.0f} observations. Left {len(data_comp):,.0f} observations.")

# Step 2 - first and last "full" year only; the difference between them is what we explore.
# In 2016 Prozorro was only starting to be adopted.
display(data_comp.groupby("lot_announce_year")['lot_id'].count())
obs_before = data_comp.shape[0]
is_selected_year = data_comp["lot_announce_year"].isin([2017, 2019])
data_comp = data_comp[is_selected_year]
print(f"Keep only first and last 'full' year")
print(f"Discarded {obs_before - len(data_comp):,.0f} observations. Left {len(data_comp):,.0f} observations.\n")

# Step 3 - markets with at least 10 tenders in 2017 or 2019.
# In a market with very few tenders a single additional tender can change the
# market structure a lot, so only more 'robust' markets are kept.
# Markets that emerged or stopped functioning between the two years must not be
# lost, hence the criterion: 10 tenders in 2017, in 2019, or in both.
markets_before = data_comp["lot_cpv_4_digs"].nunique()
obs_before = data_comp.shape[0]

# unique tenders per market (rows) and year (columns)
selection = data_comp.pivot_table(index="lot_cpv_4_digs", columns="lot_announce_year",
                                  values="lot_id", aggfunc="nunique")
has_enough_tenders = (selection >= 10).any(axis=1)
selected_list = selection.loc[has_enough_tenders].index
data_comp = data_comp[data_comp["lot_cpv_4_digs"].isin(selected_list)]

print(f"Keep markets with at least 10 tenders in 2017 or in 2019")
print(f"Discarded {obs_before - len(data_comp):,.0f} observations. Left {len(data_comp):,.0f} observations.")
print(f"Discarded {markets_before - data_comp['lot_cpv_4_digs'].nunique():,.0f} markets. Left {data_comp['lot_cpv_4_digs'].nunique():,.0f} markets.")

n_rows, n_cols = data_comp.shape
print(f"The shape of the DF: {n_rows:,.0f} rows, {n_cols:,.0f} columns")
display(data_comp.head(5).T)

# Non-network metrics

In [None]:
def _pivot_market_year(data, values, aggfunc, prefix, as_int=False):
    '''
    Aggregate `values` with `aggfunc` per market (rows, "lot_cpv_4_digs") and
    announce year (columns, "lot_announce_year").

    Columns are renamed to "<prefix>_<year>" and market/year combinations
    without data are filled with 0. With as_int=True the table is cast to int.
    '''
    table = pd.pivot_table(data=data, columns="lot_announce_year", index="lot_cpv_4_digs",
                           values=values, aggfunc=aggfunc)
    table.columns = [f"{prefix}_{col}" for col in table.columns]
    table = table.fillna(0)
    if as_int:
        # astype returns a new frame, so the integer dtype really sticks
        # (writing through `.loc[:, col] = ...` keeps the float dtype in recent pandas)
        table = table.astype('int')
    return table


# rows of winning bids (supplier_dummy == 1), shared by the contract-based metrics
winners_only = data_comp.query("supplier_dummy == 1")

# number of procurements per market per year
df_metrics = _pivot_market_year(data_comp, "lot_id", "nunique", "tenders_number", as_int=True)

# sum of contracts per market per year
df_contracts = _pivot_market_year(winners_only, "lot_final_value", "sum", "contracts_value")

# median of contract per market per year
df_contracts_median = _pivot_market_year(winners_only, "lot_final_value", "median", "contract_median")

# unique organizers per market per year
df_organizers = _pivot_market_year(data_comp, "organizer_code", "nunique", "org_number", as_int=True)

# unique participants per market per year
df_participants = _pivot_market_year(data_comp, "participant_code", "nunique", "part_number", as_int=True)

# unique suppliers per market per year
df_winners = _pivot_market_year(winners_only, "participant_code", "nunique", "winners_number", as_int=True)

# collect metrics together (index-on-index merges with pandas' default inner join,
# so only markets present in every table are kept)
for df_part in (df_contracts, df_contracts_median, df_organizers, df_participants, df_winners):
    df_metrics = pd.merge(df_metrics, df_part, left_index=True, right_index=True)

### A detailed analysis for each market

In [None]:
def calc_hirshman_per_year(df, entity_column, years=(2017, 2019)):
    '''
    Calculates Herfindahl–Hirschman index per year for a selected group of entities defined by entity_column

    The index of a year is the sum of squared market shares, where the share of
    an entity is its summed "lot_final_value" divided by the year's total.

    Parameters:
        df: frame with the columns "lot_announce_year", "lot_final_value" and entity_column
        entity_column: name of the column that identifies the entities
        years: years that must be present as columns in the result; a year
               without data gets NaN

    Returns:
        One-row DataFrame indexed by 'hirsh index <entity_column>' with one column per year.
    '''
    df_pivot = pd.pivot_table(data=df, columns="lot_announce_year",
                              index=entity_column, values="lot_final_value", aggfunc="sum")
    # an entity without contracts in a given year has zero value in that year
    df_pivot = df_pivot.fillna(0)

    # market share of each entity
    shares = df_pivot / df_pivot.sum()

    # sum of squared shares - the index.
    # min_count=1: when a year's total is zero every share is NaN (0/0); the index
    # is then undefined and must come out as NaN, not as a misleading 0.
    hhi = (shares ** 2).sum(min_count=1)

    res = hhi.to_frame().T
    res.index = [f'hirsh index {entity_column}']

    # guarantee a column for every requested year (NaN when the year has no data);
    # columns that already exist keep their position
    missing_years = [year for year in years if year not in res.columns]
    res = res.reindex(columns=list(res.columns) + missing_years)

    return res

In [None]:
def _active_in_both_years(data_sub, entity_column):
    '''
    Number of distinct entity_column values that occur both in 2017 and in 2019
    '''
    in_2017 = set(data_sub.query("lot_announce_year == 2017")[entity_column].unique())
    in_2019 = set(data_sub.query("lot_announce_year == 2019")[entity_column].unique())
    return len(in_2019.intersection(in_2017))


def _top_entity_by_year(winners, entity_column, value_column, aggfunc, years=(2017, 2019)):
    '''
    For every year find the entity with the largest aggregated value_column.

    Returns a flat list [name, value, name, value, ...] in the order of `years`;
    a year that is absent from the data contributes [None, None].
    '''
    ranking = pd.pivot_table(data=winners, columns="lot_announce_year", index=entity_column,
                             values=value_column, aggfunc=aggfunc)
    leaders = []
    for year in years:
        if year in ranking.columns:
            leaders += [ranking[year].idxmax(), ranking[year].max()]
        else:
            leaders += [None, None]
    return leaders


def analyze_market(data_sub, market, df_summary=None):
    '''
    Calculate market metrics that need to be calculated separately (hard to vectorize)

    Parameters:
        data_sub: rows of a single market
        market: label of the market; becomes the index value of the result row
        df_summary: optional frame of previously analyzed markets; when given,
                    the new row is appended to it

    Returns:
        One-row DataFrame with the market metrics (index name 'lot_cpv_4_digs'),
        or df_summary with that row appended.
    '''
    # rows of winning bids; every contract-based metric below is built on them
    winners = data_sub.query("supplier_dummy == 1")

    # number of participants / organizers that were active in both years
    part_persistent = _active_in_both_years(data_sub, "participant_code")
    org_persistent = _active_in_both_years(data_sub, "organizer_code")

    # Herfindahl–Hirschman index of suppliers and of organizers' regions
    hirsh_suppliers = calc_hirshman_per_year(winners, "participant_code")
    hirsh_org_region = calc_hirshman_per_year(winners, "organizer_region")

    # sum and number of contracts won by suppliers from a region other than the organizer's
    other_region = winners.query("organizer_region != participant_region")
    other_reg_value = other_region['lot_final_value'].sum()
    other_reg_number = other_region['lot_id'].nunique()

    # collect all metrics; the order must match the column names below
    metric_values = [
        part_persistent, org_persistent,

        # supplier with the largest number / value of contracts (2017, then 2019)
        *_top_entity_by_year(winners, "participant_code", "lot_id", "nunique"),
        *_top_entity_by_year(winners, "participant_code", "lot_final_value", "sum"),

        # organizer with the largest number / value of contracts (2017, then 2019)
        *_top_entity_by_year(winners, "organizer_code", "lot_id", "nunique"),
        *_top_entity_by_year(winners, "organizer_code", "lot_final_value", "sum"),

        # .iloc[0]: the index frames have a single, label-indexed row, so the value
        # is taken by position explicitly instead of through the deprecated `[0]` fallback
        hirsh_suppliers[2017].iloc[0], hirsh_suppliers[2019].iloc[0],
        hirsh_org_region[2017].iloc[0], hirsh_org_region[2019].iloc[0],

        other_reg_value, other_reg_number,
    ]

    # a flat list becomes a single column; transpose it into a single row
    market_descr = pd.DataFrame(metric_values).T

    # NOTE(review): 'hirsh_regionss_2019' is misspelled (double "s"). It is kept as is
    # because the name is part of the output schema - rename it together with its consumers.
    market_descr.columns = ['part_pers_number', 'org_pers_number',

                            'supplier_max_contracts_2017_id', 'supplier_max_contracts_2017',
                            'supplier_max_contracts_2019_id', 'supplier_max_contracts_2019',
                            'supplier_max_value_2017_id', 'supplier_max_value_2017',
                            'supplier_max_value_2019_id', 'supplier_max_value_2019',

                            'org_max_contracts_2017_id', 'org_max_contracts_2017',
                            'org_max_contracts_2019_id', 'org_max_contracts_2019',
                            'org_max_value_2017_id', 'org_max_value_2017',
                            'org_max_value_2019_id', 'org_max_value_2019',

                            'hirsh_suppliers_2017', 'hirsh_suppliers_2019',
                            'hirsh_regions_2017', 'hirsh_regionss_2019',

                            'other_region_value', 'other_region_number']

    market_descr.index = [market]
    market_descr.index.name = 'lot_cpv_4_digs'

    if df_summary is not None:
        return pd.concat([df_summary, market_descr])
    return market_descr

In [None]:
# Analyze every market separately and stack the one-row results.
# groupby splits the frame in a single pass (instead of re-scanning it with a
# string-built query per market) and sort=False keeps the markets in order of
# first appearance. The rows are concatenated once at the end rather than
# growing df_summary inside the loop.
market_summaries = []

for counter, (market, df_sub) in enumerate(data_comp.groupby("lot_cpv_4_digs", sort=False)):

    print(counter, market)
    market_summaries.append(analyze_market(df_sub, market))

df_summary = pd.concat(market_summaries)

In [None]:
# Join the per-market summary onto the pivot-based metrics (index on index),
# then write the combined table to disk
df_metrics = df_metrics.merge(df_summary, left_index=True, right_index=True)
df_metrics.to_csv('market_metrics.csv')

# quick look at the first rows, metrics shown as rows
display(df_metrics.head(5).T)

# Sand box (1-mode network)

In [None]:
# Let's create the function  for creating the network
def making_graph_1_mode(df):
    """The function takes the df and creates the 1-mode network of tender participants.
       Node - tender participant, edge - participation in tender organized by particular public entity.
       For example, the two companies are connected if they both participated in tender organized by one public entity.

       Parameters:
           df: frame with the columns "participant_code", "organizer_code" and "lot_final_value"

       Returns:
           networkx graph built from the participant-by-participant adjacency matrix."""

    # Participants (rows) by public entities (columns): number of non-missing
    # "lot_final_value" records of the participant with that organizer
    incidence = df.pivot_table(values="lot_final_value", index="participant_code",
                               columns="organizer_code", aggfunc="count").fillna(0)

    # Dot product that 'connects' all the participants: entry (i, j) sums, over
    # organizers, the product of the two participants' counts
    counts = incidence.to_numpy()
    adjacency = counts.dot(counts.T).astype(int)

    # No self-loops. The diagonal is zeroed on an array owned by this function;
    # writing through `DataFrame.values` is not guaranteed to reach the frame
    # (it may be a copy, or read-only under pandas Copy-on-Write).
    np.fill_diagonal(adjacency, 0)

    # Create the graph from the received adjacency matrix
    adjacency = pd.DataFrame(adjacency, index=incidence.index, columns=incidence.index)
    G = nx.from_pandas_adjacency(adjacency)

    return G

In [None]:
# df_graphs = data_comp.groupby([data_comp["lot_cpv_4_digs"], data_comp["lot_announce_year"]]).apply(making_graph_1_mode).reset_index()
# df_graphs.rename(columns={0:"graph"}, inplace=True)

# print("The shape of the DF:", df_graphs.shape)
# df_graphs.head(3)

In [None]:
# def clustering_measures_1_mode(G):
#     try:
#         average_degree_centrality = np.mean(list(nx.degree_centrality(G).values()))
#     except:
#         average_degree_centrality = np.nan
        
#     try:
#         average_betweenness_centrality = np.mean(list(nx.betweenness_centrality(G).values()))
#     except:
#         average_betweenness_centrality = np.nan
    
#     # Components

#     comp_list = sorted(nx.connected_components(G), key=len, reverse=True) 
#     comp_size = [len(comp) for comp in comp_list]
    
#     G_largest_comp = G.subgraph(comp_list[0])
#     edges_largest_comp = G_largest_comp.number_of_edges()
#     centrality_largest_node_largest_comp = max(nx.degree_centrality(G_largest_comp).values())

#     max_node_largest_comp = max(nx.degree_centrality(G_largest_comp))
#     num_edges_largest_node_largest_component = G_largest_comp.degree(max_node_largest_comp)
    
#     return [G.number_of_nodes(), G.number_of_edges(), average_degree_centrality,
#             nx.average_clustering(G), average_betweenness_centrality, nx.transitivity(G),
#             nx.number_connected_components(G), comp_size[0], comp_size[0]/G.number_of_nodes(), np.mean(comp_size), np.median(comp_size),
#             edges_largest_comp, centrality_largest_node_largest_comp, num_edges_largest_node_largest_component]


In [None]:
# df_graph_measures_1_mode = df_graphs["graph"].apply(clustering_measures_1_mode)
# df_graph_measures_1_mode = pd.DataFrame((df_graph_measures_1_mode.tolist()), columns = ["number_of_nodes", "number_of_edges", "average_degree_centrality",
#                                                                                         "average_clustering", "average_betweenness_centrality", "transitivity",
#                                                                                         "number_connected_components", "size_largest_component", "share_largest_component",
#                                                                                         "average_component_size", "median_compomemt_size", 
#                                                                                         "edges_largest_comp", "centrality_largest_node_largest_comp", "num_edges_largest_node_largest_component"])


# print("The shape of the DF:", df_graph_measures_1_mode.shape)
# df_graph_measures_1_mode.head()