In [1]:
import pandas as pd
import numpy as np
from mappings import map_country_id_to_country_name, map_country_name_to_country_id, import_country_mapping

Load training data and determine the unique countries

In [2]:
# Index of the feature set; interpolated into the input parquet and output CSV file names in later cells
feature_set = 1
# Suffix placed directly after the feature-set index in those file names (empty string = no suffix)
year = ""

In [3]:
# Load the prepared feature table for the configured feature set / year suffix.
# NOTE(review): the path is an absolute, machine-specific location; the notebook will not run
# elsewhere unless the path is adapted.
df = pd.read_parquet(fr"C:\Users\Uwe Drauz\Documents\bachelor_thesis_local\personal_competition_data\data\cm_features_allyears_feature_set{feature_set}{year}.parquet")

In [4]:
# Load the country mapping table; its 'name' and 'country_id' columns are used further down
# to cross-check the ids found in the neighbors file
country_mapping = import_country_mapping()

In [5]:
# Distinct country ids occurring in the training data
country_ids = df['country_id'].unique()
# Resolve each id to its country name with map_country_id_to_country_name
countries = [map_country_id_to_country_name(country_id) for country_id in country_ids]
# Lookup table pairing every country id with the corresponding country name
country_df = pd.DataFrame({'country_id': country_ids, 'country': countries})



Load the neighbors data and restrict it to countries that appear in the training data and to the current spatial dependency structure (i.e., the neighbor relations between countries as they exist at present)

In [6]:
# Load the neighbors data, give the relevant columns meaningful names and discard the rest.
# The current column labels are data values rather than real headers, so they are renamed here.
# NOTE(review): this suggests the CSV has no header row and read_csv is consuming the first
# data record as the header, which would drop that record from the data - confirm this is acceptable.
neighbors_data = (
    pd.read_csv(r"C:\Users\Uwe Drauz\Documents\bachelor_thesis_local\shared_competition_data\country_neighbors\fallback3_prod_country_country_month.csv")
    .rename(columns={'59': 'country_id_a', 'Sudan': 'country_a', '70': 'country_id_b', 'Central African Republic': 'country_b', '[0,379)': 'month_id_period'})
    # Remaining columns are not needed for the adjacency structure
    .drop(columns=['1', '625', '482'])
)

In [7]:
# Keep only the relations whose month_id_period ends with "612)"; every other period is dropped
ends_with_612 = neighbors_data['month_id_period'].str.endswith("612)")
neighbors_data = neighbors_data[ends_with_612]

In [8]:
# Keep a relation only if BOTH of its countries occur in the list "countries"
country_a_known = neighbors_data['country_a'].isin(countries)
country_b_known = neighbors_data['country_b'].isin(countries)
neighbors_data_filtered = neighbors_data[country_a_known & country_b_known]

In [9]:
# Every name in "countries" must occur in both "country_a" and "country_b" of neighbors_data_filtered.
# For a name missing from "country_a", add a placeholder row carrying the name/id on the a-side and
# missing values on the b-side; do the mirror image for a name missing from "country_b".
# This covers countries without any neighboring relationship in the neighbors data (e.g. islands) and
# countries whose neighbors are all outside the list "countries".
#
# The rows are collected first and concatenated once: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row is quadratic.
names_in_a = set(neighbors_data_filtered['country_a'].unique())
names_in_b = set(neighbors_data_filtered['country_b'].unique())
placeholder_rows = []
for country in countries:
    if country not in names_in_a:
        placeholder_rows.append({'country_id_a': map_country_name_to_country_id(country), 'country_a': country, 'country_id_b': None, 'country_b': None, 'month_id_period': None})
        # Remember the name so a repeated entry in "countries" does not add a second placeholder
        names_in_a.add(country)
    if country not in names_in_b:
        placeholder_rows.append({'country_id_a': None, 'country_a': None, 'country_id_b': map_country_name_to_country_id(country), 'country_b': country, 'month_id_period': None})
        names_in_b.add(country)

# Only touch the frame (and reset its index) when there is something to add
if placeholder_rows:
    neighbors_data_filtered = pd.concat(
        [neighbors_data_filtered, pd.DataFrame(placeholder_rows)],
        ignore_index=True,
    )

In [10]:
# Distinct country names found on each side of the relation table
# ("country_a" first, then "country_b")
neighbors_data_filtered_countries_a, neighbors_data_filtered_countries_b = (
    neighbors_data_filtered[side_column].unique()
    for side_column in ('country_a', 'country_b')
)

In [11]:
def create_adjacency_matrix(df):
    """Build the signed neighborhood matrix for the country relations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``country_id_a`` and ``country_id_b``. Each row with
        both ids present denotes one neighbor relation. Rows where either id is
        missing contribute no relation, but the id that is present still gets a
        row/column in the matrix.

    Returns
    -------
    adjacency_matrix : numpy.ndarray
        Square integer matrix ordered like ``country_ids``. Off-diagonal entries are
        ``-1`` for neighboring countries and ``0`` otherwise; each diagonal entry is
        the number of distinct neighbors of that country, so every row sums to zero.
    country_ids : list
        Sorted list of all non-missing ids found in either id column.

    Notes
    -----
    The neighbor count is derived from the off-diagonal entries rather than from
    counting rows of ``df``. It is therefore correct whether a pair is listed once,
    in both directions, or repeatedly, and a row pairing a country with itself is
    ignored instead of corrupting the diagonal.
    """
    # Get unique country IDs
    country_ids = sorted(set(df['country_id_a'].dropna()) | set(df['country_id_b'].dropna()))

    # Position of every id in the matrix (dict lookup instead of list.index per row)
    position = {country_id: i for i, country_id in enumerate(country_ids)}

    # Initialize the adjacency matrix with zeros
    num_countries = len(country_ids)
    adjacency_matrix = np.zeros((num_countries, num_countries), dtype=int)

    # Only rows with both ids describe a relation (otherwise the matrix keeps its 0s)
    complete_pairs = df[['country_id_a', 'country_id_b']].dropna()
    for country_id_a, country_id_b in complete_pairs.itertuples(index=False, name=None):
        index_a, index_b = position[country_id_a], position[country_id_b]
        if index_a == index_b:
            # A country is not its own neighbor
            continue
        adjacency_matrix[index_a, index_b] = -1  # s ≠ r, s ∼ r
        adjacency_matrix[index_b, index_a] = -1  # s ≠ r, s ∼ r

    # The diagonal is still zero here, so the negated row sum is the number of neighbors
    diagonal = np.arange(num_countries)
    adjacency_matrix[diagonal, diagonal] = -adjacency_matrix.sum(axis=1)

    return adjacency_matrix, country_ids

In [12]:
# Build the matrix from the filtered relations (including the placeholder rows).
# Note: this rebinds country_ids, which until now held the unique ids taken from the training data,
# to the sorted id list returned by create_adjacency_matrix.
adjacency_matrix, country_ids = create_adjacency_matrix(neighbors_data_filtered)

In [13]:
# Cast the ids returned by create_adjacency_matrix to plain integers for use as axis labels
country_ids = list(map(int, country_ids))
file_path = fr"C:\Users\Uwe Drauz\Documents\bachelor_thesis_local\personal_competition_data\data\adjacency_matrix_non_zero_and_actual_countries_feature_set{feature_set}{year}.csv"
# Label rows and columns with the country ids and write the matrix to a csv file
adjacency_matrix_df = pd.DataFrame(adjacency_matrix, index=country_ids, columns=country_ids)
adjacency_matrix_df.to_csv(file_path)

Reparameterizing the adjacency matrix

Check if the country IDs are the same

In [18]:
# Attach the country_id from country_mapping to each side of the relation, matching on the country
# name: first for "country_a", then for "country_b". Because both merges bring in the same two
# columns, pandas disambiguates them with the suffixes _x (a-side) and _y (b-side).
# Note: re-running this cell on its own merges again and adds further columns.
mapping_name_to_id = country_mapping[['name', 'country_id']]
neighbors_data_filtered = (
    neighbors_data_filtered
    .merge(mapping_name_to_id, left_on='country_a', right_on='name', how='left')
    .merge(mapping_name_to_id, left_on='country_b', right_on='name', how='left')
)

In [19]:
# The merged name columns were only needed as join keys
neighbors_data_filtered = neighbors_data_filtered.drop(columns=['name_x', 'name_y'])
# Use the nullable integer dtype for the mapped ids so missing values survive the cast
neighbors_data_filtered = neighbors_data_filtered.astype({'country_id_x': 'Int64', 'country_id_y': 'Int64'})
# Flag, per side, whether the id from the neighbors file equals the id from country_mapping,
# then put each flag next to the two ids it compares
neighbors_data_filtered = neighbors_data_filtered.assign(
    same_country_id_a=lambda frame: frame['country_id_a'] == frame['country_id_x'],
    same_country_id_b=lambda frame: frame['country_id_b'] == frame['country_id_y'],
)[['country_id_a', 'country_id_x', 'same_country_id_a', 'country_a', 'country_id_b', 'country_id_y', 'same_country_id_b', 'country_b', 'month_id_period']]

In [20]:
# Show entries where same_country_id_a or same_country_id_b aren't true.
# A row is flagged as soon as ONE side disagrees with country_mapping; negating (a | b) would only
# flag rows where both sides disagree and could hide a one-sided mismatch.
# A side without a country name (placeholder rows for countries without neighbors) has nothing to
# compare and is therefore not treated as a mismatch.
id_a_matches = neighbors_data_filtered['same_country_id_a'].fillna(False).astype(bool)
id_b_matches = neighbors_data_filtered['same_country_id_b'].fillna(False).astype(bool)
side_a_present = neighbors_data_filtered['country_a'].notna()
side_b_present = neighbors_data_filtered['country_b'].notna()
neighbors_data_filtered[(side_a_present & ~id_a_matches) | (side_b_present & ~id_b_matches)]

Unnamed: 0,country_id_a,country_id_x,same_country_id_a,country_a,country_id_b,country_id_y,same_country_id_b,country_b,month_id_period


Perform checks to investigate contents of the filtered data

In [22]:
# Wrap the unique a-side country names in a single-column dataframe
neighbors_data_filtered_countries_a_df = pd.DataFrame({'country': neighbors_data_filtered_countries_a})
# Same for the unique b-side country names
neighbors_data_filtered_countries_b_df = pd.DataFrame({'country': neighbors_data_filtered_countries_b})

In [39]:
# Print every country of the list "countries" that is absent from neighbors_data_filtered_countries_a
for country in countries:
    if country in neighbors_data_filtered_countries_a:
        continue
    print(country)

Trinidad and Tobago
Sri Lanka
Philippines
Comoros
Australia
Solomon Is.


In [40]:
# Print every country of the list "countries" that is absent from neighbors_data_filtered_countries_b
for country in countries:
    if country in neighbors_data_filtered_countries_b:
        continue
    print(country)

Trinidad and Tobago
Sri Lanka
Philippines
Comoros
Australia
Solomon Is.


In [41]:
# Print every entry of neighbors_data_filtered_countries_a that is not in the list "countries".
# The placeholder rows carry None on one side, so None is expected to show up here.
for country in neighbors_data_filtered_countries_a:
    if country in countries:
        continue
    print(country)

None


In [42]:
# Print every entry of neighbors_data_filtered_countries_b that is not in the list "countries".
# The placeholder rows carry None on one side, so None is expected to show up here.
for country in neighbors_data_filtered_countries_b:
    if country in countries:
        continue
    print(country)

None


In [13]:
# Rank of the adjacency matrix returned by create_adjacency_matrix (shown as the cell output)
np.linalg.matrix_rank(adjacency_matrix)

80

In [20]:
# Calculate eigenvalues of adjacency matrix.
# create_adjacency_matrix fills the off-diagonal entries symmetrically, so the symmetric solver
# np.linalg.eigh applies. It returns real eigenvalues (in ascending order), whereas the general
# np.linalg.eig may return complex values with round-off imaginary parts; casting those to float
# discards the imaginary part and emits a warning.
eigenvalues, eigenvectors = np.linalg.eigh(adjacency_matrix)

# Convert the eigenvalues to a list of plain Python floats
eigenvalues = [float(eigenvalue) for eigenvalue in eigenvalues]

# Calculate the sum over all eigenvalues (equals the trace of the matrix up to round-off)
sum_eigenvalues = sum(eigenvalues); sum_eigenvalues


  import sys


300.00000000000006

In [21]:
# Calculate the row sums (axis=1) and the column sums (axis=0) of the adjacency matrix
sum_rows = adjacency_matrix.sum(axis=1)
sum_columns = adjacency_matrix.sum(axis=0)

In [22]:
# Remove the rows that consist only of zeros, then remove the columns of the remaining
# matrix that consist only of zeros
row_has_nonzero = (adjacency_matrix != 0).any(axis=1)
adjacency_matrix_only_neighbors = adjacency_matrix[row_has_nonzero]
column_has_nonzero = (adjacency_matrix_only_neighbors != 0).any(axis=0)
adjacency_matrix_only_neighbors = adjacency_matrix_only_neighbors[:, column_has_nonzero]

In [23]:
# Rank of the matrix after the all-zero rows/columns were removed (shown as the cell output)
np.linalg.matrix_rank(adjacency_matrix_only_neighbors)

80

In [24]:
# Increase every diagonal entry of adjacency_matrix by 1 (adds the identity matrix of matching size)
matrix_size = adjacency_matrix.shape[0]
adjacency_matrix_diagonal_one = adjacency_matrix + np.eye(matrix_size)


In [27]:
# Rank of the matrix with 1 added to every diagonal entry (shown as the cell output)
np.linalg.matrix_rank(adjacency_matrix_diagonal_one)

96

In [26]:
# Calculate the row sums (axis=1) and the column sums (axis=0) of adjacency_matrix_diagonal_one
sum_rows_diagonal_one = adjacency_matrix_diagonal_one.sum(axis=1)
sum_columns_diagonal_one = adjacency_matrix_diagonal_one.sum(axis=0)

In [13]:
from mappings import map_month_id_to_datetime

In [14]:
# Spot check of the month-id mapping: the saved output shows month id 493 resolving to 2021-01-01
map_month_id_to_datetime(493)

datetime.datetime(2021, 1, 1, 0, 0)