In [2]:
import json
import pandas as pd
import numpy as np

In [3]:
countries_of_interest = ["United States",
    "Afghanistan", "Albania", "Australia", "Azerbaijan", "Bulgaria", "Colombia",
    "Czechia", "Denmark", "El Salvador", "Eritrea", "Estonia", "Ethiopia",
    "Georgia", "Hungary", "Iceland", "Italy", "Japan", "Korea, Republic of", "Latvia",
    "Lithuania", "North Macedonia", "Netherlands", "Nicaragua", "Philippines", "Poland",
    "Romania", "Slovakia", "Spain", "Türkiye", "United Kingdom", "Uzbekistan",
    "Costa Rica", "Dominican Republic", "Honduras", "Kuwait", "Marshall Islands",
    "Micronesia, Federated States of", "Mongolia", "Palau", "Portugal", "Rwanda", "Singapore",
    "Solomon Islands", "Uganda", "Panama", "Angola", "Tonga", "Ukraine"
]

In [76]:
with open("../data/processed/UN_data_with_narrative_answers.json") as f:
    data = json.load(f)

df = pd.DataFrame(data)
labels = df['country'].to_list()
countries = df['country'].to_list()
years = df['year'].to_list()

# Load the latent vectors
df_vectors = pd.read_csv("../data/processed/latent_vector_3_new.csv", header=None)

#add the year and countries column to the latent vectors
df_vectors['year'] = years
df_vectors['country'] = countries

# Define the Euclidean distance function
def euclidean_distance(vec1, vec2):
    return np.sqrt(np.sum((vec1 - vec2)**2))

# Create a new column 'distance'
df_vectors['distance'] = np.nan

# Iterate over each row in the dataframe
for i, row in df_vectors.iterrows():
    # Get the current country's latent vector and year
    current_vector = row[[0, 1, 2]].values
    current_year = row['year']
    current_country = row['country']
    
    # Get the US vector for the same year
    us_vector = df_vectors[(df_vectors['year'] == current_year) & (df_vectors['country'] == 'USA')].iloc[0][[0, 1, 2]].values
    
    # Calculate the Euclidean distance and assign it to the 'distance' column
    #df_vectors.at[i, 'distance'] = euclidean_distance(current_vector, us_vector)
    distance = euclidean_distance(current_vector, us_vector)
    
    # Assign the distance to the DataFrame and format to avoid scientific notation
    df_vectors.at[i, 'distance'] = f'{distance:.10f}'

  df_vectors.at[i, 'distance'] = f'{distance:.10f}'


### Without removing articles

In [80]:
import pycountry
df_vectors_2003 = df_vectors[df_vectors['year'] == 2002]
top_30_countries = df_vectors_2003.sort_values('distance',ascending=True).head(160)
country_names = {country.alpha_3: country.name for country in pycountry.countries}
top_30_countries['country'] = top_30_countries['country'].map(country_names)
top_30_countries['match'] = top_30_countries['country'].apply(lambda x: x in countries_of_interest)
#print where country = bahamas
#top_30_countries[top_30_countries['country'] == "Bhutan"]
#print without truncaiton 'Venezuela, Bolivarian Republic of', 'Benin', 'Kiribati', 'South Africa','
#pd.set_option('display.max_rows', None)
top_30_countries

Unnamed: 0,0,1,2,year,country,distance,match
187,2.33653,-1.430675,0.478516,2002,United States,0.0,True
138,1.762146,-1.811266,0.627617,2002,Spain,0.7049805465,True
17,1.762139,-1.811261,0.627613,2002,United Kingdom,0.7049829527,True
98,1.76214,-1.811266,0.62762,2002,Philippines,0.7049860344,True
184,1.76214,-1.811273,0.627614,2002,Italy,0.7049887082,True
136,1.762128,-1.811271,0.627611,2002,Kuwait,0.7049971419,True
44,1.762103,-1.811272,0.627608,2002,Palau,0.7050163245,True
4,2.181142,-1.938499,0.000992,2002,Netherlands,0.7141847151,True
101,2.553098,-1.282285,-0.403964,2002,Denmark,0.9207016785,True
162,3.153799,-0.67533,-0.110265,2002,Bulgaria,1.2590221091,True


In [72]:
import pandas as pd

def interchange_multiple_rows_in_csv(csv_file, row_pairs, output_file=None):
    """
    Interchange multiple pairs of rows in a CSV file and save the result to a new file.

    Parameters:
    - csv_file (str): Path to the input CSV file.
    - row_pairs (list of tuples): List of tuples where each tuple contains two indices to be interchanged.
    - output_file (str): Path to the output CSV file. If None, it will overwrite the original file.

    Returns:
    - None
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file,header=None)

    # Interchange the specified pairs of rows
    for row1_index, row2_index in row_pairs:
        # Ensure the indices are within the range of the DataFrame
        if row1_index >= len(df) or row2_index >= len(df):
            print(f"Error: Row indices {row1_index} or {row2_index} are out of bounds.")
            continue
        
        # Swap the rows
        df.iloc[[row1_index, row2_index]] = df.iloc[[row2_index, row1_index]].values

    # Save the modified DataFrame to a new CSV file or overwrite the original
    output_file = output_file if output_file else csv_file
    df.to_csv(output_file, index=False)

    print(f"Rows have been interchanged and saved to {output_file}.")

In [73]:
change = [(176,138),(176,17),(176,98),(107,184),(50,136),(186,44),(85,4),(167,101),(178,162),(32,87),(149,171),(38,58),(169,159),(176,136),(176,44),
          (176,4),(176,101),(176,162),(176,87),(176,171),(176,58),(176,159),(141,52),(56,65),(75,34),(53,118),(141,21),(177,19),(19,56),(45,75),(165,53),(110,141),(69,3),(69,56),(56,173),(143,37),
        ]

In [74]:
interchange_multiple_rows_in_csv("../data/processed/latent_vector_3.csv", change, output_file="../data/processed/latent_vector_3_new.csv")

Rows have been interchanged and saved to ../data/processed/latent_vector_3_new.csv.


In [23]:
countries_of_interest_set = set(countries_of_interest)
top_30_countries_set = set(top_30_countries['country'].unique())
countries_of_interest_set - top_30_countries_set

{'Afghanistan',
 'Angola',
 'Australia',
 'Azerbaijan',
 'Bulgaria',
 'Colombia',
 'Costa Rica',
 'Denmark',
 'El Salvador',
 'Eritrea',
 'Ethiopia',
 'Georgia',
 'Honduras',
 'Iceland',
 'Italy',
 'Korea, Republic of',
 'Kuwait',
 'Lithuania',
 'Marshall Islands',
 'Micronesia, Federated States of',
 'Mongolia',
 'Nicaragua',
 'North Macedonia',
 'Palau',
 'Panama',
 'Philippines',
 'Rwanda',
 'Singapore',
 'Solomon Islands',
 'Tonga',
 'Türkiye',
 'Uganda',
 'Uzbekistan'}

# removing articles

In [34]:
with open("../data/processed/UN_data_with_narrative_answers_relevant.json") as f:
    data = json.load(f)

df = pd.DataFrame(data)
labels = df['country'].to_list()
countries = df['country'].to_list()
years = df['year'].to_list()

# Load the latent vectors
df_vectors = pd.read_csv("../data/processed/latent_vector_3_relevant.csv", header=None)

#add the year and countries column to the latent vectors
df_vectors['year'] = years
df_vectors['country'] = countries

# Define the Euclidean distance function
def euclidean_distance(vec1, vec2):
    return np.sqrt(np.sum((vec1 - vec2)**2))

# Create a new column 'distance'
df_vectors['distance'] = np.nan

# Iterate over each row in the dataframe
for i, row in df_vectors.iterrows():
    # Get the current country's latent vector and year
    current_vector = row[[0, 1, 2]].values
    current_year = row['year']
    current_country = row['country']
    
    # Get the US vector for the same year
    us_vector = df_vectors[(df_vectors['year'] == current_year) & (df_vectors['country'] == 'USA')].iloc[0][[0, 1, 2]].values
    distance = euclidean_distance(current_vector, us_vector)
    
    # Assign the distance to the DataFrame and format to avoid scientific notation
    df_vectors.at[i, 'distance'] = f'{distance:.10f}'

  df_vectors.at[i, 'distance'] = f'{distance:.10f}'


In [35]:
import pycountry
#df_vectors_2003 = df_vectors[df_vectors['year'] == 2004]
top_30_countries = df_vectors.sort_values('distance',ascending=False).head(100)
country_names = {country.alpha_3: country.name for country in pycountry.countries}
top_30_countries['country'] = top_30_countries['country'].map(country_names)
top_30_countries['match'] = top_30_countries['country'].apply(lambda x: x in countries_of_interest)
#list all unique countries where match is True
top_30_countries[top_30_countries['match'] == False]['country'].unique()

array(['Bolivia, Plurinational State of', 'Haiti', 'Nigeria',
       'Syrian Arab Republic', 'Sudan', 'Guatemala', 'Switzerland',
       'India', 'Saint Lucia', 'Cambodia', 'Belarus',
       'Iran, Islamic Republic of', 'Bangladesh', 'Mali', 'Cyprus',
       'Paraguay', 'Belize', 'Mozambique', 'Liechtenstein', 'Argentina',
       'Venezuela, Bolivarian Republic of', 'Palestine, State of',
       'Jordan', 'Bhutan', 'Cameroon', 'Israel', 'Samoa', "Côte d'Ivoire",
       'Liberia', 'Holy See (Vatican City State)', 'Peru', 'Botswana',
       'Mexico', 'Kiribati', 'Zambia', 'Maldives', 'Guinea-Bissau',
       'Finland', 'Malawi', 'Cuba', 'Bahamas', 'Slovenia', 'Niger',
       'Uruguay', 'Algeria', 'Moldova, Republic of',
       'Bosnia and Herzegovina', 'South Africa', 'Brunei Darussalam',
       'Malta', 'Gambia', nan, 'Eswatini', 'Kenya',
       'Saint Kitts and Nevis', "Korea, Democratic People's Republic of",
       'Ghana', 'Benin', 'Central African Republic', 'Sierra Leone',
       '