# Cultural Network

## Potential Datasets

- [Cultural Distance](https://world.culturalytics.com/table?countryA=&countryB=&countries=All&dimension=All&question&years=2010-2014&years=2005-2009&confidenceInterval=false&level=dimension&search=&appearance=flag-name): pairwise cultural distances between countries along many different dimensions (e.g. altruism, politics, economy)
- [Airplane Data](https://opensky-network.org/data)

## Create Network Dataset

In [10]:
import pandas as pd
import numpy as np
import os
import country_converter as coco
import re

In [18]:
# Folder containing the cultural-distance CSV exports.
data_folder = 'data/cultural_distance'

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# order of `data_frames` decides the column order of the merged table later on.
data_paths = sorted(os.path.join(data_folder, path) for path in os.listdir(data_folder))

# Maps the name captured from the file name (hyphens replaced by underscores)
# to the raw table read from that file.
data_frames = {}

for path in data_paths:
    # Only the '...table.csv' exports are loaded.
    if not path.endswith('table.csv'):
        continue

    match = re.search(r'99-countries-(.*?)-1981', path)
    # Check the match *before* calling .group(): any other file in the folder
    # (e.g. a hidden file) would otherwise raise AttributeError on None.
    if match is None:
        continue

    data_frames[match.group(1).replace('-', '_')] = pd.read_csv(path)

In [43]:
# One long-format edge list per entry of `data_frames`; each keeps the distance
# in a column named after the dictionary key so the lists can be merged later.
edge_lists = []

# Year-range columns carried alongside every edge.
period_columns = ['year_max_a', 'year_min_a', 'year_max_b', 'year_min_b']

for key, df_temp in data_frames.items():
    # Wide matrix ('Name' column + one column per country label) -> long pairs.
    df_edge_list = (
        df_temp.rename(columns={'Name': 'country_a'})
        .melt('country_a', value_name='distance', var_name='country_b')
    )

    # Each label embeds a 'YYYY-YYYY' range: split it into integer min/max
    # columns and strip it from the country label.
    for side in ('a', 'b'):
        country_col = f'country_{side}'
        period = df_edge_list[country_col].str.extract(r'(\d{4})-(\d{4})')
        df_edge_list[f'year_min_{side}'] = period[0].astype(int)
        df_edge_list[f'year_max_{side}'] = period[1].astype(int)
        df_edge_list[country_col] = df_edge_list[country_col].str.replace(r'\d{4}-\d{4}', '', regex=True)

    # Keep only pairs whose two labels end in the same year and drop self-loops.
    # This is loop-invariant, so it is done once instead of once per year.
    same_period = df_edge_list['year_max_a'] == df_edge_list['year_max_b']
    distinct_countries = df_edge_list['country_a'] != df_edge_list['country_b']
    df_edges = df_edge_list.loc[
        same_period & distinct_countries,
        ['country_a', 'country_b', 'distance', *period_columns],
    ].rename(columns={'distance': key})

    # Group the rows year by year, in order of first appearance of each year.
    # The slices are collected in a list and concatenated once, rather than
    # growing a DataFrame with pd.concat inside the loop from an empty frame.
    year_slices = [
        df_edges[df_edges['year_max_a'] == year]
        for year in df_edge_list['year_max_a'].unique()
    ]
    if year_slices:
        df_edge_list_year_combined = pd.concat(year_slices, ignore_index=True)
    else:
        # Empty input table: nothing to concatenate.
        df_edge_list_year_combined = df_edges.reset_index(drop=True)

    edge_lists.append(df_edge_list_year_combined)

In [44]:
# Combine the per-dimension edge lists into one table with one distance column
# per dimension. The outer join keeps pairs that are missing in some dimensions.
merge_keys = ['country_a', 'country_b', 'year_max_a', 'year_min_a', 'year_max_b', 'year_min_b']

df = edge_lists[0]

for d in edge_lists[1:]:
    df = df.merge(d, how='outer', on=merge_keys)

# Drop every edge touching 'Serbia and Montenegro'. This runs once after all
# merges (not inside the loop) so it also applies when there is only a single
# edge list. .copy() makes the result an independent frame, so the column
# assignments below do not write into a slice.
df = df[(df['country_a'] != 'Serbia and Montenegro') & (df['country_b'] != 'Serbia and Montenegro')].copy()

# Convert all country names to ISO3 codes with one coco.convert call per column.
df['ISO3_a'] = coco.convert(df['country_a'].tolist(), to='ISO3')
df['ISO3_b'] = coco.convert(df['country_b'].tolist(), to='ISO3')

In [45]:
# Expand every edge into one row per calendar year of its
# [year_min_a, year_max_a] range (inclusive), stored in a new 'year' column.
# Done with array arithmetic instead of iterrows(); also works on an empty frame.

# Number of years covered by each row (0 if the range is inverted).
span_lengths = (df['year_max_a'] - df['year_min_a'] + 1).clip(lower=0).to_numpy()

# Positional index of the source row for every expanded row.
row_positions = np.repeat(np.arange(len(df)), span_lengths)
df_repeated = df.iloc[row_positions].copy()

# Offset of each expanded row within its own span: 0, 1, ..., length - 1.
span_starts = np.repeat(np.cumsum(span_lengths) - span_lengths, span_lengths)
year_offsets = np.arange(len(row_positions)) - span_starts

df_repeated['year'] = df_repeated['year_min_a'].to_numpy() + year_offsets
df_year_all = df_repeated.reset_index(drop=True)

In [46]:
# Display the per-year edge table (one row per country pair and year).
df_year_all

Unnamed: 0,country_a,country_b,36_dimensions,year_max_a,year_min_a,year_max_b,year_min_b,altruism,authoritarianism,economy,egalitarianism,finance,liberitarianism,neoliberalism_capitalism,norms_tradition,politics,ISO3_a,ISO3_b,year
0,Albania,Algeria,0.0970,2004,1999,2004,1999,0.0587,0.0431,0.0475,0.0749,0.0968,0.1216,0.0678,0.0856,0.0689,ALB,DZA,1999
1,Albania,Algeria,0.0970,2004,1999,2004,1999,0.0587,0.0431,0.0475,0.0749,0.0968,0.1216,0.0678,0.0856,0.0689,ALB,DZA,2000
2,Albania,Algeria,0.0970,2004,1999,2004,1999,0.0587,0.0431,0.0475,0.0749,0.0968,0.1216,0.0678,0.0856,0.0689,ALB,DZA,2001
3,Albania,Algeria,0.0970,2004,1999,2004,1999,0.0587,0.0431,0.0475,0.0749,0.0968,0.1216,0.0678,0.0856,0.0689,ALB,DZA,2002
4,Albania,Algeria,0.0970,2004,1999,2004,1999,0.0587,0.0431,0.0475,0.0749,0.0968,0.1216,0.0678,0.0856,0.0689,ALB,DZA,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59079,Zimbabwe,Yemen,0.0746,2014,2010,2014,2010,0.0594,0.0412,0.0750,0.0904,0.0651,0.1059,0.0933,0.0846,0.0451,ZWE,YEM,2010
59080,Zimbabwe,Yemen,0.0746,2014,2010,2014,2010,0.0594,0.0412,0.0750,0.0904,0.0651,0.1059,0.0933,0.0846,0.0451,ZWE,YEM,2011
59081,Zimbabwe,Yemen,0.0746,2014,2010,2014,2010,0.0594,0.0412,0.0750,0.0904,0.0651,0.1059,0.0933,0.0846,0.0451,ZWE,YEM,2012
59082,Zimbabwe,Yemen,0.0746,2014,2010,2014,2010,0.0594,0.0412,0.0750,0.0904,0.0651,0.1059,0.0933,0.0846,0.0451,ZWE,YEM,2013


In [47]:
# Persist the per-year edge table as Parquet; the DataFrame index is not written.
df_year_all.to_parquet('data/culture_distance.parquet', index=False)

## Node Features

- Hofstede Culture Dataset

- Religion
- Ethnology

### Hofstede

In [56]:
# Load the semicolon-separated Hofstede table, attach an ISO3 code per country
# and drop rows whose ISO3 is missing.
# NOTE(review): unmatched names appear as the string 'not found' (filtered
# below), so the NaN drop alone does not remove them from df_hofstede.
df_hofstede = (
    pd.read_csv('data/6-dimensions-for-website-2015-08-16.csv', sep=';')
    .assign(ISO3=lambda frame: coco.convert(frame['country'].tolist(), to='ISO3'))
    .dropna(subset=['ISO3'])
)

# Every year from 2000 up to and including the current calendar year.
years = list(range(2000, pd.Timestamp.now().year + 1))

# ISO3 codes of the countries that could be matched.
countries = df_hofstede.loc[df_hofstede['ISO3'] != 'not found', 'ISO3'].unique()

# Country x year grid; the same Hofstede values are attached to every year.
hofstede_years = pd.DataFrame(
    [(iso3, yr) for iso3 in countries for yr in years],
    columns=['ISO3', 'year'],
)
df_hofstede_years = hofstede_years.merge(df_hofstede, on='ISO3', how='left')

# Keep the identifiers plus the six dimension columns.
hofstede_dimensions = ['pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr']
df_hofstede_years_final = df_hofstede_years[['ISO3', 'year', *hofstede_dimensions]]

Africa East not found in regex
Africa West not found in regex
Arab countries not found in regex
Germany East not found in regex


### Religion

In [58]:
# NOTE(review): df_religion is only created by the read_csv cell further down in
# this section, so on a fresh kernel (Restart & Run All) this cell raises
# NameError unless it is moved below that cell.
df_religion.columns

Index(['Region', 'Country', 'Year', 'Population', 'Christians', 'Muslims',
       'Religiously_unaffiliated', 'Buddhists', 'Hindus', 'Jews',
       'Other_religions', 'Level', 'Countrycode', 'ISO3'],
      dtype='object')

In [63]:
# Columns holding the share of each religious group.
religions_list = ['Christians', 'Muslims', 'Religiously_unaffiliated', 'Buddhists', 'Hindus', 'Jews', 'Other_religions']

df_religion = pd.read_csv('data/Religious Composition 2010-2020 (percentages).csv')

# Keep only Level == 1 rows (presumably the country-level records - TODO confirm).
# .copy() so that adding the ISO3 column below does not write into a slice.
df_religion = df_religion[df_religion['Level'] == 1].copy()
df_religion['ISO3'] = coco.convert(df_religion['Country'].tolist(), to='ISO3')

# Drop countries without an ISO3 code. The converter fills unmatched names with
# the string 'not found' rather than NaN, so notna() alone would keep them
# (e.g. the 'Channel Islands' rows reported in this cell's output).
has_iso3 = df_religion['ISO3'].notna() & (df_religion['ISO3'] != 'not found')
df_religion = df_religion[has_iso3]

# Node features: identifiers plus the religion share columns.
df_religion_final = df_religion[['ISO3', 'Year', *religions_list]]


Channel Islands not found in regex
Channel Islands not found in regex


In [64]:
# Display the religion node-feature table (one row per ISO3 code and Year).
df_religion_final

Unnamed: 0,ISO3,Year,Christians,Muslims,Religiously_unaffiliated,Buddhists,Hindus,Jews,Other_religions
14,AFG,2010,0.100224,99.749184,0.008652,0.019986,0.000355,0.000106,0.121495
15,AFG,2020,0.019379,99.861969,0.008457,0.020001,0.000128,0.000026,0.090043
16,ALB,2010,20.283470,70.152901,9.537429,0.000000,0.000000,0.009594,0.016603
17,ALB,2020,17.815659,74.507225,7.652887,0.000158,0.000708,0.010050,0.013316
18,DZA,2010,0.290679,98.571457,1.084724,0.014110,0.000000,0.000152,0.038876
...,...,...,...,...,...,...,...,...,...
411,YEM,2020,0.055731,99.863022,0.061555,0.000547,0.007744,0.000202,0.011201
412,ZMB,2010,97.682922,0.505516,0.079393,0.002175,0.018199,0.001233,1.710560
413,ZMB,2020,98.280380,0.511289,0.059754,0.001902,0.016061,0.001004,1.129608
414,ZWE,2010,84.445267,0.502715,12.482901,0.043459,0.048872,0.003115,2.473672


### Nationalities

In [52]:
# Load the comma-separated UN data export used for the nationality features.
df_nationalities = pd.read_csv('data/UNdata_Export_20250709_153145548.csv', sep=',')

In [13]:
# Restrict the table to the aggregate records: both sexes combined and the
# 'Total' area, then display the result.
df_nationalities = df_nationalities.loc[
    df_nationalities['Sex'].eq('Both Sexes') & df_nationalities['Area'].eq('Total')
]
df_nationalities

Unnamed: 0,Country or Area,Year,Area,Sex,National and/or ethnic group,Record Type,Reliability,Source Year,Value,Value Footnotes
0,Åland Islands,2000,Total,Both Sexes,Total,Census - de jure - complete tabulation,"Final figure, complete",2009.0,25776.0,1
1,Åland Islands,2000,Total,Both Sexes,Finnish,Census - de jure - complete tabulation,"Final figure, complete",2009.0,5109.0,1
2,Åland Islands,2000,Total,Both Sexes,Swedish,Census - de jure - complete tabulation,"Final figure, complete",2009.0,1354.0,1
3,Åland Islands,2000,Total,Both Sexes,Other,Census - de jure - complete tabulation,"Final figure, complete",2009.0,552.0,1
4,Åland Islands,2000,Total,Both Sexes,Åland,Census - de jure - complete tabulation,"Final figure, complete",2009.0,18682.0,1
...,...,...,...,...,...,...,...,...,...,...
58616,Zambia,2000,Total,Both Sexes,African,Census - de facto - complete tabulation,"Final figure, complete",2007.0,9294154.0,
58617,Zambia,2000,Total,Both Sexes,American,Census - de facto - complete tabulation,"Final figure, complete",2007.0,1198.0,
58618,Zambia,2000,Total,Both Sexes,Asians,Census - de facto - complete tabulation,"Final figure, complete",2007.0,11848.0,
58619,Zambia,2000,Total,Both Sexes,European,Census - de facto - complete tabulation,"Final figure, complete",2007.0,6182.0,


In [14]:
# Look up the rows for Switzerland; the saved output of this cell contains
# only the header, i.e. no rows matched this exact name.
df_nationalities[df_nationalities['Country or Area'] == 'Switzerland']

Unnamed: 0,Country or Area,Year,Area,Sex,National and/or ethnic group,Record Type,Reliability,Source Year,Value,Value Footnotes


### Language

In [6]:
import geopandas as gpd
import pandas as pd

In [10]:
# Load the language dataset from CSV.
df_language = pd.read_csv('data/language_dataset.csv')