In [1]:
import pandas as pd

In [2]:
# Load the full origin/destination table. It is in wide form, with one
# column per reference year.
immigrant_csv_path = "../dataset/immigrants/source-destination-data.csv"
immigrant_df = pd.read_csv(immigrant_csv_path)

In [3]:
# Sanity check: (rows, columns) of the freshly loaded wide table.
immigrant_df.shape

(28030, 12)

In [4]:
# Preview the raw rows. In the recorded output the year columns hold digit
# groups separated by spaces (e.g. "153 916 063"), i.e. text rather than numbers.
immigrant_df.head()

Unnamed: 0,"Region, development group, country or area of destination",Location code of destination,"Region, development group, country or area of origin",Location code of origin,1990,1995,2000,2005,2010,2015,2020,2024
0,World,900,World,900,153 916 063,163 176 002,174 566 152,192 788 721,221 020 392,250 042 020,275 284 032,304 021 813
1,World,900,Sub-Saharan Africa,1834,14 124 662,15 183 742,14 584 913,16 004 417,18 243 295,22 763 602,27 134 957,30 661 610
2,World,900,Northern Africa and Western Asia,1833,14 986 109,17 216 219,18 728 264,21 198 002,25 429 492,32 508 087,37 196 853,40 529 326
3,World,900,Central and Southern Asia,1831,30 342 957,27 930 630,30 008 559,32 445 580,39 400 759,46 011 893,48 594 959,53 948 417
4,World,900,Eastern and South-Eastern Asia,1832,14 465 509,17 262 816,20 822 011,24 315 849,30 053 666,34 562 856,38 223 520,41 409 235


In [5]:
# Replace the two long endpoint headers with the short Source/Target names
# used by the edge list built below.
# NOTE(review): the *destination* column becomes "Source" and the *origin*
# column becomes "Target", so an edge reads destination -> origin. Confirm
# that this direction is intended.
edge_column_names = {
    "Region, development group, country or area of destination": "Source",
    "Region, development group, country or area of origin": "Target",
}
immigrant_df = immigrant_df.rename(columns=edge_column_names)

In [6]:
# Confirm the two endpoint columns now appear as Source and Target.
immigrant_df.head()

Unnamed: 0,Source,Location code of destination,Target,Location code of origin,1990,1995,2000,2005,2010,2015,2020,2024
0,World,900,World,900,153 916 063,163 176 002,174 566 152,192 788 721,221 020 392,250 042 020,275 284 032,304 021 813
1,World,900,Sub-Saharan Africa,1834,14 124 662,15 183 742,14 584 913,16 004 417,18 243 295,22 763 602,27 134 957,30 661 610
2,World,900,Northern Africa and Western Asia,1833,14 986 109,17 216 219,18 728 264,21 198 002,25 429 492,32 508 087,37 196 853,40 529 326
3,World,900,Central and Southern Asia,1831,30 342 957,27 930 630,30 008 559,32 445 580,39 400 759,46 011 893,48 594 959,53 948 417
4,World,900,Eastern and South-Eastern Asia,1832,14 465 509,17 262 816,20 822 011,24 315 849,30 053 666,34 562 856,38 223 520,41 409 235


In [7]:
# Step 2 (setup): choose the year columns that will be reshaped from wide to long.
# The labels are strings because they must match the DataFrame column names.
# Every fifth year from 1990 through 2020 is selected.
# NOTE(review): the table also has a "2024" column that is not selected here;
# confirm the omission is intentional.
year_columns = [str(year) for year in range(1990, 2021, 5)]

In [8]:
# Show the selected year labels.
year_columns

['1990', '1995', '2000', '2005', '2010', '2015', '2020']

In [9]:
# Wide -> long: produce one row per (Source, Target, Year), with the cell value
# stored under "Migrated". Columns listed in neither id_vars nor value_vars
# (the location codes, any unselected years) are left out of the result.
edge_id_columns = ["Source", "Target"]
immigrant_edge_df = pd.melt(
    immigrant_df,
    id_vars=edge_id_columns,
    value_vars=year_columns,
    var_name="Year",
    value_name="Migrated",
)

In [10]:
# Expect len(immigrant_df) * len(year_columns) rows and 4 columns after the melt.
immigrant_edge_df.shape

(196210, 4)

In [11]:
# to_dict() exposes the underlying Python values: in the recorded output both
# Year and Migrated are still strings, and Migrated contains embedded spaces.
immigrant_edge_df.head().to_dict()

{'Source': {0: 'World', 1: 'World', 2: 'World', 3: 'World', 4: 'World'},
 'Target': {0: 'World',
  1: 'Sub-Saharan Africa',
  2: 'Northern Africa and Western Asia',
  3: 'Central and Southern Asia',
  4: 'Eastern and South-Eastern Asia'},
 'Year': {0: '1990', 1: '1990', 2: '1990', 3: '1990', 4: '1990'},
 'Migrated': {0: '153 916 063',
  1: '14 124 662',
  2: '14 986 109',
  3: '30 342 957',
  4: '14 465 509'}}

In [12]:
# Strip the space thousands-separators from Migrated and store it as a real
# integer column, instead of leaving digit strings behind.
# - astype(str) makes the cell safe to re-run once the column is already numeric.
# - errors="coerce" turns any entry that is not a number (missing values,
#   placeholder text) into a missing value rather than raising.
# - the nullable "Int64" dtype keeps whole numbers while still allowing those
#   missing values.
migrated_text = immigrant_edge_df["Migrated"].astype(str).str.replace(" ", "", regex=False)
immigrant_edge_df["Migrated"] = pd.to_numeric(migrated_text, errors="coerce").astype("Int64")

# Year labels came from the column names, so they are strings; make them integers.
immigrant_edge_df["Year"] = immigrant_edge_df["Year"].astype(int)

In [13]:
# Preview after cleaning: Migrated no longer contains spaces.
immigrant_edge_df.head()

Unnamed: 0,Source,Target,Year,Migrated
0,World,World,1990,153916063
1,World,Sub-Saharan Africa,1990,14124662
2,World,Northern Africa and Western Asia,1990,14986109
3,World,Central and Southern Asia,1990,30342957
4,World,Eastern and South-Eastern Asia,1990,14465509


In [14]:
# Persist the long-format edge list. The path is relative to the parent
# directory, like every other read/write in this notebook; the previous
# "dataset/..." path pointed at a different location.
# NOTE: a later cell writes the GDP-enriched edges to this same file, so this
# version is an intermediate snapshot.
immigrant_edge_df.to_csv("../dataset/migration-edges.csv", index=False)

In [15]:
# Distinct endpoint names on each side of the edge list, in order of first appearance.
unique_sources = immigrant_edge_df["Source"].unique()
unique_targets = immigrant_edge_df["Target"].unique()

# Union of both sides: every Source name first, then any Target name not already seen.
all_unique_ids = pd.Series([*unique_sources, *unique_targets]).unique()

# Node table: each name serves as both the node's Id and its display Label.
nodes_df = pd.DataFrame({column: all_unique_ids for column in ("Id", "Label")})

In [16]:
# Number of distinct names on the Source side and on the Target side.
len(unique_sources), len(unique_targets)

(286, 285)

In [17]:
# Node count. It can exceed either side's count when some names occur on one side only.
nodes_df.shape

(287, 2)

In [18]:
# Preview the node table; Id and Label carry the same value in every row.
nodes_df.head()

Unnamed: 0,Id,Label
0,World,World
1,Sub-Saharan Africa,Sub-Saharan Africa
2,Northern Africa and Western Asia,Northern Africa and Western Asia
3,Central and Southern Asia,Central and Southern Asia
4,Eastern and South-Eastern Asia,Eastern and South-Eastern Asia


In [19]:
# Persist the node table (Id, Label) without the DataFrame index.
nodes_df.to_csv("../dataset/migration-nodes.csv", index=False)

In [23]:
# Load the GDP table in long form: one row per entity and year.
gdp_csv_path = "../dataset/gdp-worldbank/gdp-worldbank-constant-usd.csv"
gdp_df = pd.read_csv(gdp_csv_path)

In [24]:
# Sanity check: (rows, columns) of the raw GDP table.
gdp_df.shape

(11960, 4)

In [25]:
# Preview the raw GDP rows (Entity, Code, Year, GDP value).
gdp_df.head()

Unnamed: 0,Entity,Code,Year,GDP (constant 2015 US$)
0,Afghanistan,AFG,2000,6206548000.0
1,Afghanistan,AFG,2001,5621148000.0
2,Afghanistan,AFG,2002,7228796000.0
3,Afghanistan,AFG,2003,7867264000.0
4,Afghanistan,AFG,2004,7978516000.0


In [26]:
# Rename Entity -> Target so the GDP table shares a key name with the edge
# list, and shorten the GDP value header to "gdp".
gdp_column_names = {"Entity": "Target", "GDP (constant 2015 US$)": "gdp"}
gdp_df = gdp_df.rename(columns=gdp_column_names)

In [27]:
# Store GDP as whole numbers (the fractional part is truncated).
# The width is pinned to 64 bits: plain `int` maps to a platform-dependent
# NumPy integer, and the GDP values here (on the order of 1e13) do not fit
# in a 32-bit integer.
gdp_df["gdp"] = gdp_df["gdp"].astype("int64")

In [28]:
# The GDP table stores Year as integers, while year_columns holds the string
# column labels used by the melt step. Build a separate integer list instead
# of rebinding year_columns, so re-running the melt cell afterwards still
# finds its string labels.
gdp_years = [int(year) for year in year_columns]

# Keep only the GDP rows whose Year is one of the selected years.
filtered_gdp_df = gdp_df[gdp_df["Year"].isin(gdp_years)]

In [29]:
# Size of the GDP table after restricting it to the selected years.
filtered_gdp_df.shape

(1509, 4)

In [30]:
# Preview the filtered GDP rows; the original row index is retained, so it has gaps.
filtered_gdp_df.head()

Unnamed: 0,Target,Code,Year,gdp
0,Afghanistan,AFG,2000,6206547500
5,Afghanistan,AFG,2005,8874481000
10,Afghanistan,AFG,2010,15354613000
15,Afghanistan,AFG,2015,19134222000
20,Afghanistan,AFG,2020,20621957000


In [31]:
# filtered_gdp_df.to_csv("dataset/gdp-worldbank/filtered_gdp.csv", index=False)

In [32]:
# Attach GDP to each edge by matching the edge's Target name and Year against
# the GDP table. Target was renamed from the area-of-origin column, so the GDP
# attached is the origin area's. The inner join keeps only edges that have a
# GDP row for that (Target, Year) pair; all other edges are dropped.
edge_gdp_keys = ["Target", "Year"]
gdp_edge_merged_df = immigrant_edge_df.merge(filtered_gdp_df, on=edge_gdp_keys, how="inner")

In [33]:
# Make explicit that the GDP value describes the edge's Target endpoint.
gdp_edge_merged_df = gdp_edge_merged_df.rename(columns={"gdp": "target_country_gdp"})

In [34]:
# Preview the merged rows, including the Code column brought in from the GDP table.
gdp_edge_merged_df.head()

Unnamed: 0,Source,Target,Year,Migrated,Code,target_country_gdp
0,World,World,1990,153916063,OWID_WRL,36053260000000
1,Sub-Saharan Africa,World,1990,13802710,OWID_WRL,36053260000000
2,Northern Africa and Western Asia,World,1990,16863681,OWID_WRL,36053260000000
3,Central and Southern Asia,World,1990,25456471,OWID_WRL,36053260000000
4,Eastern and South-Eastern Asia,World,1990,6755128,OWID_WRL,36053260000000


In [35]:
# Keep only the columns needed in the exported edge list, in export order.
# This drops the Code column that came from the GDP table.
edge_output_columns = ["Source", "Target", "Year", "target_country_gdp", "Migrated"]
gdp_edge_merged_df = gdp_edge_merged_df[edge_output_columns]

In [36]:
# Edge count after the inner join with GDP, and the 5 retained columns.
gdp_edge_merged_df.shape

(101562, 5)

In [37]:
# Preview the final edge layout: Source, Target, Year, target_country_gdp, Migrated.
gdp_edge_merged_df.head()

Unnamed: 0,Source,Target,Year,target_country_gdp,Migrated
0,World,World,1990,36053260000000,153916063
1,Sub-Saharan Africa,World,1990,36053260000000,13802710
2,Northern Africa and Western Asia,World,1990,36053260000000,16863681
3,Central and Southern Asia,World,1990,36053260000000,25456471
4,Eastern and South-Eastern Asia,World,1990,36053260000000,6755128


In [38]:
# Export the GDP-enriched edge list for all selected years, without the index.
merged_edges_csv_path = "../dataset/migration-edges.csv"
gdp_edge_merged_df.to_csv(merged_edges_csv_path, index=False)

In [39]:
# Single-year slice: keep only the edges recorded for 2020.
is_year_2020 = gdp_edge_merged_df["Year"] == 2020
gdp_edge_2020_df = gdp_edge_merged_df.loc[is_year_2020]

In [40]:
# Number of edges in the 2020 slice.
gdp_edge_2020_df.shape

(14458, 5)

In [41]:
# Preview the 2020 slice; row labels are inherited from the merged frame, so they do not start at 0.
gdp_edge_2020_df.head()

Unnamed: 0,Source,Target,Year,target_country_gdp,Migrated
87104,World,World,2020,82677380000000,275284032
87105,Sub-Saharan Africa,World,2020,82677380000000,22040895
87106,Northern Africa and Western Asia,World,2020,82677380000000,48992329
87107,Central and Southern Asia,World,2020,82677380000000,17736883
87108,Eastern and South-Eastern Asia,World,2020,82677380000000,21161853


In [42]:
# Export the 2020-only edge list, without the index.
edges_2020_csv_path = "../dataset/migration2020-edges.csv"
gdp_edge_2020_df.to_csv(edges_2020_csv_path, index=False)