In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from itertools import product
import country_converter as coco

In [2]:
# data from CEPII -- http://www.cepii.fr/CEPII/en/bdd_modele/bdd_modele_item.asp?id=8
trade_df = pd.read_csv("../data/Gravity_V202211.csv")
trade_df = trade_df[trade_df["year"] == 2020]
trade_df['country_id_o'] = trade_df['country_id_o'].map(lambda x: x.replace('.2',''))
trade_df['country_id_d'] = trade_df['country_id_d'].map(lambda x: x.replace('.2',''))

  trade_df = pd.read_csv("../data/Gravity_V202211.csv")


In [3]:
# trade is DIRECTED -- take the average
key_columns = ["country_id_o", "country_id_d", "tradeflow_baci"]
ud_trade1 = trade_df[key_columns]
ud_trade2 = trade_df.rename(columns={"country_id_o":"country_id_d", "country_id_d":"country_id_o"})
ud_trade2 = ud_trade2[key_columns]
ud_trade = pd.concat([ud_trade1, ud_trade2])

ud_trade = ud_trade.groupby(['country_id_o', 'country_id_d'])['tradeflow_baci'].mean().reset_index()
ud_trade = ud_trade[ud_trade["country_id_o"] < ud_trade["country_id_d"]]

In [4]:
columns_to_keep = [
    "year",
    "country_id_o",
    "country_id_d",
    "iso3_o",
    "iso3_d",
    #"tradeflow_baci",
    "gmt_offset_2020_o",
    "gmt_offset_2020_d",
    "distw_harmonic",
    "dist",
    "scaled_sci_2021",
    "pop_o",
    "pop_d",
    "gdp_o",
    "gdp_d",
    "gdpcap_o",
    "gdpcap_d",
    "gdp_ppp_o",
    "gdp_ppp_d",
    "gdpcap_ppp_o",
    "gdpcap_ppp_d"
]

In [5]:
# bring back the key variables
ud_trade = pd.merge(
    ud_trade,
    trade_df[columns_to_keep],
    on=["country_id_o", "country_id_d"],
    how="left"
)

In [6]:
# add github data
gh_df = pd.read_csv("../data/economy_collaborators.csv")
gh_df = gh_df[(gh_df["source"] != "EU") & (gh_df["destination"] != "EU")]
gh_df = gh_df.dropna(subset=["source", "destination"])
gh_df["iso3_o"] = coco.convert(names=gh_df["source"], to="ISO3")
gh_df["iso3_d"] = coco.convert(names=gh_df["destination"], to="ISO3")
gh_df = gh_df.dropna(subset=["source", "destination"])

In [7]:
# github is DIRECTED -- take the average
gh_df = gh_df\
    .loc[gh_df["year"] == 2020]\
    .groupby(["year", "iso3_o", "iso3_d"]).agg(gh_vol = pd.NamedAgg("weight", np.mean))\
    .reset_index()\
    .rename(columns={"iso3_o":"country_id_o", "iso3_d":"country_id_d"})

key_columns = ["country_id_o", "country_id_d", "gh_vol"]
ud_gh1 = gh_df[key_columns]
ud_gh2 = gh_df.rename(columns={"country_id_o":"country_id_d", "country_id_d":"country_id_o"})
ud_gh2 = ud_gh2[key_columns]
ud_gh = pd.concat([ud_gh1, ud_gh2])

ud_gh = ud_gh.groupby(['country_id_o', 'country_id_d'])['gh_vol'].mean().reset_index()
ud_gh = ud_gh[ud_gh["country_id_o"] < ud_gh["country_id_d"]]

  .groupby(["year", "iso3_o", "iso3_d"]).agg(gh_vol = pd.NamedAgg("weight", np.mean))\


In [8]:
# join
gravity_df = pd.merge(
    ud_trade,
    ud_gh,
    on = ["country_id_o", "country_id_d"],
    how="left"
)

In [9]:
# export
gravity_df.to_csv("../outputs/data_gravity_regressions.csv", sep=";", index=False)