In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats
import statsmodels.stats.api as sms
import matplotlib.pyplot as plt
import matplotlib.style as style
import plotly.graph_objects as go
import plotly.express as px
from matplotlib.colors import LinearSegmentedColormap
# Use seaborn's 'whitegrid' theme for seaborn/matplotlib figures.
sns.set_theme(style='whitegrid')

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
# Directory holding the model input files. On Windows, point this at
# "N:/johnson/linkedin_recruiter/inputs" instead.
INPUT_DIR = "~/Nextcloud/linkedin_recruiter/inputs"

# Main model input table.
df = pd.read_csv(f"{INPUT_DIR}/model_input.csv", low_memory=False)
# Reciprocal-pairs version of the input (per the file name).
square_df = pd.read_csv(f"{INPUT_DIR}/model_input_recip_pairs.csv", low_memory=False)

In [4]:
# Continuous variables in the model inputs (order is significant to callers
# that iterate over this list).
cont_cols = [
    # flow measures
    'flow', 'net_flow', 'net_rate_100',
    # user counts at origin / destination
    'users_orig', 'users_dest',
    # origin and destination attributes
    'pop_orig', 'gdp_orig', 'hdi_orig',
    'pop_dest', 'gdp_dest', 'hdi_dest',
    'area_orig', 'area_dest',
    'internet_orig', 'internet_dest',
    # distance measures
    'dist_biggest_cities', 'dist_pop_weighted', 'dist_unweighted',
    # presumably language-related measures -- TODO confirm against the data dictionary
    'csl', 'cnl', 'prox2',
]
def get_log_cols(continuous_cols):
    """Return the names of the columns that should be log-transformed.

    The result is, in order:
      1. ``users``/``pop``/``gdp``/``area`` suffixed with ``_dest``, then the
         same four suffixed with ``_orig``;
      2. the fixed names ``flow``, ``csl``, ``cnl`` and ``prox2``
         (only the count of flow is used for now);
      3. every entry of ``continuous_cols`` whose name contains ``'dist'``.

    Parameters
    ----------
    continuous_cols : iterable of str
        Candidate column names; only those containing ``'dist'`` are used.

    Returns
    -------
    list of str
    """
    log_vars = ['users', 'pop', 'gdp', 'area']
    selected = []
    for side in ['dest', 'orig']:
        for stem in log_vars:
            selected.append(f'{stem}_{side}')
    selected.extend(['flow', 'csl', 'cnl', 'prox2'])
    selected.extend(name for name in continuous_cols if 'dist' in name)
    return selected
# Columns to log-transform, derived from the continuous-variable list.
log_cols = get_log_cols(cont_cols)

In [5]:
def log_tform(df, log_cols):
    """Add a natural-log version of each listed column to ``df``.

    For every name in ``log_cols`` a column ``log_<name>`` holding
    ``np.log(df[<name>])`` is written onto ``df`` itself (the frame is
    modified in place), and that same frame object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``log_cols``.
    log_cols : iterable of str
        Names of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        The input frame, with the ``log_`` columns added or overwritten.
    """
    for source in log_cols:
        target = f'log_{source}'
        df[target] = np.log(df[source])
    return df

In [6]:
# Add the log_<col> columns to the main frame (log_tform writes them onto the
# frame it is given and returns that same frame).
df = log_tform(df, log_cols)
# The reciprocal-pairs frame additionally gets log_net_rate_100.
# NOTE(review): if net_rate_100 can be zero or negative, np.log yields
# -inf/NaN for those rows -- confirm this is intended.
square_df = log_tform(square_df, log_cols + ['net_rate_100'])

In [7]:
# Column groups.
ID_COLS = ['country_orig', 'country_dest', 'query_date']
# NOTE(review): the log_ columns were already added to df at this point, so
# this also matches the log_dist_* columns -- confirm that is intended.
DIST_COLS = [name for name in df.columns if 'dist' in name]
BINARY = ['contig', 'comlang_ethno', 'colony', 'comcol', 'curcol', 'col45', 'col']

# Categorical version of `comcol` (used as the colour key in later plots):
# 0 -> 'No', 1 -> 'Yes', missing -> 'Unknown'.
COMCOL_LABELS = {0: 'No', 1: 'Yes'}
for frame in (df, square_df):
    frame['comcol_categ'] = frame['comcol'].replace(COMCOL_LABELS).fillna('Unknown')

# EU-only subsets (rows where eu == 1) of both frames.
eu = df.loc[df['eu'] == 1]
square_eu = square_df.loc[square_df['eu'] == 1]

### Number of country pairs

In [9]:
def get_num_pairs(df, col_name):
    """Count rows per ``query_date`` and return them as a one-column frame.

    The count is the number of non-null ``iso3_orig`` values in each
    ``query_date`` group; the resulting column is named ``col_name`` and the
    frame is indexed by ``query_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``query_date``, ``iso3_orig`` and ``iso3_dest``.
    col_name : str
        Name given to the count column.

    Returns
    -------
    pandas.DataFrame
    """
    per_date = df.groupby('query_date')[['iso3_orig', 'iso3_dest']].count()
    # Only the origin count is kept; it becomes the labelled column.
    origin_only = per_date[['iso3_orig']]
    return origin_only.rename(columns={'iso3_orig': col_name})
# Per-date pair counts for each subset, placed side by side (one column each).
pair_count_specs = [
    (df, 'pairs'),
    (df[df['by_date_recip'] == 1], 'reciprocal pairs (by date)'),
    (df[df['recip'] == 1], 'reciprocal pairs (across dates)'),
    (eu, 'EU pairs'),
    (eu[eu['by_date_recip'] == 1], 'EU reciprocal pairs (by date)'),
    (eu[eu['recip'] == 1], 'EU reciprocal pairs (across dates)'),
]
pd.concat(
    [get_num_pairs(subset, label) for subset, label in pair_count_specs],
    axis=1,
)

Unnamed: 0_level_0,pairs,reciprocal pairs (by date),reciprocal pairs (across dates),EU pairs,EU reciprocal pairs (by date),EU reciprocal pairs (across dates)
query_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-07-25,4582,1524,890,339,162,90
2020-10-08,4520,1450,890,333,160,90
2020-10-20,4704,1526,890,327,160,90
2020-11-04,4738,1478,890,313,138,90
2020-11-19,4694,1474,890,310,136,90
2020-12-01,4682,1474,890,309,134,90
2020-12-16,4662,1452,890,315,140,90
2020-12-31,4599,1422,890,302,126,90
2021-01-12,4516,1454,890,309,140,90
2021-01-27,4645,1420,890,307,132,90


# Only keep reciprocal pairs across all dates for all plots below

## Time

In [24]:
# Box plot of log flow from the reciprocal-pairs frame, one box per query date.
# Rows are sorted by query_date first, presumably so the boxes/legend follow
# chronological order.
fig = px.box(
    square_df.sort_values(by='query_date'), y="log_flow",
    hover_data=['country_orig', 'country_dest', 'flow'],
    color='query_date',
    title='Global distribution of migration aspirations (bilateral flow)',
    # Only columns that appear in the figure need a display label.
    labels={'log_flow': 'Migration Aspirations (log(number))',
            'country_dest': 'Destination', 'country_orig': 'Origin'}
)
fig.show()

In [31]:
# EU-only box plot of log flow, one box per query date.
fig = px.box(
    data_frame=square_eu.sort_values('query_date'),
    y='log_flow',
    color='query_date',
    hover_data=['country_orig', 'country_dest', 'flow'],
    labels={
        'log_flow': 'Migration Aspirations (log(number))',
        'country_dest': 'Destination',
        'country_orig': 'Origin',
    },
    title='EU distribution of migration aspirations (bilateral flow)',
)
fig.show()

In [27]:
# Box plot of the logged net rate per 100 from the reciprocal-pairs frame,
# one box per query date.
# NOTE(review): rows whose net_rate_100 is zero or negative have a -inf/NaN
# log value -- confirm how those should be treated in this figure.
fig = px.box(
    data_frame=square_df.sort_values('query_date'),
    y='log_net_rate_100',
    color='query_date',
    hover_data=['country_orig', 'country_dest', 'net_rate_100'],
    labels={
        'log_net_rate_100': 'Net Rate per 100 (log)',
        'country_dest': 'Destination',
        'country_orig': 'Origin',
    },
    title='Global distribution of net rate of migration aspirations, per 100',
)
fig.show()

In [32]:
# EU-only box plot of the logged net rate per 100, one box per query date.
fig = px.box(
    data_frame=square_eu.sort_values('query_date'),
    y='log_net_rate_100',
    color='query_date',
    hover_data=['country_orig', 'country_dest', 'net_rate_100'],
    labels={
        'log_net_rate_100': 'Net Rate per 100 (log)',
        'country_dest': 'Destination',
        'country_orig': 'Origin',
    },
    title='EU distribution of net rate of migration aspirations, per 100',
)
fig.show()

## Difference between country pairs that shared a common colonizer?

In [13]:
# Animated scatter of log flow against log destination GDP, one frame per
# query date, coloured by the comcol_categ value (Yes / No / Unknown).
# The figure is the cell's last expression, so the notebook displays it.
px.scatter(
    data_frame=square_df.sort_values('query_date'),
    x='log_gdp_dest',
    y='log_flow',
    color='comcol_categ',
    color_discrete_map={'Unknown': '#D3D3D3', 'Yes': "#d55e00", 'No': "#0072b2"},
    animation_frame="query_date",
    hover_data=['country_orig', 'country_dest', 'flow'],
    labels={
        'log_flow': 'Migration Aspirations (log(number))',
        'flow': 'Migration Aspirations (number)',
        'log_gdp_dest': 'Destination GDP (log)',
        'comcol_categ': 'Common Colonizer post 1945',
        'country_dest': 'Destination',
        'country_orig': 'Origin',
    },
    title='Global Migration Aspirations vs. Destination GDP',
)

In [33]:
# Animated box plot of log flow split by the comcol_categ value
# (Yes / No / Unknown), one animation frame per query date.
fig = px.box(
    data_frame=square_df.sort_values('query_date'),
    y='log_flow',
    color='comcol_categ',
    color_discrete_map={'Unknown': '#D3D3D3', 'Yes': "#d55e00", 'No': "#0072b2"},
    animation_frame="query_date",
    hover_data=['country_orig', 'country_dest', 'flow'],
    labels={
        'log_flow': 'Migration Aspirations (log(number))',
        'comcol_categ': 'Shared Common Colonizer post 1945',
        'country_dest': 'Destination',
        'country_orig': 'Origin',
    },
    title='Global distribution of bilateral migration aspirations',
)
fig.show()