In [44]:
import pandas as pd
import os
import sys 
import numpy as np
from functools import reduce
import itertools
from ast import literal_eval #converts object list to list of strings
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import statsmodels.api as sm
from statsmodels.formula.api import ols

# this points to a Python file with the function country_mappings (not used)
from combine_country_regions import country_mappings

# not great practice, but this removes warnings from the output
import warnings
warnings.filterwarnings("ignore")

# display settings so I can see more on the screen
desired_width = 1000
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 50)

# (warnings are already silenced right after the imports; the filter does not
#  need to be installed a second time here)

# plots: wide white figures, no axis spines, thin dashed grid, constrained layout
settings = {
    'figure.figsize': (14, 4),
    'figure.dpi': 144,
    'figure.facecolor': 'w',
    'axes.spines.top': False,
    'axes.spines.bottom': False,
    'axes.spines.left': False,
    'axes.spines.right': False,
    'axes.grid': True,
    'grid.linestyle': '--',
    'grid.linewidth': 0.5,
    'figure.constrained_layout.use': True,
}
plt.rcParams.update(settings)


In [45]:

#############################################################
# Point these two locations at your own machine, either by editing the
# defaults below or by setting the environment variables BACI_PROJECT_DIR and
# BACI_DATA_DIR before starting Jupyter.
#   - project folder: becomes the working directory, so every relative path
#     used later (baci_preparation/..., data/..., cached CSVs) resolves here
#   - baci_data: folder that holds the BACI CSV files
#############################################################
os.chdir(os.environ.get("BACI_PROJECT_DIR", r'C:\Users\jpark\VisualStudio\Simpsons_BACI\\'))
baci_data = os.environ.get("BACI_DATA_DIR", r"C:\Users\jpark\Downloads\BACI_HS92_V202401b")


In [46]:

# All paths are built with os.path.join instead of "\name" string pieces:
# sequences such as "\c" or "\p" in a normal (non-raw) string are invalid
# escapes that Python only tolerates with a warning.

# points to country codes as defined by BACI
COUNTRY_CODES = os.path.join(baci_data, "country_codes_V202401b.csv")
# point to product codes
PRODUCT_DESCRIPTION = os.path.join(baci_data, "product_codes_HS92_V202401b.csv")
# add region data, might be better sources
ADD_REGIONS = os.path.join("baci_preparation", "iso_countries_regions.csv")
# add short HS2 description (could be better descriptions)
SHORT_CODES = os.path.join("baci_preparation", "hs6twodigits.csv")
# add long product description (same file as PRODUCT_DESCRIPTION)
LONG_DESCRIPTION = os.path.join(baci_data, "product_codes_HS92_V202401b.csv")
# add gdp data
GDP_DATA = os.path.join("baci_preparation", "global_gdp.csv")

class baci:
    '''Load BACI trade data and attach descriptive columns (ISO3 country codes, product descriptions, GDP).

    File locations are taken from the module-level constants COUNTRY_CODES,
    PRODUCT_DESCRIPTION, SHORT_CODES and LONG_DESCRIPTION.
    '''

    @staticmethod
    def _hs6_key(products):
        '''Return product codes as text, left-padded with zeros to six characters (10110 -> "010110").'''
        return products.astype(str).str.zfill(6)

    @staticmethod
    def _country_code_to_iso3(df, iso, column):
        '''Replace the numeric country code held in `column` by the matching ISO3 code from `iso`.'''
        df = df.merge(iso, left_on=column, right_on="country_code", how="left")
        df = df.drop(columns=['country_code', column])
        return df.rename(columns={"country_iso3": column})

    @staticmethod
    def _gdp_for_year(GDP, year_key, role):
        '''Return (frame, value column name, key column name) holding one year of GDP, one row per GDP column.

        `role` is "Exporter" or "Importer" and only affects the column names.
        Raises KeyError when `year_key` is not among the GDP index labels.
        '''
        value_col = year_key + "_gdp_" + role
        key_col = role + "_gdp"

        # compare labels as text so both '2020' and 2020 find the same row
        in_year = GDP.index.astype(str) == year_key
        if not in_year.any():
            raise KeyError("year " + year_key + " not found in GDP index")

        gdp = GDP[in_year].T
        gdp.columns = gdp.columns.astype(str)
        gdp = gdp.rename(columns={year_key: value_col})
        gdp[key_col] = gdp.index
        return gdp, value_col, key_col

    def readindata(self, bacidata, verbose = False, tmp_save = True) -> pd.DataFrame:
        '''Read one BACI csv file and return it with readable column names and ISO3 country codes.

        Parameters
        ----------
        bacidata : path of the BACI csv file (columns t, i, j, k, v, q are read)
        verbose  : when True, print the set of two-character product prefixes and its size
        tmp_save : currently unused; kept so existing calls keep working

        Returns
        -------
        DataFrame with columns Year, Product, Value, Quantity, Exporter, Importer,
        restricted to rows with Value > 0. Product is returned as int, so leading
        zeros of the code are not preserved.
        '''
        df1 = pd.read_csv(bacidata, usecols=['t','i','j','k','v','q'],
                          dtype= {'t': 'int64',
                                  'i': 'int64',
                                  'j': 'int64',
                                  'k': 'object',
                                  'v': 'float64',
                                  'q': 'object'}
                          )

        # Quantity is read as text because missing values are written as a
        # space-padded 'NA'. Strip, blank out 'NA', then convert to float.
        # (.str.strip also copes with values pandas already parsed as NaN.)
        quantity = df1['q'].str.strip()
        df1['q'] = quantity.where(quantity != 'NA').astype(float)

        # rename columns to make them meaningful to humans
        df1 = df1.rename(columns={'t': 'Year', 'i': 'Exporter', 'j': 'Importer',
                                  'k': 'Product', 'v': 'Value', 'q': 'Quantity'})

        ROW_COUNT = df1.shape[0]
        print("ROW_COUNT ORIGINAL: ", ROW_COUNT)

        # replace the numeric code by the ISO3 code, first exporter then importer
        iso1 = pd.read_csv(COUNTRY_CODES, usecols=['country_code', 'country_iso3'])
        df1 = baci._country_code_to_iso3(df1, iso1, "Exporter")
        df1 = baci._country_code_to_iso3(df1, iso1, "Importer")

        # Keep only strictly positive trade values.
        # NOTE(review): an earlier comment mentioned a 10.00 cut-off to tame an
        # outlier in the 2015 exporter count; the filter actually applied is > 0.
        # .copy() so the column assignment below works on an independent frame.
        df1 = df1[df1['Value'] > 0.00].copy()

        # if verbose is True, this will print out
        if verbose:
            hcodes = [str(x)[0:2] for x in df1["Product"]]
            print(set(hcodes))
            print(len(set(hcodes)))

        # make product code an int, otherwise its an object which can be confusing
        df1['Product'] = df1['Product'].astype(int)

        ROW_COUNT = df1.shape[0]
        print("ROW_COUNT ORIGINAL2: ", ROW_COUNT)

        return df1

    def addprodcode(self, data):
        '''Left-join the product description (columns 'code' and 'description') onto `data`.

        The join key is the six-character, zero-padded text form of `Product`,
        because `Product` is an int after readindata while the codes in the
        description file are text; merging the two directly is rejected by pandas.
        The `Product` column itself is returned unchanged.
        '''
        prodcodes = pd.read_csv(PRODUCT_DESCRIPTION, usecols=['code', 'description'], dtype=str)
        # product '9999AA' appears to be a filler--empty
        prodcodes = prodcodes[prodcodes['code'] != '9999AA']

        # left merge: every trade row is kept, descriptions without trade are ignored
        keyed = data.assign(_hs6_key=baci._hs6_key(data['Product']))
        data = keyed.merge(prodcodes, left_on="_hs6_key", right_on="code", how="left")
        data = data.drop(columns=['_hs6_key'])

        ROW_COUNT = data.shape[0]
        print("ROW_COUNT addprodcode: ", ROW_COUNT)

        return data

    def addshortdescriptoProdname(self, data):
        '''Add a 'code' column holding "<short description>_<two-digit prefix>" for each product.

        The SHORT_CODES file is expected to provide the columns 'code' and
        'product'. The merge is an inner join, so rows whose two-digit prefix is
        missing from that file are dropped; compare the printed row count with the
        input size to spot this. `data` itself is not modified.
        '''
        localdata = data.copy()

        prod_h6 = pd.read_csv(SHORT_CODES, dtype = str)
        # prefixes 1..9 are stored without the leading zero: 1 -> 01
        prod_h6['code'] = prod_h6['code'].astype(str).str.zfill(2)

        # same for the full product code: 10110 -> 010110
        localdata['code'] = baci._hs6_key(localdata['Product'])

        # get first two characters
        localdata['shrtDescription'] = localdata['code'].str[0:2]

        proddesc = localdata.merge(prod_h6, left_on="shrtDescription", right_on="code")
        proddesc['product'] = proddesc['product'] + "_" + proddesc['shrtDescription']

        # both frames carried a 'code' column, hence the _x/_y helper columns
        proddesc = proddesc.drop(columns = ['code_x', 'shrtDescription', 'code_y'])
        proddesc = proddesc.rename(columns = {"product": "code"})

        ROW_COUNT = proddesc.shape[0]
        print("ROW_COUNT addshortdescriptoProdname: ", ROW_COUNT)

        return proddesc

    def addlongdescription(self, data):
        '''Left-join the long product description onto `data`.

        In the returned frame `Product` is six-character zero-padded text and the
        description file's 'code' column appears as 'isocode'. Raises
        AssertionError if the join changes the number of rows.
        '''
        localdata = data.copy()
        longdesc = pd.read_csv(LONG_DESCRIPTION, dtype = str)

        # Whole-column replacement (not .loc[:, ...]) because the column changes
        # from int to text.
        localdata['Product'] = baci._hs6_key(localdata['Product'])

        longdesc = longdesc.rename(columns = {"code": "isocode"})
        longproddesc = localdata.merge(longdesc, left_on="Product", right_on="isocode",
                                       how = 'left', suffixes = ('x', 'y'))

        r1 = localdata.shape[0]
        r2 = longproddesc.shape[0]
        assert r1 == r2, "long description merge changed the row count"

        ROW_COUNT = longproddesc.shape[0]
        print("ROW_COUNT addlongdescription: ", ROW_COUNT)

        return longproddesc

    def add_gdp(self, data, GDP, year):
        '''Join exporter and importer GDP of one year onto `data`.

        Parameters
        ----------
        data : frame with 'Exporter' and 'Importer' columns whose values match the GDP column labels
        GDP  : frame indexed by year with one column per country/entity
        year : year to use, as str or int

        Returns
        -------
        `data` plus the columns '<year>_gdp_Exporter' and '<year>_gdp_Importer'.
        Both joins are inner joins, so rows without a GDP column on either side
        are dropped.

        Raises
        ------
        KeyError if `year` is not in the GDP index.
        '''
        year_key = str(year)

        # Exporters
        exp_gdp, exp_col, exp_key = baci._gdp_for_year(GDP, year_key, "Exporter")
        dataj = data.merge(exp_gdp, left_on = "Exporter", right_on = exp_key)
        dataj[exp_col] = dataj[exp_col]/1e+6

        # Importers
        # NOTE(review): unlike the exporter column, importer GDP is NOT divided by
        # 1e6, so the two columns are in different units. This looks unintended,
        # but it is kept because later code may rely on it; confirm and align.
        imp_gdp, _, imp_key = baci._gdp_for_year(GDP, year_key, "Importer")
        data = dataj.merge(imp_gdp, left_on = "Importer", right_on = imp_key)

        data = data.drop(columns = [exp_key, imp_key])

        ROW_COUNT = data.shape[0]
        print("ROW_COUNT add_gdp: ", ROW_COUNT)

        return data

    def add_chapter(self, data):
        '''Placeholder, not implemented yet; returns None.'''
        pass
         

In [47]:

def GDPData():
    '''Read the GDP csv and return it transposed (one row per original column, one column per index label).

    Should always be run; candidate for moving into the baci class.
    '''
    # Source: https://data.worldbank.org/indicator/NY.GDP.MKTP.CD?end=2022&start=1960&view=chart
    # Taiwan comes from IMF data, added by hand. https://www.imf.org/external/datamapper/NGDPD@WEO/OEMDC/ADVEC/WEOWORLD

    descriptive_columns = ['Country Name', 'Indicator Code', 'Indicator Name']

    # the first four lines of the file are skipped; the second column becomes the index
    by_entity = pd.read_csv(GDP_DATA, index_col=[1], skiprows=4)
    return by_entity.drop(columns=descriptive_columns).T
# GDP: the table returned by GDPData(). GDP_sum: per-row total over all of its
# columns, divided by 1e05, in a single column named World_GDP.
# NOTE(review): the total runs over every column of GDP; if the source file
# also contains regional aggregates they are counted on top of the individual
# countries -- confirm. The 1e05 divisor is unexplained; confirm intended unit.
GDP = GDPData()
GDP_sum = GDP.sum(axis=1).div(1e05).to_frame(name="World_GDP")
GDP_sum.index = GDP_sum.index.astype("int")

In [48]:

# #############################################################
# # Create the shared baci instance. This cell must be run before any cell
# # that calls bc1.readindata / bc1.addshortdescriptoProdname / ...
bc1 = baci()
# #############################################################

In [49]:
def BACI_through_time(years=None, out_csv="tmp_gdp_exports2.csv"):
    '''Total export value per exporter and year, over a range of BACI yearly files.

    Parameters
    ----------
    years   : iterable of years to load; defaults to 1995..2022
    out_csv : path the combined table is written to

    Returns
    -------
    DataFrame indexed by Exporter with columns Exports, Year and Exporters
    (Exporters repeats the index as a regular column). The same table is
    written to `out_csv`.

    Uses the module-level `baci_data` folder and the `bc1` instance.
    '''
    if years is None:
        years = np.arange(start=1995, stop=2023)

    allYears = []

    for yr in years:
        print(yr)
        # os.path.join instead of "\BACI..." (an invalid escape in a normal string)
        bacidata = os.path.join(baci_data, "BACI_HS92_Y" + str(yr) + "_V202401b.csv")
        test_data = bc1.readindata(bacidata, verbose = False, tmp_save = False)
        # kept although the description is not used below: this step merges with
        # an inner join, so it decides which rows enter the totals
        test_data = bc1.addshortdescriptoProdname(test_data)

        groupdata = test_data[['Value', 'Exporter']].groupby(['Exporter']).sum()
        groupdata = groupdata.rename(columns={'Value': 'Exports'})
        groupdata['Year'] = yr
        groupdata['Exporters'] = groupdata.index
        # (a bare groupdata.reset_index() used to sit here; its result was
        #  discarded, so Exporter deliberately stays as the index)

        allYears.append(groupdata)

    data1 = pd.concat(allYears, axis=0)

    data1.to_csv(out_csv)

    return data1
    
# Building the yearly export totals re-reads every BACI file, so the result is
# cached on disk and only rebuilt when the cache file is missing.
EXPORTS_CACHE = "tmp_gdp_exports2.csv"
if not os.path.exists(EXPORTS_CACHE):
    BACI_through_time()

data1 = pd.read_csv(EXPORTS_CACHE)
# 'Exporters' duplicates the 'Exporter' column written from the index
data1 = data1.drop(columns=["Exporters"])

tradedata = data1.sort_values(['Exporter', 'Year'], ascending=True)
tradedata


Unnamed: 0,Exporter,Exports,Year
0,ABW,644277.578,1995
213,ABW,780453.092,1996
426,ABW,977245.673,1997
639,ABW,648281.421,1998
852,ABW,891851.146,1999
...,...,...,...
5326,ZWE,4562434.495,2018
5552,ZWE,4080996.343,2019
5778,ZWE,4362891.750,2020
6004,ZWE,7861151.625,2021


In [50]:
def GDP_Long(gdp=None):
    '''Reshape a year-by-country GDP table to long format.

    Parameters
    ----------
    gdp : frame indexed by year with one column per country; defaults to the
          module-level GDP table

    Returns
    -------
    DataFrame with columns Country, Year, GDP and Country_Year
    ("<country>_<year>"), one row per country and year, in melt order (all
    countries of the first year, then the next year, ...). `gdp` is not modified.
    '''
    if gdp is None:
        gdp = GDP

    wide = gdp.T.copy()
    wide['Country'] = wide.index

    GDPT = wide.melt(id_vars=['Country'], var_name='Year')
    # (a sort_values call used to sit here, but its result was discarded; the
    #  rows therefore stay in melt order, which is what downstream cells have seen)
    GDPT = GDPT.rename(columns = {'value': 'GDP'})

    # astype(str) so that integer year labels work as well as text ones
    GDPT['Country_Year'] = GDPT['Country'] + "_" + GDPT['Year'].astype(str)

    return GDPT

# Long-format GDP table (Country, Year, GDP, Country_Year); displayed here and
# joined onto the export totals in the next cell via Country_Year.
GDP1 = GDP_Long()
GDP1


Unnamed: 0,Country,Year,GDP,Country_Year
0,ABW,1960,,ABW_1960
1,AFE,1960,1.847810e+10,AFE_1960
2,AFG,1960,5.377778e+08,AFG_1960
3,AFW,1960,1.041165e+10,AFW_1960
4,AGO,1960,,AGO_1960
...,...,...,...,...
16816,YEM,2022,,YEM_2022
16817,ZAF,2022,4.052710e+11,ZAF_2022
16818,ZMB,2022,2.916378e+10,ZMB_2022
16819,ZWE,2022,2.736663e+10,ZWE_2022


In [51]:
# Join the yearly export totals to GDP on a "<ISO3>_<year>" key.
# The input is rebuilt from data1 instead of reusing `tradedata`, because this
# cell rebinds `tradedata` to its own result; starting from data1 keeps the cell
# re-runnable.
exports_keyed = data1.sort_values(['Exporter', 'Year'], ascending=True)
exports_keyed = exports_keyed.assign(
    Country_Year=exports_keyed['Exporter'] + "_" + exports_keyed['Year'].astype('str')
)

# inner join: exporter/year pairs without a row in GDP1 are dropped
tradedata2 = exports_keyed.merge(GDP1, left_on="Country_Year", right_on="Country_Year")
tradedata2 = tradedata2.drop(columns=['Year_y', 'Exporter'])
tradedata2 = tradedata2.rename(columns={"Year_x": "Year"})

# select by name rather than by position so a changed column order cannot
# silently scramble the table
tradedata = tradedata2[['Country', 'Year', 'Country_Year', 'Exports', 'GDP']]

os.makedirs("data", exist_ok=True)
tradedata.to_csv("data/tradedata_Exports_GDP.csv")
tradedata


Unnamed: 0,Country,Year,Country_Year,Exports,GDP
0,ABW,1995,ABW_1995,644277.578,1.320670e+09
1,ABW,1996,ABW_1996,780453.092,1.379888e+09
2,ABW,1997,ABW_1997,977245.673,1.531844e+09
3,ABW,1998,ABW_1998,648281.421,1.665363e+09
4,ABW,1999,ABW_1999,891851.146,1.722905e+09
...,...,...,...,...,...
5728,ZWE,2018,ZWE_2018,4562434.495,3.415607e+10
5729,ZWE,2019,ZWE_2019,4080996.343,2.183223e+10
5730,ZWE,2020,ZWE_2020,4362891.750,2.150970e+10
5731,ZWE,2021,ZWE_2021,7861151.625,2.837124e+10
