In [1]:
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [2]:
rawData = pd.read_csv("WEOOct2021all.csv")
rawData

Unnamed: 0,WEO Country Code,ISO,WEO Subject Code,Country,Subject Descriptor,Subject Notes,Units,Scale,Country/Series-specific Notes,1980,...,2018,2019,2020,2021,2022,2023,2024,2025,2026,Estimates Start After
0,512,AFG,NGDP_R,Afghanistan,"Gross domestic product, constant prices",Expressed in billions of national currency uni...,National currency,Billions,Source: National Statistics Office Latest actu...,,...,1270.22,1319.90,1288.87,,,,,,,2019.0
1,512,AFG,NGDP_RPCH,Afghanistan,"Gross domestic product, constant prices",Annual percentages of constant price GDP are y...,Percent change,,"See notes for: Gross domestic product, consta...",,...,1.189,3.912,-2.351,,,,,,,2019.0
2,512,AFG,NGDP,Afghanistan,"Gross domestic product, current prices",Expressed in billions of national currency uni...,National currency,Billions,Source: National Statistics Office Latest actu...,,...,1327.69,1469.60,1547.29,,,,,,,2019.0
3,512,AFG,NGDPD,Afghanistan,"Gross domestic product, current prices",Values are based upon GDP in national currency...,U.S. dollars,Billions,"See notes for: Gross domestic product, curren...",,...,18.401,18.876,20.136,,,,,,,2019.0
4,512,AFG,PPPGDP,Afghanistan,"Gross domestic product, current prices",These data form the basis for the country weig...,Purchasing power parity; international dollars,Billions,"See notes for: Gross domestic product, curren...",,...,77.406,81.873,80.912,,,,,,,2019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8815,698,ZWE,GGXWDG,Zimbabwe,General government gross debt,Gross debt consists of all liabilities that re...,National currency,Billions,Source: Ministry of Finance or Treasury Latest...,,...,14.459,181.867,968.863,1187.21,1615.81,1924.61,2169.96,2357.23,2518.86,2019.0
8816,698,ZWE,GGXWDG_NGDP,Zimbabwe,General government gross debt,Gross debt consists of all liabilities that re...,Percent of GDP,,See notes for: General government gross debt ...,,...,61.486,113.923,86.147,54.013,60.311,62.027,62.24,60.688,59.609,2019.0
8817,698,ZWE,NGDP_FY,Zimbabwe,Gross domestic product corresponding to fiscal...,Gross domestic product corresponding to fiscal...,National currency,Billions,Source: Ministry of Finance or Treasury Latest...,,...,23.516,159.641,1124.67,2198.00,2679.15,3102.88,3486.45,3884.16,4225.61,2019.0
8818,698,ZWE,BCA,Zimbabwe,Current account balance,Current account is all transactions other than...,U.S. dollars,Billions,Source: Reserve Bank of Zimbabwe and Ministry ...,-0.301,...,-1.38,0.92,1.275,1.262,1.06,0.621,0.148,-0.247,-0.683,2019.0


In [3]:
# Compile dict of WEO Subject Code to description
codesDict = {}
codes = rawData[["WEO Subject Code","Subject Descriptor","Units"]]
for weocode in pd.unique(codes["WEO Subject Code"])[:-1]:
    codesDict[weocode] = codes[codes.values == weocode]['Subject Descriptor'].iloc[0] + \
        " (" + codes[codes.values == weocode]['Units'].iloc[0] + ")"

In [4]:
startYear = 1980
endYear = 2026

# Create fresh copy of data
data = rawData.copy()

# Remove NaN rows
data = data[~data['Country'].isna()]

# Remove unneeded columns
data = data.drop(columns=["WEO Country Code", "ISO", "Subject Descriptor", "Subject Notes", "Units", "Scale", "Country/Series-specific Notes"])

# Get unique values of Country and WEO Subject Codes
countries = data["Country"].unique()
codes = list(codesDict.keys())
# Find where the estimates start for each country
estimatesStartAfter = {}
for country in countries:
    estimatesStartAfter[country] = np.int_(data.loc[rawData["Country"] == country]["Estimates Start After"].min())
    
# Remove predicted data
for country in countries:
    for year in range(estimatesStartAfter[country] + 1, endYear + 1):
        data.loc[data["Country"] == country, str(year)] = np.nan

# Remove Estimates start after column
data = data.drop(columns="Estimates Start After")

# Remove any columns with all NaN
numRows = len(data)
dropColumns = []
for column in range(startYear, endYear + 1):
    if data[str(column)].isna().sum() == numRows:
        dropColumns.append(str(column))
data = data.drop(columns=dropColumns)
endYear -= len(dropColumns)

# Turn float data into float64 dtype
yearColumns = [str(x) for x in range(startYear, endYear + 1)]
data[yearColumns] = data[yearColumns].replace(',','', regex=True)
data[yearColumns] = data[yearColumns].replace('(-)+','', regex=True)
data[yearColumns] = data[yearColumns].replace(r'^\s*$', np.nan, regex=True)
data[yearColumns] = data[yearColumns].astype('float64')

data

Unnamed: 0,WEO Subject Code,Country,1980,1981,1982,1983,1984,1985,1986,1987,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,NGDP_R,Afghanistan,,,,,,,,,...,958.266,1092.120,1154.180,1185.310,1197.010,1222.920,1255.290,1270.220,1319.900,
1,NGDP_RPCH,Afghanistan,,,,,,,,,...,6.479,13.968,5.683,2.697,0.988,2.164,2.647,1.189,3.912,
2,NGDP,Afghanistan,,,,,,,,,...,836.222,1033.590,1116.830,1183.040,1226.570,1222.920,1285.460,1327.690,1469.600,
3,NGDPD,Afghanistan,,,,,,,,,...,17.890,20.293,20.170,20.616,20.057,18.020,18.883,18.401,18.876,
4,PPPGDP,Afghanistan,,,,,,,,,...,50.334,59.945,63.784,69.444,72.056,70.098,74.712,77.406,81.873,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8815,GGXWDG,Zimbabwe,,,,,,,,,...,6.044,6.549,7.053,8.230,9.477,10.089,11.998,,,
8816,GGXWDG_NGDP,Zimbabwe,,,,,,,,,...,42.858,38.265,36.944,42.211,47.473,49.096,54.433,,,
8817,NGDP_FY,Zimbabwe,,,,,,,,,...,14.102,17.115,19.091,19.496,19.963,20.549,22.041,,,
8818,BCA,Zimbabwe,0.301,0.674,0.748,0.504,0.171,0.153,0.051,0.0,...,2.750,2.278,2.649,2.334,1.597,0.697,0.271,,,


In [5]:
def data_subset(data, countries, codes):
    return data[(data["Country"].isin(countries)) & (data["WEO Subject Code"].isin(codes))]

In [6]:
def normalize(data):
    normalized_data = data.copy()
        
    codes = data["WEO Subject Code"].unique()
    
    yearColumns = data.columns[2:]
    
    for code in codes:
        mean = data[data["WEO Subject Code"] == code][yearColumns].stack().mean()
        std = data[data["WEO Subject Code"] == code][yearColumns].stack().std()
        
        normalized_data.loc[normalized_data["WEO Subject Code"] == code, yearColumns] = \
                data.loc[normalized_data["WEO Subject Code"] == code,yearColumns].applymap(
                    lambda x : (x - mean) / std)    
    return normalized_data

In [7]:
def data_to_time_series(data, width, startYear, endYear):
    
    countries = data["Country"].unique()
    codes = data["WEO Subject Code"].unique()

    columns = ["Country", "End Year"]
    for code in codes:
        for year in range(width):
            columns.append(code + "-" + str(year))
    
    numRows = len(countries) * ((endYear - startYear) - width + 2)
    numCols = len(columns)
    
    time_series = pd.DataFrame(data=np.empty((numRows, len(columns))),columns=columns)
    
    index = 0
    for country in countries:
        for endYear in range(startYear + width - 1, endYear + 1):
            newRow = np.empty(numCols, dtype=np.object)
            newRow[0:2] = [country,endYear]
            
            yearColumns = [str(x) for x in range(endYear - width + 1, endYear + 1)]
            startSlice = 2
            endSlice = startSlice + width
            for code in codes:
                test = data[(data["Country"] == country) & (data["WEO Subject Code"] == code)][yearColumns]
                if test.shape == (1, width):
                    newRow[startSlice:endSlice] = test
                startSlice += width
                endSlice += width            
            time_series.iloc[index] = newRow
            index += 1
    
    time_series["Country"] = time_series["Country"].astype(str)
    time_series["End Year"] = time_series["End Year"].astype("int64")
                
    return time_series

In [8]:
def feature_label_split_time_series(timeSeries, featureWidth=9, labelWidth=1, 
                                    featureParams=None, labelParams=None):    
    # Determine which columns of the timeSeries are features and which are labels
    featureColumns = []
    for code in featureParams:
        for year in range(featureWidth):
            featureColumns.append(code + '-' + str(year))
    
    labelColumns = []
    for code in labelParams:
        for year in range(featureWidth, featureWidth + labelWidth):
            labelColumns.append(code + '-' + str(year))
    
    timeSeriesTrimmed = timeSeries[featureColumns + labelColumns].dropna()
    
    features = pd.DataFrame(
            data=timeSeriesTrimmed[featureColumns],
            columns=featureColumns)
    
    labels = pd.DataFrame(
            data=timeSeriesTrimmed[labelColumns],
            columns=labelColumns)
    
    return features, labels
    

In [9]:
def time_series_to_tf_dataset(features, labels, featureWidth, labelWidth):
    # Turn each dataframe into their ndarray equivalents
    featureArray = features.values
    labelArray = labels.values
    
    numEntries = len(featureArray)
    
    # Reshape from (Entry, feature/time) -> (Entry, time, feature)
    featureArray = \
            np.array([ 
                [ 
                    [ entry[i * featureWidth + time ] for i in range(len(entry) // featureWidth)]
                for time in range(featureWidth)]
            for entry in featureArray ])
    
    labelArray = \
            np.array([ 
                [ 
                    [ entry[i * labelWidth + time ] for i in range(len(entry) // labelWidth)]
                for time in range(labelWidth)]
            for entry in labelArray ])
    
    featureDataset = tf.data.Dataset.from_tensor_slices(featureArray)
    labelDataset = tf.data.Dataset.from_tensor_slices(labelArray)
    return tf.data.Dataset.zip((featureDataset, labelDataset))

In [10]:
def train_validate_test_split(dataset, trainSplit=0.8, valSplit=0.1, testSplit=0.1, shuffle=True):
    assert (trainSplit + valSplit + testSplit) == 1
    shuffleSize=10000
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        dataset = dataset.shuffle(shuffleSize, seed=0)
    
    dsSize = len(dataset)
    
    trainSize = int(trainSplit * dsSize)
    valSize = int(valSplit * dsSize)
    
    trainDataset = dataset.take(trainSize).batch(32)
    valDataset = dataset.skip(trainSize).take(valSize).batch(32)
    testDataset = dataset.skip(trainSize).skip(valSize).batch(32)
    
    return trainDataset, valDataset, testDataset

In [11]:
#data = data_subset(data, ["United States", "Afghanistan"], ["PPPGDP"])

In [12]:
# Constants
featureWidth = 9
labelWidth = 1
width = featureWidth + labelWidth
featureParams = ["PPPGDP"]
labelParams = ["PPPGDP"]
trainSplit=0.8
valSplit=0.1
testSplit=0.1
shuffle=True

# Created a normalized version of the data
normalizedData = normalize(data)

# Turn both data and normalized_data into time series
timeSeries = data_to_time_series(data, width, startYear, endYear)
normalizedTimeSeries = data_to_time_series(normalizedData, width, startYear, endYear)

# Separate features and labels on both time series. Also removes any semiempty rows
features, labels = feature_label_split_time_series(
        timeSeries, featureWidth, labelWidth, featureParams, labelParams)
normalizedFeatures, normalizedLabels = feature_label_split_time_series(
        normalizedTimeSeries, featureWidth, labelWidth, featureParams, labelParams)

# Turn the normalizedFeatures and labels into a tensorflow dataset
dataset = time_series_to_tf_dataset(normalizedFeatures, labels, featureWidth, labelWidth)

# Split the dataset into trainDataset, valDataset, and testDataset
trainDataset, valDataset, testDataset = \
        train_validate_test_split(dataset, trainSplit, valSplit, testSplit, shuffle)

In [13]:
MAX_EPOCHS = 20

def compile_and_fit(model, train_ds, val_ds, patience=2):
  early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    patience=patience,
                                                    mode='min')

  model.compile(loss=tf.losses.MeanSquaredError(),
                optimizer=tf.optimizers.Adam(),
                metrics=[tf.metrics.MeanAbsoluteError()])

  history = model.fit(train_ds, epochs=MAX_EPOCHS,
                      validation_data=val_ds,
                      callbacks=[early_stopping])
  return history

In [14]:
linear = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=1),
    tf.keras.layers.Reshape([1, -1])
])

history = compile_and_fit(linear, trainDataset, valDataset)

print(linear.evaluate(valDataset))
print(linear.evaluate(testDataset, verbose=0))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
[2194137.5, 439.66766357421875]
[3398833.5, 488.4991760253906]


In [15]:
dense = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1),
    tf.keras.layers.Reshape([1, -1])
])

history = compile_and_fit(dense, trainDataset, valDataset)

print(dense.evaluate(valDataset))
print(dense.evaluate(testDataset, verbose=0))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
[49153.53125, 51.500125885009766]
[10602.6884765625, 34.69462966918945]


In [16]:
cnnTest = tf.keras.Sequential([
    tf.keras.layers.Conv1D(
            filters=32,
            kernel_size=(9,),
           activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1),
])

history = compile_and_fit(cnnTest, trainDataset, valDataset)

print(cnnTest.evaluate(valDataset))
print(cnnTest.evaluate(testDataset, verbose=0))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
[73858.984375, 79.92478942871094]
[31445.1015625, 65.10983276367188]


In [17]:
print(tf.__version__)

import tensorflow.python.platform.build_info as build
print(build.build_info['cuda_version'])
print(build.build_info['cudnn_version'])


2.7.0
11.2
8
