In [1]:
%load_ext lab_black

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [3]:
df = pd.read_csv("../Resources/full_data.csv").assign(
    country_id=lambda df: df["country_id"].replace(
        {
            1: "United Kingdom",
            2: "Austria",
            3: "Belgium",
            4: "Denmark",
            5: "Finland",
            6: "France",
            7: "Germany",
            8: "Ireland",
            9: "Italy",
            10: "Luxembourg",
            11: "Netherlands",
            12: "Norway",
            13: "Portugal",
            14: "Russia",
            15: "Spain",
            16: "Sweden",
            17: "Switzerland",
            18: "Greece",
            19: "Hungary",
            20: "Turkey",
            21: "Canada",
            22: "Australia",
            23: "New Zealand",
            24: "Japan",
            25: "China",
            26: "Hong Kong",
            27: "India",
            28: "Malaysia",
            29: "Philippines",
            30: "Singapore",
            31: "South Korea",
            32: "Israel",
            33: "Indonesia",
            34: "Pakistan",
            35: "Thailand",
            36: "Kuwait",
            37: "United Arab Emirates",
            38: "Argentina",
            39: "Brazil",
            40: "Chile",
            41: "Mexico",
            42: "Panama",
            43: "Venezuela",
            44: "South Africa",
            45: "Liberia",
        }
    ),
    region_id=lambda df: df["region_id"].replace(
        {1: "Europe", 2: "Canada & Pacific", 3: "Asia", 4: "Latin America", 5: "Africa"}
    ),
)

In [4]:
df

Unnamed: 0,year,year_id,countries,country_id,regions,region_id,western_emerging,west_emerge_id,capital,latitude,...,fdi_in_usa_million,globalization_100,gdp_per_capita_usd,interaction_gdp,fdi_by_usa_million,interaction_us_fdi,stock_market_capitalization_gdp,interaction_stock_mkt,government_effectiveness,rule_of_law
0,1982,1,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,28447.0,78,9146.077357,713394.03380,27537.0,2147886.0,38.10340,2972.06520,1.88,1.63
1,1983,2,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,32152.0,78,8691.518813,677938.46740,28086.0,2190708.0,46.12510,3597.75780,1.88,1.63
2,1984,3,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,38387.0,78,8179.194441,637977.16640,29265.0,2282670.0,51.22640,3995.65920,1.88,1.63
3,1985,4,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,43555.0,78,8652.216542,674872.89030,34066.0,2657148.0,72.24320,5634.96960,1.88,1.63
4,1986,5,United Kingdom,United Kingdom,Europe,Europe,Western,1,London,51.5085,...,55935.0,78,10611.112210,827666.75240,36974.0,2883972.0,78.62570,6132.80460,1.88,1.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1705,2015,34,Liberia,Liberia,Africa,Africa,Emerging,2,Monrovia,6.3005,...,501.0,47,710.383858,33388.04131,1006.0,47282.0,0.42386,19.92142,-1.36,-0.86
1706,2016,35,Liberia,Liberia,Africa,Africa,Emerging,2,Monrovia,6.3005,...,479.0,47,714.623392,33587.29943,756.0,35532.0,0.42386,19.92142,-1.28,-0.95
1707,2017,36,Liberia,Liberia,Africa,Africa,Emerging,2,Monrovia,6.3005,...,457.0,48,698.701764,33537.68469,598.0,28704.0,0.42386,20.34528,-1.34,-0.95
1708,2018,37,Liberia,Liberia,Africa,Africa,Emerging,2,Monrovia,6.3005,...,466.0,48,677.322179,32511.46457,171.0,8208.0,0.42386,20.34528,-1.34,-0.99


In [5]:
target = "fdi_in_usa_million"

In [6]:
X = df.drop(
    [
        "year_id",
        "countries",
        "regions",
        "western_emerging",
        "west_emerge_id",
        "capital",
        "latitude",
        "longitude",
        "fdi_in_usa_million",
        "globalization_100",
        "gdp_per_capita_usd",
        "interaction_gdp",
        "fdi_by_usa_million",
        "interaction_us_fdi",
        "stock_market_capitalization_gdp",
        "interaction_stock_mkt",
        "government_effectiveness ",
        "rule_of_law",
    ],
    axis=1,
)
y = df[target].values

In [7]:
X.columns

Index(['year', 'country_id', 'region_id'], dtype='object')

In [8]:
cf = ColumnTransformer(
    [
        ("numerical", "passthrough", ["year"]),
        ("categorical", OneHotEncoder(drop="first"), ["country_id", "region_id"]),
    ]
)

In [9]:
lr_pipeline = make_pipeline(cf, LinearRegression())
interaction__lr_pipeline = make_pipeline(
    cf, PolynomialFeatures(interaction_only=True), LinearRegression()
)
Lasso_pipeline = make_pipeline(cf, Lasso(max_iter=2000))
decision_tree_pipeline = make_pipeline(cf, DecisionTreeRegressor())
random_forest_pipeline = make_pipeline(cf, RandomForestRegressor())

In [10]:
k_fold = KFold(n_splits=6, shuffle=True, random_state=42)

In [11]:
cross_validate(lr_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold)[
    "test_score"
].mean()

-2436794175.4576497

In [12]:
cross_validate(
    interaction__lr_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold
)["test_score"].mean()

-454227662.8275923

In [13]:
cross_validate(Lasso_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold)[
    "test_score"
].mean()

  max_iter, tol, rng, random, positive)


-2436645593.4316287

In [14]:
cross_validate(
    decision_tree_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold
)["test_score"].mean()

-158113544.12661424

In [15]:
cross_validate(
    random_forest_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold
)["test_score"].mean()

-107916801.90362944

In [16]:
random_forest_pipeline.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('numerical', 'passthrough',
                                                  ['year']),
                                                 ('categorical',
                                                  OneHotEncoder(drop='first'),
                                                  ['country_id',
                                                   'region_id'])])),
                ('randomforestregressor', RandomForestRegressor())])

In [26]:
random_forest_pipeline.predict(
    pd.DataFrame({"year": [2019], "country_id": ["Greece"], "region_id": ["Europe"],})
)

array([965.])

In [27]:
random_forest_pipeline.predict(
    pd.DataFrame(
        {"year": [2019], "country_id": ["United Kingdom"], "region_id": ["Europe"],}
    )
)

array([501796.11])

In [28]:
r_sq_rf = random_forest_pipeline.score(X, y)
r_sq_rf

0.9982175693706118

In [29]:
1 / (1 - r_sq_rf)

561.031651674003

In [18]:
# rf -111542843.17663448
# Boxplot for outliers
# Multicollinearity of Xs
# AOC Curve
# OLS
# SHAP value and graph

In [19]:
# Model running and testing 1: Linear Multiple Regression (number of jobs)
# Model running and testing 2: Linear Multiple Regression with Interaction (number of jobs)
# Model running and testing 3: Decision Tree (parameters)
# Model running and testing 4: Random Forest (parameters)

In [20]:
# Best model with full data 1982-2019
# Try best model with lagged data by 1 yr
# Try best model with lagged data by 2 yr avg