In [1]:
import data_matrix as dm
import numpy as np

In [2]:
from sklearn.externals import joblib
from sklearn.svm import SVR

In [3]:
country_whitelist = [b"USA",b"CHN",b"IDN",b"IND",b"BRA",b"PAK",b"NGA",b"BGD",b"RUS",b"JPN",b"MEX",b"PHL",b"ETH",b"VNM",b"EGY",b"ZAR",b"DEU",b"IRN",b"TUR",b"FRA",b"THA",b"GBR",b"ITA",b"TZA",b"ZAF",b"MMR",b"PRK",b"COL"]
dm.apply_country_whitelist(country_whitelist)

In [4]:
help(SVR)

Help on class SVR in module sklearn.svm.classes:

class SVR(sklearn.svm.base.BaseLibSVM, sklearn.base.RegressorMixin)
 |  Epsilon-Support Vector Regression.
 |  
 |  The free parameters in the model are C and epsilon.
 |  
 |  The implementation is based on libsvm.
 |  
 |  Read more in the :ref:`User Guide <svm_regression>`.
 |  
 |  Parameters
 |  ----------
 |  C : float, optional (default=1.0)
 |      Penalty parameter C of the error term.
 |  
 |  epsilon : float, optional (default=0.1)
 |       Epsilon in the epsilon-SVR model. It specifies the epsilon-tube
 |       within which no penalty is associated in the training loss function
 |       with points predicted within a distance epsilon from the actual
 |       value.
 |  
 |  kernel : string, optional (default='rbf')
 |       Specifies the kernel type to be used in the algorithm.
 |       It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
 |       a callable.
 |       If none is given, 'rbf' will be used. I

In [5]:
def get_model(X,y):
    clf = SVR(kernel='linear')
    clf.fit(X, y)
    return clf

# Simple Example

In [6]:
women_wf = b"SL.TLF.TOTL.FE.ZS"

In [7]:
X,y = dm.get_training_matrix(p_sc=women_wf,cc=b"USA")
svm = get_model(X[:-1],y[:-1])

In [8]:
predicted_slope = svm.predict([X[-1]])
predicted_slope

array([-0.04224654])

In [9]:
# Take the 2013 value and add the predicted slope
prediction = dm.get_feature_value(women_wf, b"USA", 2013) + predicted_slope[0]
actual = dm.get_feature_value(women_wf, b"USA", 2014)
previous_years = [dm.get_feature_value(women_wf, b"USA", y) for y in range(2005,2015)]

In [10]:
print("For the year 2014, i predict {}, but the actual value is {}".format(prediction, actual))
print("Historically, the values have been {}".format(previous_years))

For the year 2014, i predict 45.75041321858408, but the actual value is 45.83927536010742
Historically, the values have been [45.764839, 45.756645, 45.805981, 45.950191, 46.125431, 46.205242, 46.057381, 45.936733, 45.79266, 45.839275]


# Testing a spike

In [11]:
X,y = dm.get_training_matrix(p_sc=women_wf,cc=b"USA")
svm = get_model(X,y)

In [12]:
data_2015, key_2015 = dm.get_xrow(women_wf, b"USA", 2015)

In [13]:
# Add the spike
spike_sc = b"SL.TLF.TOTL.IN"
spike_cc = b"USA"
spike_yr = 2015
spike_amt = 20000000 # purposely very large

In [14]:
print("These are the SC slopes we'll add the spike to")
print(key_2015[(key_2015[:,0] == spike_sc) & (key_2015[:,1] == spike_cc)])
print("These are the current slopes")
print(data_2015[(key_2015[:,0] == spike_sc) & (key_2015[:,1] == spike_cc)])

These are the SC slopes we'll add the spike to
[[b'SL.TLF.TOTL.IN' b'USA' b'2014' b'2013']
 [b'SL.TLF.TOTL.IN' b'USA' b'2014' b'2012']]
These are the current slopes
[ 1197696.  1719312.]


In [15]:
spike_X,key_2015 = dm.get_xrow_with_spike(p_sc=women_wf, p_cc=b"USA", year=spike_yr, spike_sc=spike_sc, spike_cc=spike_cc, spike_amt=spike_amt)

print("These are the new slopes")
print(spike_X[(key_2015[:,0] == spike_sc) & (key_2015[:,1] == spike_cc)])

These are the new slopes
[ 21197696.  21719312.]


In [16]:
# We can use this spike for prediction
print("Predicted slope w/o spike {}, with spike {}".format(svm.predict(data_2015), svm.predict(spike_X)))

Predicted slope w/o spike [-0.04755117], with spike [-0.04755117]




In [17]:
def spike_effect_across_world(p_sc, spike_sc, spike_cc, spike_amt):
    result = {}
    # loop through countries
    for p_cc in country_whitelist:
        # Get the model
        X,y = dm.get_training_matrix(p_sc=p_sc,cc=p_cc)
        svm = get_model(X,y)
        data_2015, key_2015 = dm.get_xrow(p_sc, b"USA", 2015)
        normal_prediction = svm.predict(data_2015)[0]
        spike_X,key_2015 = dm.get_xrow_with_spike(p_sc=p_sc, p_cc=p_cc, year=2015, spike_sc=spike_sc, spike_cc=spike_cc, spike_amt=spike_amt)
        spike_prediction = svm.predict(spike_X)[0]
        result[p_cc] = spike_prediction - normal_prediction
    
    avg = np.sum(result[k] for k in result.keys()) / len(result.keys())
    for k in result.keys():
        result[k] = result[k] / avg
    
    return result

spike_effect_across_world(women_wf, spike_sc, spike_cc, spike_amt)
    
    



{b'BGD': 1.6301019477550582,
 b'BRA': 1.6057814751213868,
 b'CHN': 1.5777558400638276,
 b'COL': 0.52513754772626986,
 b'DEU': 1.0436058695635133,
 b'EGY': 1.0427062320878591,
 b'ETH': 1.0445868147702679,
 b'FRA': 0.66687001925868628,
 b'GBR': 0.47839425709920136,
 b'IDN': 1.5624179840955841,
 b'IND': 1.536512803982836,
 b'IRN': 0.64755126650338479,
 b'ITA': 0.66024207602290663,
 b'JPN': 1.7377099687398345,
 b'MEX': 1.0411743791106201,
 b'MMR': 0.52504604545692901,
 b'NGA': 1.5924559059224326,
 b'PAK': 1.5913690571406598,
 b'PHL': 1.047036490000514,
 b'PRK': 0.52513754772583376,
 b'RUS': 1.6300527912129132,
 b'THA': 0.47751352434005823,
 b'TUR': 0.64755126650901051,
 b'TZA': 0.53769933492287048,
 b'USA': 9.8369051657603786e-11,
 b'VNM': 1.0442923154085333,
 b'ZAF': 0.53767086232395089,
 b'ZAR': 1.0436263770366898}

In [19]:
open("svme.py", 'w').write("\n#-------\n".join(In[1:18]))

2985