Same as 0.4 (which used linear regression), but here using a Random Forest.

In [None]:
import pandas as pd
import os
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import time

## Read data

In [None]:
# Star-import the project helpers; load_raw and imply_columns are presumably
# provided by src.features -- TODO confirm.
from src.features import *
# Raw table; it carries an 'Unnamed: 0' id column that becomes the index below.
train = load_raw()
# Column groups of the raw table; indexed below with the keys 'features' and 'meta'.
cols = imply_columns(train)

In [None]:
# The row id must be unique because it is used as the DataFrame index next.
assert not train['Unnamed: 0'].duplicated().any()

In [None]:
# Index rows by their id so they can be looked up with the submission file's ids.
train = train.set_index('Unnamed: 0')

In [None]:
# Quick look at the table size and the available columns.
train.shape, train.columns

In [None]:
# Total number of rows vs. number of rows with fewer than 5 missing feature values.
missing_per_row = train[cols['features']].isnull().sum(axis=1)
train.shape[0], int((missing_per_row < 5).sum())
# scratch lines kept from exploration:
# pd.isnull(train[cols['features']]).sum(axis=1).sort_values(ascending=True).head(n=20)
# train.iloc[0,'1972 [YR1972]']
# pd.isnull(train.loc[0,'1972 [YR1972]'])
# train.head()
# train.loc[131876]

## read submission file

In [None]:
# Submission template, indexed by the 'Unnamed: 0' row id (the same id that
# indexes the training table, see the train.loc[df_submit.index] lookups below).
df_submit = pd.read_csv('data/raw/submission.csv').set_index('Unnamed: 0')
df_submit.head()

In [None]:
# Number of distinct countries covered by the submission rows.
len(set(train.loc[df_submit.index]['Country Name']))

In [None]:
# Distinct series codes that appear in the submission rows.
set(train.loc[df_submit.index]['Series Code'])

In [None]:
# Unique submission row ids in ascending order; preview the first ten.
submit_inds = sorted(set(df_submit.index))
submit_inds[:10]

## split out meta

In [None]:
# Split the descriptive (meta) columns off the training table.
# cols['meta'][0] is skipped -- presumably the row id that already serves as
# the index (TODO confirm against imply_columns).
meta_cols = list(cols['meta'][1:])
# Take an explicit copy: df_meta gets a column assigned later on, and writing
# into a selection of `train` triggers pandas' SettingWithCopyWarning.
df_meta = train[meta_cols].copy()
# Remove the meta columns from `train` in one call instead of one `del` each.
train = train.drop(columns=meta_cols)
df_meta.head()

In [None]:
# Store an integer code per country in "Country Code"; pd.factorize numbers the
# country names in order of first appearance.
df_meta.loc[:,"Country Code"] = pd.factorize(df_meta["Country Name"])[0]
df_meta.head(n=2)

## preprocess data to backfill nan

In [None]:
print(time.ctime(), 'start')
train2 = train.copy()
print(train2.shape)
# Transpose so that each row is one year-labelled column of `train`, sorted
# ascending; filling then runs along time within every series.
train2 = train2.transpose().sort_index(ascending=True)
# DataFrame.fillna(method=...) is deprecated in recent pandas; bfill()/ffill()
# are the equivalent dedicated methods.
train2 = train2.bfill()
train2 = train2.ffill() # sort of cheating, but better than just 0
print(time.ctime(), 'end')

In [None]:
# Row labels start with the 4-digit year (e.g. '1972 [YR1972]'); keep it as an int.
train2.index = [int(x[:4]) for x in train2.index]

In [None]:
# plot again just to confirm that wide-to-long worked
country = 'Kenya' # 'Poland'
series = 'Net taxes on products (current LCU)'
subtrain = df_meta[(df_meta['Country Name']==country) & (df_meta['Series Name']==series)]
assert subtrain.shape[0]==1
subtrain = subtrain.index[0]
subtrain = train2[subtrain]
subtrain.head()

In [None]:
# Plot the selected series over the years, titled with country and series name.
plt.plot(subtrain)
plt.title(f"{country}: {series}")
plt.show()

In [None]:
# After back-/forward-filling no missing value may be left anywhere.
n_nan = train2.isnull().sum().sum()
assert n_nan == 0

In [None]:
# Round-trip through a column to give the year index the name 'year'
# (reset_index calls the column 'index' only while the index is unnamed).
train2 = train2.reset_index().rename(columns={'index': 'year'}).set_index('year')
train2.head()

## unpivot table

In [None]:
# Wide -> long: melt to one row per (year, column) with its value, order by
# year, then attach country name and series code through the row id.
meta_keys = df_meta.reset_index()[['Unnamed: 0', 'Country Name', 'Series Code']]
long_values = train2.reset_index().melt(id_vars=['year']).sort_values(['year'])
train2 = long_values.merge(meta_keys, on='Unnamed: 0', how='left')

In [None]:
# Preview the long-format table.
train2.head()

## pivot on the series code

In [None]:
print(time.ctime(), 1.1)
train3 = train2.copy()
print(time.ctime(), 1.2)


# Earlier attempts with a combined year-country key, kept for reference:
# slow # train3['year-country'] = train3.apply(lambda row: "%s-%s"%(row['year'], row['Country Name']), axis=1)
# train3['year-country'] = train3['year']*1000000 + train3['Country Code']
# print(time.ctime(), 2)
# train3 = train3.pivot(index='year-country', columns='Series Code', values='value')

# One row per (year, country), one column per series code.
train3 = train3.pivot_table(
    values='value',
    index=['year', 'Country Name'],
    columns='Series Code',
)
print(time.ctime(), 3)

# Cells without data are set to 0 (chosen for no specific reason).
train3 = train3.fillna(0)
print(time.ctime(), 4)
train3.head()

In [None]:
# (number of year/country rows, number of series-code columns)
train3.shape

In [None]:
# Series codes requested by the submission file. sorted() makes the column
# order deterministic: iterating a set of strings can differ between runs,
# which would change the target column order from one run to the next.
cols_target = sorted(set(df_meta.loc[df_submit.index, 'Series Code']))
cols_target

In [None]:
# One-year-ahead setup: targets are the submission series from 1973 onwards,
# features are all series up to 2006. The two frames are paired row by row
# when fitting below (assumes the data run 1972-2007 -- TODO confirm).
target3 = train3[cols_target].loc[1973:]
features3 = train3.loc[:2006]

In [None]:
# First rows of the target frame.
target3.head(n=2)

In [None]:
# Last rows of the feature frame.
features3.tail(n=2)

In [None]:
# Values of series '1.2' for the year 1972, one point per country.
series_1972 = train3['1.2'].loc[1972]
plt.plot(series_1972, '.')
plt.show()

## <strike>multi-variate OLS</strike> Random Forest

In [None]:
from sklearn import ensemble


# Create the random forest regressor (100 trees, all CPU cores, verbose logging)
regr = ensemble.RandomForestRegressor(n_estimators=100, min_samples_split=4, verbose=2, n_jobs=-1)

# Train the model: features of one year against the targets of the following year
print('fit', features3.shape, target3.shape)
print(time.ctime(), 'start')
regr.fit(features3, target3)
print(time.ctime(), 'done')

In [None]:
# check most important features
f_imp = regr.feature_importances_.argsort()
# regr.feature_importances_[f_imp[-20:]]
features3.columns[f_imp[-10:]]

In [None]:
# make feature importances into dataframe for saving
df_importances = (
    pd.DataFrame({'imp': regr.feature_importances_, 'feat': features3.columns})
      .sort_values('imp', ascending=False)
)
df_importances.head()

In [None]:
# Save the importance table to disk.
df_importances.to_pickle('data/interim/rf_feature_importances.pkl')

## prediction 2008

- 2008 is straight-forward
- ~~2012 is recursive prediction since 2008~~

In [None]:
def year2col(x):
    """Return the column label used for year ``x``, e.g. '2008 [YR2008]'."""
    return f"{x:.0f} [YR{x:.0f}]"


year2col(2008), year2col(2012)

In [None]:
# Submission frame before any prediction is written into it.
df_submit.head()

In [None]:
def get_prediction(df_in, regr, df_target):
    """Predict with ``regr`` and shape the result for the submission file.

    Relies on the notebook globals ``df_meta`` (row id -> country / series
    code) and ``df_submit`` (submission frame indexed by row id).

    Parameters
    ----------
    df_in
        Feature rows passed to ``regr.predict``. The predictions are labelled
        with the countries of ``df_target.loc[2007]``, so the rows must be in
        that same country order.
    regr
        Fitted estimator exposing ``predict``.
    df_target
        Target frame indexed by (year, country); supplies the output column
        names and the country labels of year 2007.

    Returns
    -------
    tuple
        ``(df_pred1, df_pred2)``: ``df_pred1`` is the country x series-code
        table of predictions; ``df_pred2`` is the long table aligned to
        ``df_submit.index`` with the prediction in column ``'value'`` (NaN for
        submission ids that have no prediction).
    """
    predicted = regr.predict(df_in)
    countries = df_target.loc[2007].index.values
    wide_pred = pd.DataFrame(predicted, columns=df_target.columns, index=countries)
    long_pred = (
        wide_pred.reset_index()
        .rename(columns={'index': 'Country Name'})
        .melt(id_vars=['Country Name'])
    )

    # Attach the row id of every (country, series code) pair.
    meta_keys = df_meta.reset_index()[['Unnamed: 0', 'Country Name', 'Series Code']]
    df_pred2 = long_pred.merge(meta_keys, on=['Country Name', 'Series Code'], how='left')
    # Pairs without a row id in df_meta are dropped.
    df_pred2 = df_pred2[df_pred2['Unnamed: 0'].notna()]
    df_pred2 = df_pred2.set_index('Unnamed: 0')
    # reindex instead of .loc[...]: .loc raises KeyError on current pandas when
    # a submission id has no prediction, whereas reindex yields a NaN row.
    df_pred2 = df_pred2.reindex(df_submit.index)

    df_pred1 = long_pred.pivot_table(index='Country Name', columns='Series Code', values='value')
    return df_pred1, df_pred2
    


In [None]:
# Predict 2008: feed the rows from 2007 onwards to the one-year-ahead forest
# and write the values into the 2008 submission column.
df_pred1, df_pred2 = get_prediction(train3.loc[2007:], regr, target3)
df_submit[year2col(2008)] = df_pred2['value']
df_submit.head()

## Repeat for 2012

- 5-year prediction for 2012 from data till 2007
- horizontally stack the features of years T-1, T-2, ..., T-5 (the lag T-5 is unrelated to the 5-year forecast horizon from 2007 to 2012)
- use the feature importances from the 2008 RF regression and choose top 10 only

In [None]:
# Keep only the 10 features with the largest importance in the forest above.
top_cols = features3.columns[f_imp[-10:]]
# top_cols = features3.columns # no filtering for top feature importances

In [None]:
def do_pre(prefix):
    """Return a rename map ``{column: '<prefix>_<column>'}`` for every train3 column."""
    return {name: f"{prefix}_{name}" for name in train3.columns}

def get_features(n_pred):
    """Build the training features for an ``n_pred``-year-ahead model.

    Takes the ``top_cols`` columns of ``train3`` for the years
    ``1972+5-1 .. 2007-n_pred``, prefixes the column names with ``t1_`` and
    drops the (year, country) index so that rows are matched by position.
    """
    first_year = 1972 + 5 - 1
    last_year = 2007 - 0 - n_pred
    lag1 = (
        train3[top_cols]
        .loc[first_year:last_year]
        .rename(columns=do_pre('t1'))
        .reset_index(drop=True)
    )
    # Further lags are currently disabled:
    #train3[top_cols].loc[1972+5-5:2007-4-n_pred].rename(columns=do_pre('t5')).reset_index(drop=True),
    #train3[top_cols].loc[1972+5-4:2007-3-n_pred].rename(columns=do_pre('t4')).reset_index(drop=True),
    #train3[top_cols].loc[1972+5-3:2007-2-n_pred].rename(columns=do_pre('t3')).reset_index(drop=True),
    #train3[top_cols].loc[1972+5-2:2007-1-n_pred].rename(columns=do_pre('t2')).reset_index(drop=True),
    return pd.concat([lag1], axis=1)

def get_feat_targ(n_pred):
    """Return ``(features, target)`` for an ``n_pred``-year-ahead model.

    The target window is the feature window of ``get_features`` shifted
    forward by ``n_pred`` years; the shapes of train3 and both frames are printed.
    """
    feats = get_features(n_pred=n_pred)
    first_year = 1972 + 5 - 1 + n_pred
    last_year = 2007 - 0 + n_pred
    targ = train3[cols_target].loc[first_year:last_year]

    print(train3.shape, feats.shape, targ.shape)
    return feats, targ

In [None]:
# Training pair for the 5-year-ahead model.
features4, target4 = get_feat_targ(n_pred=5)

In [None]:
# Random forest for the 5-year-ahead prediction (same settings as the first forest)
regr2012 = ensemble.RandomForestRegressor(n_estimators=100, min_samples_split=4, verbose=2, n_jobs=-1)

# Train the model using the training sets
print(time.ctime(), 'start')
regr2012.fit(features4, target4)
print(time.ctime(), 'done')

In [None]:
# Predict 2012.
# regr2012 was fitted on (features of year T) -> (targets of year T+5), so a
# forecast for 2012 needs the features of 2007. Feeding the 2002 rows
# (2007-5) would only reproduce the in-sample fit for 2007.
features42 = pd.concat([
        #train3[top_cols].loc[2007-4].rename(columns=do_pre('t5')).reset_index(drop=True),
        #train3[top_cols].loc[2007-3].rename(columns=do_pre('t4')).reset_index(drop=True),
        #train3[top_cols].loc[2007-2].rename(columns=do_pre('t3')).reset_index(drop=True),
        #train3[top_cols].loc[2007-1].rename(columns=do_pre('t2')).reset_index(drop=True),
        train3[top_cols].loc[2007].rename(columns=do_pre('t1')).reset_index(drop=True)
    ],
    axis=1
)

print(features4.shape, features42.shape)

df_pred1, df_pred2 = get_prediction(features42, regr2012, target4)
df_pred1.shape, df_pred2.shape

In [None]:
#df_submit[year2col(2012)+' (old)'] = df_submit[year2col(2012)]
# Write the 5-year-ahead predictions into the 2012 submission column.
df_submit[year2col(2012)] = df_pred2['value']
df_submit.head()

In [None]:
# feature importances when using the top feature importances from the 2008 RF

In [None]:
# check feature importances: the ten largest values of the 2012 model together
# with the matching feature names.
# The same when the original features are filtered for the top feature importances from
# 2008 regression 1-year prediction with 1-year back
imp2012 = regr2012.feature_importances_
top10 = imp2012.argsort()[-10:]
imp2012[top10], features4.columns[top10]

## Repeat 2008 prediction using 5-y history

In [None]:
# Training pair for the 1-year-ahead model on the top features.
features5, target5 = get_feat_targ(n_pred=1)

In [None]:
# Random forest for the 1-year-ahead prediction on the top features
regr2008 = ensemble.RandomForestRegressor(n_estimators=100, min_samples_split=4, verbose=2, n_jobs=-1)

print(time.ctime(), 'start')
regr2008.fit(features5, target5)
print(time.ctime(), 'done')

In [None]:
# Predict 2008.
# regr2008 was fitted on (features of year T) -> (targets of year T+1), so a
# forecast for 2008 needs the features of 2007. Feeding the 2006 rows
# (2007-n_pred) would only reproduce the in-sample fit for 2007.
n_pred = 1
features52 = pd.concat([
        #train3[top_cols].loc[2007-4].rename(columns=do_pre('t5')).reset_index(drop=True),
        #train3[top_cols].loc[2007-3].rename(columns=do_pre('t4')).reset_index(drop=True),
        #train3[top_cols].loc[2007-2].rename(columns=do_pre('t3')).reset_index(drop=True),
        #train3[top_cols].loc[2007-1].rename(columns=do_pre('t2')).reset_index(drop=True),
        train3[top_cols].loc[2007].rename(columns=do_pre('t1')).reset_index(drop=True)
    ],
    axis=1
)

print(features5.shape, features52.shape)

df_pred1, df_pred2 = get_prediction(features52, regr2008, target5)
df_pred1.shape, df_pred2.shape

In [None]:
#df_submit[year2col(2008)+' (old)'] = df_submit[year2col(2008)]
# Overwrite the 2008 submission column with the top-feature model's predictions.
df_submit[year2col(2008)] = df_pred2['value']
df_submit.head()

## compare to previous submission

In [None]:
# Earlier submission file, indexed by the same 'Unnamed: 0' row id.
df_previous = pd.read_csv('data/interim/submission_20180516_041403_score_0_05.csv').set_index('Unnamed: 0')
df_previous.head()

In [None]:
# Join this run with the previous submission on the row id; columns present in
# both get the suffix _x (this run) or _y (previous file).
diff_2008 = df_submit.merge(df_previous, left_index=True, right_index=True)
for yr in (2008, 2012):
    year_col = f'{yr} [YR{yr}]'
    diff_2008[f'diff {yr}'] = diff_2008[year_col + '_x'] - diff_2008[year_col + '_y']
diff_2008.head()

In [None]:
# Sorted differences to the previous submission, one curve per predicted year.
for diff_col, colour, label in (('diff 2008', 'r', '2008'), ('diff 2012', 'b', '2012')):
    plt.plot(diff_2008[diff_col].sort_values().values, colour, label=label)
plt.legend()
plt.show()

In [None]:
# First rows of the 5-year-ahead target frame.
target4.head()

In [None]:
# Last rows of the 5-year-ahead target frame.
target4.tail()

In [None]:
# Meta information of the first submission rows.
df_meta.loc[df_submit.head().index]

In [None]:
# Row id of one (country, series code) pair, looked up via a two-level index.
df_meta.reset_index().set_index(['Country Name', 'Series Code'])['Unnamed: 0']['Afghanistan']['2.1']

In [None]:
# (country, series code) -> row id lookup. It does not change inside the loops,
# so it is built once here instead of once per plotted indicator.
id_lookup = df_meta.reset_index().set_index(['Country Name', 'Series Code'])['Unnamed: 0']
# x positions of the history; assumes target4 covers the years 1981-2007 -- TODO confirm
history_years = pd.date_range('1981', '2007', freq='YS')

# One figure per (country, indicator): history in black, the submitted
# 2008 / 2012 predictions in red when the pair is part of the submission.
for country in ['Afghanistan', 'Poland']:
    for indicator in target4.columns:
        print(country, indicator)
        plt.scatter(
            history_years,
            target4[indicator].loc[:,country],
            color='black'
        )

        if country in id_lookup.index and indicator in id_lookup[country].index:
            row_id = id_lookup[country][indicator]
            if row_id in df_submit.index:
                for year in (2008, 2012):
                    plt.scatter(
                        f'{year}-01-01',
                        df_submit.loc[row_id, f'{year} [YR{year}]'],
                        color='red'
                    )

        plt.show()

## prepare submission file

In [None]:
# Keep the two prediction columns, replace missing predictions by 0, turn the
# row id back into a column with an empty header and write the submission.
prediction_cols = ['2008 [YR2008]', '2012 [YR2012]']
df_submit3 = df_submit[prediction_cols].fillna(value=0)
df_submit3 = df_submit3.reset_index()[['Unnamed: 0'] + prediction_cols]
df_submit3 = df_submit3.rename(columns={'Unnamed: 0': ''})
fn1, fn2 = make_submission(df_submit3)
fn1, fn2