Same as 0.4 (which used linear regression), but here using a Random Forest.

In [None]:
import pandas as pd
import os
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import time

## Read data

In [None]:
# Star import of the project helpers; load_raw and imply_columns are used below.
# NOTE(review): make_submission (used in the last cell) presumably comes from
# here as well - confirm, and consider importing the names explicitly.
from src.features import *
# Load the raw training table (used as a pandas DataFrame in the cells below).
train = load_raw()
# `cols` is indexed later with cols['features'] to get the per-year value columns.
cols = imply_columns(train)

In [None]:
# Guard: 'Unnamed: 0' must be unique because it becomes the row index next.
assert not train['Unnamed: 0'].duplicated().any()

In [None]:
# Index rows by the 'Unnamed: 0' id so they can be looked up via the submission ids.
train = train.set_index('Unnamed: 0')

In [None]:
# Inspect the table dimensions and column labels.
train.shape, train.columns

In [None]:
# Total number of rows vs. number of rows with fewer than 5 missing values
# across the feature (year) columns.
train.shape[0], np.count_nonzero(train[cols['features']].isnull().sum(axis=1) < 5)
# Scratch lookups kept for reference:
# pd.isnull(train[cols['features']]).sum(axis=1).sort_values(ascending=True).head(n=20)
# train.iloc[0,'1972 [YR1972]']
# pd.isnull(train.loc[0,'1972 [YR1972]'])
# train.head()
# train.loc[131876]

## read submission file

In [None]:
# Load the submission template and index it by 'Unnamed: 0', the same id that
# indexes `train`.  set_index (rather than read_csv's index_col) keeps the index
# name 'Unnamed: 0', which the final cell selects after reset_index().
df_submit = pd.read_csv('data/raw/submission.csv').set_index('Unnamed: 0')
df_submit.head()

In [None]:
# Number of distinct country names among the rows listed in the submission file.
len(set(train.loc[df_submit.index]['Country Name']))

In [None]:
# Distinct series codes among the rows listed in the submission file.
set(train.loc[df_submit.index]['Series Code'])

In [None]:
# De-duplicated submission ids in ascending order; show the first ten.
submit_inds = sorted(set(df_submit.index))
submit_inds[:10]

## preprocess data to backfill nan

In [None]:
# Map each year column label to its short form by dropping the first six
# characters and the last one, e.g. '1972 [YR1972]' -> 'YR1972'.  The shared
# 'YR' prefix is what wide_to_long uses as the stub name further down.
my_rename = {x: x[6:-1] for x in cols['features']}
train.rename(columns=my_rename, inplace=True)

In [None]:
# Keep the feature list in step with the renamed columns, in ascending order.
cols['features'] = sorted(my_rename.values())
cols['features'][:5]

In [None]:
# Reshape from one column per year to one row per (id, year): the 'YR<year>'
# columns collapse into a single 'YR' column and the numeric suffix becomes 'year'.
train = pd.wide_to_long(train.reset_index(), stubnames=["YR"], i="Unnamed: 0", j="year")
# Move 'year' out of the MultiIndex so only 'Unnamed: 0' remains as the index.
train = train.reset_index(level='year')

In [None]:
# The stub column produced by wide_to_long is called 'YR'; rename it to 'value'
# because it now holds the observation for the row's year.
train.rename(columns={'YR': 'value'}, inplace=True)

In [None]:
# Make sure every year is a plain integer, then peek at the first few.
train['year'] = train['year'].apply(int)
train['year'].head().values

In [None]:
# Sanity check of the wide-to-long reshape: pull out a single country/series
# pair and index it by year so it can be plotted as a time series.
country = 'Kenya' # 'Poland'
series = 'Net taxes on products (current LCU)'
subtrain = train.loc[
    (train['Country Name'] == country) & (train['Series Name'] == series)
].set_index('year')
subtrain.head()

In [None]:
# First few values of the selected series, indexed by year.
subtrain['value'].head()

In [None]:
# Plot the selected series against its index (year) and label it with the
# country and series names.
plt.plot(subtrain['value'])
plt.title("%s: %s"%(country, series))
plt.show()

## group and forward-fill

In [None]:
# Dimensions of the long-format table before any filling.
train.shape

In [None]:
# Toy check of the group-wise forward fill used on the real data below:
# rows are ordered by F, grouped by E, and gaps in B are filled from the
# previous row of the same group.  The result is aligned back on the original
# index when assigned.
# Uses GroupBy.ffill(): GroupBy.fillna(method='ffill') is deprecated in pandas 2.x.
df = pd.DataFrame([[np.nan, 2, np.nan, 0, 2, 2],
                   [3, 4, np.nan, 1, 1, 2],
                   [np.nan, np.nan, np.nan, 5, 1, 3],
                   [np.nan, 3, np.nan, 4, 1, 4],
                  ],
                  columns=list('ABCDEF'))
df['B'] = df.sort_values('F', ascending=True).groupby(['E'])['B'].ffill()
df

In [None]:
# Shape check before counting missing values.
train.shape

In [None]:
# Baseline: how many observations are missing before any filling.
n_nan_before = train['value'].isnull().sum()
print('before: number of nan: %s' % n_nan_before)

In [None]:
# around 15 seconds
# Work on a sorted copy (sort_values returns a new frame, `train` is untouched)
# so that rows within each country/series group are in chronological order.
print(time.ctime(), 'start group')
train2 = train.sort_values(by=['Country Name', 'Series Code', 'year'])
t_group = train2.groupby(by=['Country Name', 'Series Code'])
print(time.ctime(), 'end group')

In [None]:
# around 1 minute
# Forward-fill gaps within each (country, series) group.  `train2` was sorted by
# year before `t_group` was built, so each gap takes the most recent earlier value.
# Uses GroupBy.ffill(): GroupBy.fillna(method='ffill') is deprecated in pandas 2.x.
print(time.ctime(), 'start fill')
train2['value'] = t_group['value'].ffill()
print(time.ctime(), 'end fill')

In [None]:
# Missing observations that remain after the forward fill.
n_nan_after = train2['value'].isnull().sum()
print('after: number of nan: %s' % n_nan_after)

## Fill remaining nan with bfill

In [None]:
# group again and backfill this time ... this is kind of cheating
# (back-filling copies a later observation into earlier years of the same group)
# ~ 1 minute
# Uses GroupBy.bfill(): GroupBy.fillna(method='bfill') is deprecated in pandas 2.x.
print(time.ctime(), 'start fill')
t_group = train2.groupby(['Country Name', 'Series Code'])
train2['value'] = t_group['value'].bfill()
print(time.ctime(), 'end fill')
# Count what is still missing after both fill passes.
n_nan_after2 = pd.isnull(train2['value']).sum()
print('after: number of nan: %s'%n_nan_after2)

In [None]:
# Peek at the filled long-format table with the 'Unnamed: 0' id as a regular column.
train2.reset_index().head()

## pivot table

- temporal dimension: year
- spatial dimension: country/series pair

This results in a transpose of the original dataframe

In [None]:
# Reshape to one row per year and one column per 'Unnamed: 0' id, holding the
# filled 'value' for that id in that year.
train3 = train2.reset_index().pivot(index='year', columns='Unnamed: 0', values='value')
train3.head()

In [None]:
# Spot-check the last rows (latest years) of the columns labelled 16 and 559.
train3[[16,559]].tail()

In [None]:
# (number of years, number of ids) after the pivot.
train3.shape

## <strike>multi-variate OLS</strike> Random Forest

In [None]:
from sklearn import ensemble
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.feature_selection import SelectKBest, f_regression
# from sklearn.decomposition import PCA

# --- Build features and targets ------------------------------------------
# Abandoned approach (method 1): one country/series pair as the target and
# all other series as features:
# diabetes_X = diabetes_0[list(set(diabetes_0.columns) - set([index]))].values
# diabetes_y = train3[index].values
#
# Current approach (method 2): features are the submission series lagged by
# `n_shift` years (the first `n_shift` rows are dropped because they have no
# lagged value); targets are the first `n_focus` submission series.  Only the
# last `n_last` years are kept.
n_shift = 1
n_last = 7
n_focus = 10
diabetes_X = train3[submit_inds].shift(n_shift).iloc[n_shift:].tail(n_last)
diabetes_y = train3[submit_inds].iloc[n_shift:, :n_focus].tail(n_last)

# The year of each row, as an (n, 1) column to append to the features.
year1 = diabetes_X.reset_index()[['year']].values
print(year1.squeeze())

# --- Feature reduction experiments (disabled) ----------------------------
# Tried to limit multi-collinearity in two ways.
#
# 1) Keep the top-k features; this beat the PCA variant:
# http://scikit-learn.org/stable/auto_examples/plot_compare_reduction.html#sphx-glr-auto-examples-plot-compare-reduction-py
# mdl = SelectKBest(f_regression, k=40) # 20 and 40 are good
# diabetes_X = mdl.fit_transform(diabetes_X, diabetes_y)
#
# 2) PCA:
# # mdl = PCA(n_components='mle') # TODO report bug
# # mdl = PCA(n_components='mle', svd_solver = 'full') # mle not supported for n_samples < n_features
# mdl = PCA(n_components=20, svd_solver = 'full') # 20 is too small resulting in too large error
# diabetes_X = mdl.fit_transform(diabetes_X)

# Append the year as the last feature column (turns the frame into an ndarray).
print('diabetes_X.shape, year1.shape', diabetes_X.shape, year1.shape)
diabetes_X = np.hstack([diabetes_X, year1])

# --- Train/test split ------------------------------------------------------
# The last `n_test` years are held out; with n_test == 0 everything is used for
# training and no *_test variables are created.
n_test = 2 # FIXME: 0 2
if n_test > 0:
    diabetes_X_train, diabetes_X_test = diabetes_X[:-n_test], diabetes_X[-n_test:]
    diabetes_y_train, diabetes_y_test = diabetes_y.iloc[:-n_test], diabetes_y.iloc[-n_test:]
else:
    diabetes_X_train = diabetes_X
    diabetes_y_train = diabetes_y

# --- Model -------------------------------------------------------------------
# Random Forest in place of the earlier linear model:
# regr = linear_model.LinearRegression()
regr = ensemble.RandomForestRegressor(n_estimators=100, min_samples_split=4, verbose=0, n_jobs=-1)

# Fit on the training rows.
print('fit', diabetes_X_train.shape, diabetes_y_train.shape)
regr.fit(diabetes_X_train, diabetes_y_train)


In [None]:
def year2col(x):
    """Return the column label for year ``x``, e.g. 2008 -> '2008 [YR2008]'.

    ``x`` is formatted with ``%.0f``, i.e. as a float with no decimals.
    """
    label_args = (x,) * 2
    return "%.0f [YR%.0f]" % label_args


year2col(2008), year2col(2012)

In [None]:
# Roll the fitted model forward five steps from a base year, write each step
# into a copy of the submission frame, then plot the first ten submission ids.
df_submit2 = df_submit.copy()

# Add empty columns for the intermediate years; the 2008 and 2012 lines are
# commented out here.
# df_submit2[year2col(2008)] = np.nan
df_submit2[year2col(2009)] = np.nan
df_submit2[year2col(2010)] = np.nan
df_submit2[year2col(2011)] = np.nan
# df_submit2[year2col(2012)] = np.nan

# Pick the seed row (target values + its year) for the recursive prediction.
if n_test > 0:
    # re-predict since first test point
    n_pred2 = 1
    diabetes_y_pred2 = diabetes_y_test.iloc[:n_pred2, :n_focus]
    year2 = diabetes_y_test.reset_index()[['year']].iloc[:n_pred2]
else:
    # take last point as starting point of predictions
    # (no need to drop year column since using y_train and not X_train)
    diabetes_y_pred2 = diabetes_y_train[-1:]
    year2 = train3.reset_index()[['year']][n_shift:][-1:]

print('base year', year2)
# Each iteration appends a year column to the previous output, predicts the next
# step, and stores it under the column for year2+1+i_pred2.
# NOTE(review): `regr` was fit on len(submit_inds)+1 input columns (all lagged
# submission series plus the year) but is fed n_focus+1 columns here; this only
# matches if len(submit_inds) == n_focus - confirm.
# NOTE(review): the appended year is year2-1+i_pred2 while the result is stored
# under year2+1+i_pred2 - confirm the intended offset against how the training
# rows pair lagged values with their year.
# NOTE(review): the prediction has n_focus values but is assigned to a whole
# column of df_submit2 - confirm the shapes are compatible.
for i_pred2 in range(5):
    diabetes_y_pred2 = np.concatenate([diabetes_y_pred2, year2-1+i_pred2], axis=1)
    diabetes_y_pred2 = regr.predict(diabetes_y_pred2)
    print('set year', year2.values[0]+1+i_pred2)
    df_submit2.loc[:,year2col(year2.values[0]+1+i_pred2)] = diabetes_y_pred2.transpose()


# One figure per submission id: training points in black, held-out points in
# green, and the rolled-forward predictions in orange.
for index in submit_inds[:10]: #[559, 618]:
    # print(train3.index[1:][:-1*n_test].shape, diabetes_y_train.shape, diabetes_y_train[:,submit_inds.index(index)].squeeze().shape)
    
    # Plot outputs
    if n_test > 0:
        plt.plot(diabetes_y_train[index], '.', color='black')
        plt.plot(diabetes_y_test[index], '.', color='green', alpha=0.5)
        # NOTE(review): `diabetes_y_pred` is not assigned in any cell visible in
        # this notebook; unless it is left over in the kernel this raises NameError.
        plt.scatter(train3.index[n_shift:][-1*n_test:], diabetes_y_pred[:,submit_inds.index(index)].squeeze(), color='red', alpha=0.5)
    else:
        # NOTE(review): diabetes_y_train is built with .iloc on a DataFrame, so the
        # ndarray-style [:, k] indexing below looks like a leftover - confirm.
        plt.scatter(train3.index[n_shift:], diabetes_y_train[:,submit_inds.index(index)].squeeze(), color='black')
        
    # print(year2)
    for i_pred2 in range(5):
        plt.scatter(year2+1+i_pred2, df_submit2.loc[index, year2col(year2.values[0]+1+i_pred2)].squeeze(), color='orange', alpha=0.5)

    plt.title(index)
    plt.show()


In [None]:
# Scratch check of slice semantics: x[:-0] equals x[:0] (empty) and x[-0:] is the
# whole list - presumably why n_test == 0 gets its own branch instead of being
# handled by slicing with a negative n_test.
x=[1,2,3,4,5]
x[:-1], x[-1:], x[:-0], x[-0:], x[:1]

In [None]:
# Inspect the submission frame after the predicted year columns were written.
df_submit2.head()

## prepare submission file

In [None]:
# Build the submission table: keep only the 2008 and 2012 columns, replace
# missing values with 0, turn the id index back into a column and give that
# column an empty header before handing the frame to make_submission.
df_submit3 = (
    df_submit2[['2008 [YR2008]', '2012 [YR2012]']]
    .copy()
    .fillna(value=0)
    .reset_index()
    .loc[:, ['Unnamed: 0', '2008 [YR2008]', '2012 [YR2012]']]
    .rename(columns={'Unnamed: 0': ''})
)
fn1, fn2 = make_submission(df_submit3)
fn1, fn2