In [None]:
# http://stackoverflow.com/questions/44544766/ddg#44547144
# Print the compute devices TensorFlow reports for this machine
# (presumably to confirm whether a GPU is visible before training).
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
import numpy as np
import pandas as pd
import time
import seaborn as sns
from matplotlib import pyplot as plt
import keras

In [None]:
def pd_read_csv(fn:str):
    """Read a CSV into a DataFrame, treating 'NO_LABEL' and '(blank)' cells as missing values."""
    missing_markers = ['NO_LABEL', '(blank)']
    frame = pd.read_csv(fn, na_values=missing_markers)
    return frame

## Peek at train

In [None]:
# Load the training CSV and index it by its unnamed first column,
# after checking that this column contains no repeated values.
train = pd_read_csv('data_in/TrainingData.csv')
assert train['Unnamed: 0'].is_unique
train = train.set_index(keys="Unnamed: 0")
train.shape

In [None]:
# Column labels of the training frame.
train.columns

In [None]:
# First few training rows.
train.head()

## Peek at test

In [None]:
# Load the test CSV and index it by its unnamed first column,
# after checking that this column contains no repeated values.
test = pd_read_csv('data_in/TestData.csv')
assert test['Unnamed: 0'].is_unique
test = test.set_index(keys="Unnamed: 0")
test.shape

In [None]:
# Column labels of the test frame.
test.columns

In [None]:
# First few test rows.
test.head()

## Diff columns

In [None]:
# Columns that exist in only one of the two frames, as (train-only, test-only).
train_only_columns = set(train.columns).difference(test.columns)
test_only_columns = set(test.columns).difference(train.columns)
train_only_columns, test_only_columns

# Some guessing

Hypothesis: if `Function` is "Teacher Compensation", then `Use` is always "Instruction".

In [None]:
# Distinct 'Use' values among training rows whose 'Function' is 'Teacher Compensation'.
is_teacher_compensation = train['Function'] == 'Teacher Compensation'
set(train.loc[is_teacher_compensation, 'Use'].values)

Open question: if `Position_Extra` is "TEACHER", is `Function` always "Teacher Compensation"?

In [None]:
# How often each 'Function' label occurs among rows with Position_Extra == 'TEACHER'.
is_teacher = train['Position_Extra'] == 'TEACHER'
train.loc[is_teacher, 'Function'].value_counts()

In [None]:
# Most frequent 'Position_Extra' values among rows labelled 'Teacher Compensation'.
is_teacher_compensation = train['Function'] == 'Teacher Compensation'
train.loc[is_teacher_compensation, 'Position_Extra'].value_counts().head()

In [None]:
# Number of TEACHER rows in train (as a shape tuple) next to the shape of the whole frame.
is_teacher = train['Position_Extra'] == 'TEACHER'
train.loc[is_teacher, 'Function'].shape, train.shape

In [None]:
# Shape of the TEACHER subset of test next to the shape of the whole frame.
is_teacher = test['Position_Extra'] == 'TEACHER'
test.loc[is_teacher].shape, test.shape

## Read labels

In [None]:
import yaml

# labels.yml is read as a mapping from a target name (e.g. 'Function') to the
# collection of label values for that target.
# safe_load is used because yaml.load() without an explicit Loader is deprecated
# (and a TypeError in PyYAML >= 6), and the safe loader cannot construct
# arbitrary Python objects. The `with` block closes the file handle.
with open("labels.yml", 'r') as labels_file:
    labels = yaml.safe_load(labels_file)

In [None]:
# Flatten the label mapping into "<target>__<label>" names,
# e.g. 'Function' + 'Aides Compensation' -> 'Function__Aides Compensation'.
prediction_names = [
    "%s__%s" % (target, label)
    for target, target_labels in labels.items()
    for label in target_labels
]

assert 'Function__Aides Compensation' in prediction_names
prediction_names[:5]

## Calculate counts

In [None]:
# Columns present in both train and test, excluding 'FTE' and 'Total'.
# Sorted so that the column order (and hence the joined key) is reproducible.
shared_columns = set(train.columns) & set(test.columns)
features = sorted(shared_columns - {'FTE', 'Total'})
features

In [None]:
# Feature values of the training row with index label 134338, missing values shown as ''.
train[features].loc[134338].fillna('')

In [None]:
# Build one key per row: each feature value with spaces and double quotes removed
# (missing values become ''), concatenated with '~' in `features` order.
train['joined'] = (
    train[features]
    .fillna('')
    .apply(
        lambda row: "~".join(cell.replace(' ', '').replace('"', '') for cell in row),
        axis=1,
    )
)

In [None]:
# Number of training rows per distinct joined key.
stats = train['joined'].value_counts()

In [None]:
# The 15 most frequent joined keys with their row counts.
stats.sort_values(ascending=False).head(n=15).reset_index()

In [None]:
# Re-order the counts by key so that later frames sorted by the same key line up with it.
stats = stats.sort_index()
stats.head()

In [None]:
# Sanity check: look a key up by label and get back its position in the sorted index.
k1=stats.index[10]
stats.index.get_loc(k1)

In [None]:
# Cumulative number of training rows covered as keys are added from most to least frequent.
# Title shows: total rows counted, number of distinct keys.
rows_covered = stats.sort_values(ascending=False).values.cumsum()
plt.plot(rows_covered)
plt.title("%s, %s"%(stats.values.sum(), stats.shape[0]))
plt.show()

In [None]:
# Most frequent joined key.
# idxmax() reads the key straight from the index. The previous form relied on
# reset_index() producing a column literally named 'index', which is not the case
# when the value_counts index carries a name (pandas >= 2.0), and on the tie
# order of an unstable sort. idxmax() returns the first maximal key.
k1 = stats.idxmax()

In [None]:
# Shape of the subset of train whose joined key equals k1, next to the total row count.
train[train['joined']==k1].shape, train.shape[0]

In [None]:
# Distribution of 'Function' labels among the rows that share the key k1.
train[train['joined']==k1]['Function'].value_counts().reset_index()

In [None]:
import os

# Path of the pickle used to cache the per-key Function count table.
# The version suffix is bumped whenever the way the table is built changes,
# so that a stale cache is not picked up.

# The 1st version turned out to have indices that were not in stats.index (cause unknown)
# prob_fn = 'data_out/t1_probabilities_function.pkl'

# The 2nd version fixed the above
# prob_fn = 'data_out/t1_probabilities_function_v2.pkl'

# The 3rd version sorts `features` so that the joined keys are reproducible
prob_fn = 'data_out/t1_probabilities_function_v3.pkl'

# Shows whether a cached table already exists.
os.path.exists(prob_fn)

In [None]:
# Build (or load from cache) the table of 'Function' label counts per joined key:
# one row per label in labels['Function'], one column per key in stats.index.
if os.path.exists(prob_fn):
    # NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook.
    probabilities = pd.read_pickle(prob_fn)
else:
    # Count every (Function, joined) pair in a single pass over train instead of
    # filtering the whole frame once per key. This also replaces
    # `probabilities[k1].update(...)`, a chained in-place write that does not
    # reach the parent frame when pandas Copy-on-Write is active.
    probabilities = (
        pd.crosstab(train['Function'], train['joined'])
        # Keep exactly the expected labels/keys; combinations never seen count as 0.
        .reindex(index=labels['Function'], columns=stats.index, fill_value=0)
        .rename_axis(index=None)
        .astype(float)
    )

    # save
    probabilities.to_pickle(prob_fn)

In [None]:
# Convert counts to proportions per key (each column divided by its own total),
# then flip to one row per key and sort the rows by key.
# NOTE: this cell rebinds `probabilities`; run it exactly once per kernel session.
counts_per_key = probabilities.sum(axis=0)
probabilities = (
    probabilities
    .div(counts_per_key, axis='columns')
    .transpose()
    .sort_index()
)

probabilities.shape

In [None]:
# stats and probabilities must be indexed by exactly the same set of keys.
stats_keys = set(stats.index)
probability_keys = set(probabilities.index)
assert stats_keys == probability_keys
assert not (stats_keys - probability_keys)
assert not (probability_keys - stats_keys)

## Bring back original set of columns
These are the fields that got joined with ~

In [None]:
# One row per distinct joined key (first occurrence in train), holding the original
# feature values behind that key, indexed and sorted by the key.
first_row_per_key = train.drop_duplicates(subset=['joined'])
vocabulary = first_row_per_key[features + ['joined']].set_index('joined').sort_index()

In [None]:
# First two entries of the key -> feature-values lookup.
vocabulary.head(n=2)

In [None]:
# probabilities and vocabulary must be indexed by exactly the same set of keys.
probability_keys = set(probabilities.index)
vocabulary_keys = set(vocabulary.index)
assert probability_keys == vocabulary_keys
assert not (probability_keys - vocabulary_keys)
assert not (vocabulary_keys - probability_keys)

In [None]:
# Attach the original feature columns to each key's label proportions by joining
# the two frames on their (joined-key) indexes.
# An earlier attempt merged on the 'joined' column and re-set the index afterwards,
# as recommended in pandas github issue 7632:
# https://github.com/pandas-dev/pandas/issues/7632#issuecomment-316806258
# prob2 = probabilities.merge(vocabulary, left_index=True, right_on='joined', how='left').set_index('joined')
prob2 = probabilities.merge(vocabulary, left_index=True, right_index=True, how='left') # .set_index('joined')
# Shapes before and after the merge, shown side by side for comparison.
probabilities.shape, prob2.shape # , prob2.head(n=2), prob2.head(n=2).index # , train.loc[70455]

In [None]:
# Feature columns of the merged frame for the first two keys.
prob2[features].sort_index().head(n=2)

In [None]:
# Integer-encode each feature column independently, timing the step.
# pd.factorize marks missing values with -1, so adding 1 shifts all codes to be
# non-negative (keras Embedding supports indices in [0, N)).
print(time.ctime())
prob3 = prob2[features].apply(lambda column: pd.factorize(column)[0], axis=0) + 1
print(time.ctime())

In [None]:
# Largest and smallest code over all encoded feature columns.
prob3.max().max(), prob3.min().min()

In [None]:
# Shapes of the encoded features and of the label proportions, for comparison.
prob3.shape, probabilities.shape

In [None]:
# Largest code per feature column.
prob3.max(axis=0)

## split hold-out

In [None]:
from sklearn.model_selection import train_test_split

# Inputs: integer-encoded features. Targets: label proportions with NaN replaced by 0.
x = prob3
y = probabilities.fillna(0)

# Hold out 33% of the keys; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.33,
    random_state=42,
)

## keras embedding + Dense/LSTM

In [None]:
# Number of distinct codes per feature: codes start at 0, so the count is max code + 1.
vocab_size = prob3.max(axis=0) + 1 # +1 for the 0

In [None]:
# Per-feature vocabulary sizes.
#vocab_size.index, prob3.columns
vocab_size

In [None]:
# Number of label columns, i.e. the width of the model's probability output.
probabilities.shape[1]

In [None]:
from keras.layers import Embedding, Dense, Flatten, LSTM, Input, Concatenate, Add, Lambda
from keras.models import Sequential, Model
from keras import backend as K

# One single-integer input per feature column, named after the feature.
input_by_feature = {f: Input(shape=(1,), name=f) for f in vocab_size.index}

# Set to an int (e.g. 12) to give every feature the same embedding width;
# None sizes each embedding from its own vocabulary (vocab_size // 15).
fixed_embedding_dim = None


def _embedding_width(n_codes):
    """Return the embedding width for a feature with `n_codes` distinct codes (always >= 1)."""
    if fixed_embedding_dim is not None:
        return fixed_embedding_dim
    # n_codes // 15 is 0 for features with fewer than 15 codes; a zero-width
    # embedding is invalid, so clamp to at least 1.
    return max(1, n_codes // 15)


# Each Embedding takes the feature's integer code and produces a vector per row;
# codes must lie in [0, vocab_size[f]).
embeddings = [
    Embedding(int(vocab_size[f]), _embedding_width(int(vocab_size[f])), input_length=1)(input_by_feature[f])
    for f in vocab_size.index
]

# Join all feature embeddings along the last axis and flatten to one vector per row.
x1 = Concatenate()(embeddings)

x1 = Flatten()(x1)
x1 = Dense(1000)(x1)
# Output 1: one sigmoid score per label column.
o1 = Dense(probabilities.shape[1], activation = 'sigmoid', name='pred_prob')(x1)
# Output 2: the sum of those scores per row, exposed so it can be given its own training target.
o2 = Lambda(lambda x: K.sum(x, axis=1, keepdims=True), name='sum_prob')(o1)
outputs = [o1, o2]

# Model inputs as a list, in the same feature order used when feeding data.
inputs = [input_by_feature[f] for f in vocab_size.index]
model = Model(inputs=inputs, outputs=outputs)
model.compile('rmsprop', 'mse')
model.summary()

In [None]:
# Train the model: one integer array per feature as input; targets are the label
# proportions for 'pred_prob' and a column of ones for 'sum_prob'.
fit_inputs = [x_train[f].values for f in vocab_size.index]
fit_targets = [y_train, np.ones(shape=(y_train.shape[0], 1))]
model.fit(
    fit_inputs,
    fit_targets,
    epochs=100,
    verbose=2,
    validation_split=0.2,
    shuffle=False,
)

In [None]:
# Evaluate on the hold-out split, with the same input/target layout as in training.
model.evaluate([x_test[f].values for f in vocab_size.index], [y_test, np.ones(shape=(y_test.shape[0],1))])

## argmax accuracy

In [None]:
# Per hold-out row: 1 if the label with the highest predicted score equals the
# label with the highest actual proportion, else 0.
y_pred, sum_pred = model.predict([x_test[f].values for f in vocab_size.index])
actual_top = y_test.idxmax(axis=1).values
predicted_top = probabilities.columns[np.argmax(y_pred, axis=1)].values
my_score = (actual_top == predicted_top).astype('uint8')

In [None]:
# (number of hold-out rows with a matching top label, number of hold-out rows).
# ndarray.sum() accumulates in a wide integer type; the builtin sum() adds uint8
# items one by one and can wrap around at 256 under NumPy 2 promotion rules.
int(my_score.sum()), my_score.shape[0]

## Spatial comparison

In [None]:
# For every label: predicted score vs. actual proportion over the first n_show hold-out rows,
# with the per-row sum of predicted scores drawn faintly for reference.
n_show = 1000
# The prediction does not depend on the label being plotted, so run it once
# instead of once per figure.
y_pred, sum_pred = model.predict([x_test[f].values[:n_show] for f in vocab_size.index])

for i in range(probabilities.shape[1]):
    plt.figure(figsize=(20,3))
    plt.plot(y_pred[:n_show,i], label='pred')
    plt.plot(sum_pred, label='sum_pred', alpha=0.2)
    plt.plot(y_test.iloc[:n_show,i].values, '.', label='actual')
    plt.legend(loc='best')
    plt.title(y_test.columns[i])

    axes = plt.gca()
    axes.set_ylim([-.1,1.1])
    plt.show()

In [None]:
# Sum of actual proportions vs. sum of predicted scores for the first hold-out row.
y_test.iloc[0].sum(), y_pred[0].sum() # , y_pred[0]

## temporal comparison

In [None]:
# For the first five hold-out rows: actual label proportions (left) next to the
# predicted scores (right), one bar per label.
for i in range(5):
    plt.figure(figsize=(10,3))

    # Left panel shows y_test, i.e. the actual values (the titles were previously swapped).
    plt.subplot(121)
    plt.bar(x=range(y_pred.shape[1]), height=y_test.iloc[i].values)
    plt.title('actual')

    # Right panel shows y_pred, i.e. the model output.
    plt.subplot(122)
    plt.bar(x=range(y_pred.shape[1]), height=y_pred[i])
    plt.title('prediction')

    # plt.title(y_test.index[i])

    plt.show()

## RF

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

# Baseline for comparison: a random forest fitted on the same integer-encoded
# features and label proportions. Trees are grown without a depth limit, all CPU
# cores are used (n_jobs=-1), and the seed is fixed for repeatability.
regr = RandomForestRegressor(max_depth=None, random_state=0, verbose=2, n_jobs=-1)


regr.fit(x_train, y_train)

In [None]:
# Feature names followed by the forest's importance score for each feature.
print(prob3.columns, regr.feature_importances_)

In [None]:
# Score of the forest on the hold-out split (the regressor's default score metric).
print(regr.score(x_test, y_test))

In [None]:
# Forest predictions for the first three hold-out rows.
y_pred_rf = regr.predict(x_test[:3]) # .round(1)

In [None]:
# Per-row sum of the predicted values, the first predicted row, and the matching actual rows.
y_pred_rf.sum(axis=1), y_pred_rf[0], y_test[:3].values