In [None]:
# Print every local compute device TensorFlow reports (see the linked answer).
# http://stackoverflow.com/questions/44544766/ddg#44547144
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
import numpy as np
import pandas as pd
import time
import seaborn as sns
from matplotlib import pyplot as plt
import keras

In [None]:
def pd_read_csv(fn:str):
    """Read a CSV file and index it by its unnamed first column.

    The strings 'NO_LABEL' and '(blank)' are parsed as missing values.

    Args:
        fn: path of the CSV file to read.

    Returns:
        The DataFrame indexed by the "Unnamed: 0" column.

    Raises:
        AssertionError: if the "Unnamed: 0" column contains duplicated values.
    """
    id_column = "Unnamed: 0"
    frame = pd.read_csv(fn, na_values=['NO_LABEL', '(blank)'])
    # The id column becomes the index, so it must not repeat.
    has_repeated_ids = frame[id_column].duplicated().any()
    assert not has_repeated_ids
    return frame.set_index(id_column)

## Read data

In [None]:
# Load the training file and the test file with the shared reader.
train, test = (
    pd_read_csv(csv_path)
    for csv_path in ('data_in/TrainingData.csv', 'data_in/TestData.csv')
)

In [None]:
# Peek at the first values of one column as a quick check of the load.
# train.columns
train['Position_Type'].head()

In [None]:
# Feature columns: those present in both train and test, excluding 'FTE' and 'Total'; sorted by name.
features = sorted((set(train.columns) & set(test.columns)) - {'FTE', 'Total'})
features

In [None]:
# Target columns: those that exist in train but not in test; sorted by name.
target = sorted(set(train.columns).difference(test.columns))
target

In [None]:
# Add the target columns to the test frame, filled with NaN, so that both
# frames expose the same target column names.
for col in target:
    test[col] = np.nan

In [None]:
# (rows, columns) of the train and test frames.
train.shape, test.shape

In [None]:
# Flag the origin of each row, then stack train on top of test into one frame.
# Note: the flag column is also added in place to `train` and `test`.
train['is_holdout'] = False
test ['is_holdout'] = True
df = pd.concat([train,test], axis=0)
df.shape

In [None]:
# Columns of df that are neither features nor targets.
# The list order comes from set iteration and is therefore not guaranteed.
meta = list(set(df.columns) - set(features) - set(target))
meta

In [None]:
# Shapes of the full frame and of its feature / target / meta column subsets.
df.shape, df[features].shape, df[target].shape, df[meta].shape

## Read labels

In [None]:
import yaml

# `labels` is consumed as a mapping {target column name: list of class names}.
# The file is opened in a `with` block so the handle is closed, and parsed with
# safe_load: yaml.load without an explicit Loader can construct arbitrary Python
# objects and is rejected by PyYAML >= 6.
with open("labels.yml", 'r') as labels_file:
    labels = yaml.safe_load(labels_file)

In [None]:
# One prediction column per (label, class) pair, named "<label>__<class>",
# e.g. "Function__Aides Compensation".
prediction_names = [
    "%s__%s"%(label_name, class_name)
    for label_name, class_names in labels.items()
    for class_name in class_names
]

# Spot-check one expected name before sorting.
assert 'Function__Aides Compensation' in prediction_names
prediction_names.sort()
prediction_names[:5]

## Convert target to label-like columns

In [None]:
# Create every prediction column up front, initialised to 0.
for column_name in prediction_names:
    df[column_name] = 0

In [None]:
# Each "<label>__<class>" column becomes a boolean indicator: True where the
# row's value in the <label> column equals <class>.
for label_name, class_names in labels.items():
    label_values = df[label_name]
    for class_name in class_names:
        df["%s__%s"%(label_name, class_name)] = label_values == class_name

In [None]:
# 'NO_LABEL' was read as NaN, so a row whose label matched none of the classes has
# no indicator set; mark such rows in the label's "<label>__NO_LABEL" column.
for dependent in labels.keys():
    column_prefix = "%s__"%dependent
    target_sub = [column for column in df.columns if column.startswith(column_prefix)]
    nothing_set = ~df[target_sub].any(axis=1)
    df.loc[nothing_set, '%s__NO_LABEL'%dependent] = True

In [None]:
# Compare the raw 'Function' value with a few of its indicator columns.
df[['Function', 'Function__Teacher Compensation', 'Function__Substitute Compensation', 'Function__NO_LABEL']].head()

In [None]:
# Shapes of: the full frame, the rows whose prediction columns are all null,
# and the rows where no prediction column is set.
df.shape, df[pd.isnull(df[prediction_names]).all(axis=1)].shape, df.loc[~df[prediction_names].any(axis=1)].shape

In [None]:
# No prediction column may contain nulls.
# `not` is used rather than `~`: on a plain Python bool `~` is bitwise
# (~True == -2, which is truthy), so the assertion could never fail in that case.
assert not pd.isnull(df[prediction_names]).any().any()

In [None]:
# Store the indicator columns as unsigned 8-bit integers.
df[prediction_names] = df[prediction_names].astype('uint8')

## Factorize features

In [None]:
# Integer-encode every feature column; the timestamps printed before and after
# show how long the step takes.
print(time.ctime())
# +1 for the -1 that factorize assigns to missing values (keras Embedding supports [0,N) )
df2 = df[features].apply(lambda column: pd.factorize(column)[0], axis=0) + 1
print(time.ctime())

In [None]:
# Largest and smallest code over all encoded feature columns.
df2.max().max(), df2.min().min()

In [None]:
# Per-feature vocabulary size: largest code + 1 (the extra slot accounts for code 0),
# ordered by feature name.
vocab_size = (df2.max(axis=0) + 1).sort_index()
vocab_size

In [None]:
# The largest value across all prediction columns must be exactly 1.
assert df[prediction_names].max().max()==1

## split hold-out

In [None]:
from sklearn.model_selection import train_test_split

# Only rows that are not flagged as hold-out take part in training and evaluation.
not_holdout = ~df['is_holdout']
x = df2[not_holdout]
y = df.loc[not_holdout, prediction_names] # .fillna(0)

# Fixed random_state keeps the 67/33 split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)


## keras embedding + Dense/LSTM

In [None]:
# Turn each split into a list with one value array per feature, in vocab_size.index
# order (the same order in which the model inputs are listed).
x_train, x_test = (
    [split[f].values for f in vocab_size.index]
    for split in (x_train, x_test)
)

In [None]:
# Label names in sorted order; this order defines the order of the model outputs.
label_keys = sorted(labels.keys())

# Turn each split into a list with one array per label, holding that label's
# "<label>__*" prediction columns.
y_train, y_test = (
    [split[[name for name in prediction_names if name.startswith("%s__"%f)]].values for f in label_keys]
    for split in (y_train, y_test)
)

In [None]:
# Number of label arrays and the shapes of the first two.
len(y_train), y_train[0].shape, y_train[1].shape

In [None]:
# Build the network: one single-integer input and one Embedding per feature, a shared
# dense trunk, then a two-layer dense branch and a softmax output per label.
from keras.layers import Embedding, Dense, Flatten, LSTM, Input, Concatenate, Add, Lambda
from keras.models import Sequential, Model
from keras import backend as K

# vocab_size = stats.shape[0]

# inputs = [Input(shape=(prob3.shape[1],)) for f in vocab_size.index]
# One named input per feature; each sample carries a single integer code.
inputs = {f: Input(shape=(1,), name=f) for f in vocab_size.index}

# embeddings = [Embedding(vocab_size[f], embedding_dim, input_length=prob3.shape[1]) for f in vocab_size.index]

# Only the first branch runs; the else branch (embedding width derived from the
# vocabulary size) is kept as a disabled alternative.
if True:
    embedding_dim = 3 # 12 # 2 # 64 # FIXME
    embeddings = [Embedding(vocab_size[f], embedding_dim, input_length=1)(inputs[f]) for f in vocab_size.index]
else:
    embeddings = [Embedding(vocab_size[f], max(3, vocab_size[f]//15//10), input_length=1)(inputs[f]) for f in vocab_size.index]

# Each Embedding takes an integer matrix of size (batch, input_length) whose values
# must lie in [0, vocab_size[f]) and yields (None, input_length, embedding_dim),
# where None is the batch dimension.

# Join the per-feature embeddings into one tensor.
x1 = Concatenate()(embeddings)

# Drop the length-1 axis so the Dense layers receive (batch, features).
x1 = Flatten()(x1)
#x1 = Dense( 500, activation='relu')(x1)
# x1 = Dense(  50, activation='relu')(x1)

# x1 = Dense(  50, activation='relu')(x1)
# x1 = Dense(  50, activation='relu')(x1)

# Shared trunk used by every label branch.
x1 = Dense( 50, activation='relu')(x1)
x1 = Dense( 50, activation='relu')(x1)

# Per-label branch: two Dense layers named "<label>_d1" and "<label>_d2".
o1 = {dependent: Dense(50, activation = 'relu', name="%s_d1"%dependent)(x1) for dependent in label_keys}
o1 = {dependent: Dense(50, activation = 'relu', name="%s_d2"%dependent)(o1[dependent]) for dependent in label_keys}

# One softmax output per label, sized by the number of classes listed for it in `labels`.
outputs = [Dense(len(labels[dependent]), activation = 'softmax', name="%s_out"%dependent)(o1[dependent]) for dependent in label_keys]

# Replace the dict by a list in vocab_size.index order, as expected by Model.
inputs = [inputs[f] for f in vocab_size.index]
model = Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
def multi_multiclass_logloss(y_true, y_pred):
    """Per-sample multi-class log loss for one output of the model.

    Computes ``-sum_c y_true[:, c] * log(y_pred[:, c])`` after clipping the
    predictions away from 0 and 1 so the logarithm stays finite.

    This is the same quantity the earlier
    ``K.batch_dot(K.transpose(y_true), K.log(p), axes=[0, 1])`` formulation produced,
    written as an element-wise product and a sum so that it does not depend on
    batch_dot accepting the batch axis (see keras-team/keras#9847).

    Args:
        y_true: target tensor of shape (batch, classes).
        y_pred: predicted probabilities, same shape as y_true.

    Returns:
        Tensor of shape (batch,) with one loss value per sample.
    """
    # Keep probabilities inside [epsilon, 1 - epsilon] to avoid log(0).
    y_pred2 = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
    return -K.sum(y_true * K.log(y_pred2), axis=-1)

# Compile with RMSprop and the custom log loss; the same loss function is applied
# to every model output. Accuracy is tracked as an additional metric.
# model.compile('rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.compile('rmsprop', loss=multi_multiclass_logloss, metrics=['acc'])

In [None]:
# Smoke test of the loss on all-ones tensors of shape (32, 37): shows the static
# shape of the result and its evaluated values.
# test K.batch_dot
# Need to specify axes due to issue
# https://github.com/keras-team/keras/issues/9847
x_batch = K.ones(shape=(32, 37))
y_batch = K.ones(shape=(32, 37))
# xy_batch_dot = K.batch_dot(K.transpose(x_batch), y_batch, axes=[0, 1])
xy_batch_dot = multi_multiclass_logloss(x_batch, y_batch)
K.int_shape(xy_batch_dot), K.eval(xy_batch_dot)

In [None]:
# Train on the training split. Shuffling is disabled and 20% of the training
# samples are set aside by Keras for validation.
fit_options = dict(
    batch_size=32, #32*32, # 32, # FIXME
    epochs=300,
    verbose=2,
    validation_split=0.2,
    shuffle=False,
)
model.fit(x_train, y_train, **fit_options)

In [None]:
# Loss and metric values on the held-back test split.
model.evaluate(x_test, y_test)

## argmax accuracy

In [None]:
# Arg-max accuracy per row.
# At this point x_test is a list of per-feature arrays and y_test a list of per-label
# (rows, classes) arrays, so they are passed / compared as lists. A row scores 1 only
# when the arg-max class matches the actual class for every label.
y_pred = model.predict(x_test)
if not isinstance(y_pred, list):
    # A model with a single output returns a bare array instead of a list.
    y_pred = [y_pred]

all_labels_correct = np.ones(y_test[0].shape[0], dtype=bool)
for actual, predicted in zip(y_test, y_pred):
    all_labels_correct &= np.argmax(actual, axis=1) == np.argmax(predicted, axis=1)

my_score = all_labels_correct.astype('uint8')

In [None]:
# Number of rows scored 1, total number of rows, and the integer percentage.
sum(my_score), my_score.shape[0], sum(my_score)*100 // my_score.shape[0]

## Spatial comparison

In [None]:
# For the first label (label_keys[0]) draw one figure per class (at most 10): the
# predicted probability as a line and the actual indicator as dots, over the first
# n_show test rows.
n_show = 1000
# Predictions do not change between figures, so they are computed once.
y_pred = model.predict(x_test)
y_pred2 = y_pred[0]
y_test2 = y_test[0]
# Column names of the first label, selected the same way as when y_test was built.
first_label_columns = [x for x in prediction_names if x.startswith("%s__"%label_keys[0])]

for i in range(min(10, y_test2.shape[1])):
    plt.figure(figsize=(20,3))
    plt.plot(y_pred2[:n_show,i], label='pred')
    #plt.plot(sum_pred, label='sum_pred', alpha=0.2)
    plt.plot(y_test2[:n_show,i], '.', label='actual')
    plt.legend(loc='best')
    plt.title(first_label_columns[i])

    # Fixed y-range so all figures are comparable.
    axes = plt.gca()
    axes.set_ylim([-.1,1.1])
    plt.show()

In [None]:
# Row 0 of the first label: sum of the actual indicators and sum of the predicted
# probabilities (the outputs use softmax, so the latter is expected to be ~1).
# y_test is a list of per-label arrays here, hence the [label][row] indexing; only
# the first row is pushed through the model.
first_row_pred = model.predict([feature_values[:1] for feature_values in x_test])
y_test[0][0].sum(), first_row_pred[0][0].sum() # , y_pred[0]

## temporal comparison

In [None]:
# Side-by-side bar charts of actual vs. predicted class distribution for the first
# rows of the test split, for the label named by k2.
y_pred = model.predict(x_test)
# Model outputs follow label_keys order; index them by label name.
y_pred = {label_keys[i]: y_pred[i] for i in range(len(label_keys))}

k2 = 'Function'
y_pred2 = y_pred[k2]
# Pick the actual values of the same label by name rather than by a fixed position,
# so that changing k2 cannot pair predictions with another label's targets.
y_test2 = y_test[label_keys.index(k2)]

# Show at most 15 rows (fewer when the test split is smaller).
for i in range(min(15, y_pred2.shape[0])):
    plt.figure(figsize=(10,3))

    plt.subplot(121)
    plt.bar(x=range(y_pred2.shape[1]), height=y_test2[i])
    plt.title('%s. actual, argmax=%s'%(i,np.argmax(y_test2[i])))
    axes = plt.gca()
    axes.set_ylim([-.1,1.1])

    plt.subplot(122)
    plt.bar(x=range(y_pred2.shape[1]), height=y_pred2[i])
    plt.title('%s. prediction, argmax=%s'%(i,np.argmax(y_pred2[i])))
    axes = plt.gca()
    axes.set_ylim([-.1,1.1])

    # plt.title(y_test.index[i])

    plt.show()

In [None]:
i=6
# y_test is a list of per-label arrays at this point (it has no .iloc), so row i
# is gathered from every label array and joined into one vector.
np.concatenate([label_block[i] for label_block in y_test])

## Prepare submission

In [None]:
# Add a 'joined' column to the test frame and print the shape before and after.
# NOTE(review): `calc_joined` is not defined or imported in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel — restore the helper or
# remove this section.
print(test.shape)
test['joined'] = calc_joined(test)
print(test.shape)

In [None]:
# Shape of the test frame vs. the test rows whose 'joined' value also occurs in train.
# NOTE(review): no visible cell assigns train['joined'] — confirm where it is created.
test.shape, test[test['joined'].isin(train['joined'])].shape

In [None]:
# Shape of the feature columns of the test frame.
test[features].shape

In [None]:
# First two rows of the test features.
test[features].head(n=2)

In [None]:
# First two values of the 'joined' column (requires the cell that creates it to have run).
test['joined'].head(n=2)

In [None]:
# Shape of the training rows whose Sub_Object_Description equals this exact string.
train[train['Sub_Object_Description']=="Line Item that is paid with Campus' money"].shape

## RF