In [None]:
# http://stackoverflow.com/questions/44544766/ddg#44547144
# Print the compute devices TensorFlow reports for this machine.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
import numpy as np
import pandas as pd
import time
import seaborn as sns
from matplotlib import pyplot as plt
import keras

In [None]:
def pd_read_csv(fn:str):
    """Read a CSV file and index it by its unnamed first column.

    The strings 'NO_LABEL' and '(blank)' are parsed as missing values.
    The id column ("Unnamed: 0") must not contain duplicates.

    Args:
        fn: path of the CSV file to read.

    Returns:
        A DataFrame indexed by the "Unnamed: 0" column.

    Raises:
        AssertionError: if the id column contains duplicated values.
    """
    id_column = "Unnamed: 0"
    frame = pd.read_csv(fn, na_values=['NO_LABEL', '(blank)'])
    repeated_ids = frame[id_column].duplicated()
    assert not repeated_ids.any()
    return frame.set_index(id_column)

## Read data

In [None]:
# Load both input files with the same reader so they share index and NA handling.
train, test = (
    pd_read_csv(csv_path)
    for csv_path in ('data_in/TrainingData.csv', 'data_in/TestData.csv')
)

In [None]:
# Quick look at the first rows of one column to sanity-check the load.
# train.columns
train['Position_Type'].head()

In [None]:
# Model inputs: columns present in both files, except 'FTE' and 'Total'.
shared_columns = set(train.columns) & set(test.columns)
features = sorted(shared_columns - {'FTE', 'Total'})
features

In [None]:
# Targets: columns that exist only in the training file, in sorted order.
target = sorted(set(train.columns).difference(test.columns))
target

In [None]:
# Give the test frame the train-only columns (all NaN) so both frames have
# the same columns before they are concatenated.
for col in target:
    test[col] = np.nan

In [None]:
# (rows, columns) of each frame.
train.shape, test.shape

In [None]:
# Tag each row with its origin (this adds a column to train and test in place),
# then stack the two frames row-wise into one frame.
train['is_holdout'] = False
test ['is_holdout'] = True
df = pd.concat([train,test], axis=0)
df.shape

In [None]:
# Columns that are neither features nor targets.
# The list comes from a set, so its order is not stable.
meta = list(set(df.columns) - set(features) - set(target))
meta

In [None]:
# Shapes of the full frame and of its feature / target / meta column groups.
df.shape, df[features].shape, df[target].shape, df[meta].shape

## Analyze how close the train and test features are

In [None]:
# For every feature, count the rows whose (non-null) value never occurs in the
# other data set:
#   train_minus_test - train rows whose value is absent from test
#   test_minus_train - test rows whose value is absent from train
# The counts use Series.isin on the non-null values, so they do not depend on
# the column names produced by value_counts().reset_index(), which differ
# between pandas versions.
results = []
for ff in features:
    train_values = train[ff].dropna()
    test_values  = test [ff].dropna()
    n_train_non_null = int(train_values.shape[0])
    train_minus_test = int((~train_values.isin(test_values.unique())).sum())
    test_minus_train = int((~test_values.isin(train_values.unique())).sum())
    out = {
        'feature': ff,
        'train all': train.shape[0],
        'train non-null': n_train_non_null,
        'train_minus_test': train_minus_test,
        'test_minus_train': test_minus_train,
    }
    # NOTE(review): the percentage divides unseen *test* rows by the *train*
    # non-null count (kept as in the original) - confirm this denominator.
    # A feature that is entirely null in train reports 0 instead of dividing by zero.
    if n_train_non_null:
        out['tmt_pct'] = test_minus_train * 100 // n_train_non_null
    else:
        out['tmt_pct'] = 0
    results.append(out)


# Explicit column list keeps the layout deterministic (and valid when
# `features` is empty).
results = pd.DataFrame(
    results,
    columns=['feature', 'train all', 'train non-null', 'train_minus_test', 'test_minus_train', 'tmt_pct'],
)
results = results.set_index('feature').sort_index()
results = results.astype('uint32')

In [None]:
# Show the per-feature comparison table with a fixed column order.
# results.shape
# results.head()
results[['train all', 'train non-null', 'train_minus_test', 'test_minus_train', 'tmt_pct']]

In [None]:
# Frequency of each distinct Sub_Object_Description value in the test file
# (the commented line is the same count for the train file).
# sod = train['Sub_Object_Description'].value_counts()
sod = test['Sub_Object_Description'].value_counts()

In [None]:
# The 20 most frequent values (value_counts sorts by descending count).
sod.head(n=20)

In [None]:
from matplotlib import pyplot as plt
# Bar chart of the value counts with the five most frequent values left out.
tail_counts = sod.iloc[5:].values
plt.bar(x=range(len(tail_counts)), height=tail_counts)
plt.show()

In [None]:
# Values that occur fewer than 10 times in the test file.
# sod[sod<10].shape[0], sod.shape[0]
sod[sod<10]

In [None]:
# Boolean mask of test rows whose Sub_Object_Description is present and
# contains "community" (case-insensitive).
# Built with vectorised string methods so the result is a real boolean Series:
# applying `~` to the Python bool returned by pd.isnull(scalar) yields the
# integers -1/-2, which would turn the mask into 0/1 integers and make the
# following `series[subtest]` lookup select by label instead of filtering.
sod_text = test['Sub_Object_Description']
subtest = sod_text.notnull() & sod_text.astype(str).str.lower().str.contains('community', regex=False)

In [None]:
# First few descriptions selected by `subtest`.
test['Sub_Object_Description'][subtest].head()

## Read target labels

In [None]:
import yaml
# labels.yml is used below as a mapping {target name: list of class names}.
# safe_load parses plain YAML data without constructing arbitrary Python
# objects and, unlike yaml.load, needs no explicit Loader argument.
# The context manager closes the file handle.
with open("labels.yml", 'r') as labels_file:
    labels = yaml.safe_load(labels_file)

In [None]:
# One output column per (target, class) pair, named "<target>__<class>",
# e.g. 'Function__Aides Compensation'; kept in sorted order.
prediction_names = sorted(
    "%s__%s"%(k, v2)
    for k, v1 in labels.items()
    for v2 in v1
)
assert 'Function__Aides Compensation' in prediction_names
prediction_names[:5]

## One-hot encode each target by its classes

In [None]:
# Pre-create every one-hot target column, initialised to False.
for p in prediction_names: df[p] = False

In [None]:
# Fill each "<target>__<class>" column with True where the row's target
# equals that class (NaN targets compare as False).
for target_name, class_names in labels.items():
    target_values = df[target_name]
    for class_name in class_names:
        df["%s__%s"%(target_name, class_name)] = target_values.eq(class_name)

In [None]:
# 'NO_LABEL' was parsed as NaN when the CSV was read, so rows with no class
# set for a target get that target's NO_LABEL column switched on here.
for dependent in labels:
    column_prefix = "%s__"%dependent
    target_sub = [name for name in df.columns if name.startswith(column_prefix)]
    has_no_class = ~df[target_sub].any(axis=1)
    df.loc[has_no_class, '%s__NO_LABEL'%dependent] = True

In [None]:
# Spot-check the raw 'Function' value against a few of its one-hot columns.
df[['Function', 'Function__Teacher Compensation', 'Function__Substitute Compensation', 'Function__NO_LABEL']].head()

In [None]:
# Shapes of: the full frame, the rows where every one-hot column is null,
# and the rows where no one-hot column is set.
df.shape, df[pd.isnull(df[prediction_names]).all(axis=1)].shape, df.loc[~df[prediction_names].any(axis=1)].shape

In [None]:
# No one-hot target cell may be null.
# `not` is used instead of `~`: bitwise negation of a Python bool gives -1/-2,
# both truthy, which would make the assertion pass unconditionally.
assert not pd.isnull(df[prediction_names]).any().any()

In [None]:
# Store the one-hot targets as 0/1 unsigned 8-bit integers.
df[prediction_names] = df[prediction_names].astype('uint8')

## Factorize features

In [None]:
print(time.ctime())
# Integer-encode every feature column. pd.factorize codes missing values as -1,
# so each code is shifted by one: 0 means "missing" and all codes are >= 0,
# as the keras Embedding layer requires indices in [0, N).
df_feat = df[features].apply(lambda column: pd.factorize(column)[0] + 1, axis=0)
print(time.ctime())

In [None]:
# Largest and smallest integer code over all features.
df_feat.max().max(), df_feat.min().min()

In [None]:
# Embedding table size per feature: the largest code plus one (code 0 counts
# too), ordered by feature name.
vocab_size = (df_feat.max(axis=0) + 1).sort_index()
vocab_size

In [None]:
# The largest one-hot value over all target columns must be exactly 1.
assert df[prediction_names].max().max()==1

## Split the non-holdout rows into train/test

In [None]:
from sklearn.model_selection import train_test_split

# Only the non-holdout rows are used for fitting and validation.
not_holdout = ~df['is_holdout']
x = df_feat[not_holdout]
y = df.loc[not_holdout, prediction_names] # .fillna(0)

# Fixed random_state keeps the split reproducible.
test_size=0.33
# test_size=0
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=test_size,
    random_state=42,
)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Target names in sorted order, so the order is the same on every run.
label_keys = sorted(labels.keys())


## Build a dummy equi-probable target

In [None]:
# For each target, a block with one column per class where every entry is
# 1 / (number of classes); blocks are joined side by side in label_keys order
# and given the same columns and index as y_train.
uniform_blocks = [
    np.full((y_train.shape[0], len(labels[key])), 1.0 / len(labels[key]))
    for key in label_keys
]
y_equi = pd.DataFrame(
    np.concatenate(uniform_blocks, axis=1),
    columns=y_train.columns,
    index=y_train.index,
)
y_equi.shape

In [None]:
# First rows of the uniform targets.
y_equi.head()
# y_equi[0].head()

## Keras embedding + Dense layers (no LSTM layer is used)

In [None]:
# Turn each feature matrix into a list of 1-D arrays, one per feature and in
# vocab_size order, because every feature feeds its own embedding with its own
# vocabulary. x_train / x_test are overwritten, so this cell runs once per split.
feature_order = list(vocab_size.index)
x_train = [x_train[name].values for name in feature_order]
x_test  = [x_test [name].values for name in feature_order]

In [None]:

def _split_by_target(frame):
    """Split a one-hot frame into one 2-D array per target, in label_keys order.

    Each array holds the "<target>__*" columns of that target, in
    prediction_names order (C columns for a target with C classes).
    """
    return [
        frame[[name for name in prediction_names if name.startswith("%s__"%key)]].values
        for key in label_keys
    ]

# convert each 2-D matrix of targets into K arrays (K = number of targets)
y_train = _split_by_target(y_train)
y_test  = _split_by_target(y_test)
y_equi  = _split_by_target(y_equi)

In [None]:
# Number of per-target arrays and the shapes of the first ones.
len(y_train), y_train[0].shape, y_train[1].shape, len(y_test), y_test[0].shape, len(y_equi), y_equi[0].shape

In [None]:
# First two rows of the uniform targets for the first two heads.
y_equi[0][:2], y_equi[1][:2]

In [None]:
# Build a multi-input / multi-output network:
#   one integer input + embedding per feature -> flatten -> per-feature dropout
#   -> concatenate -> dropout -> Dense(1000) -> Dense(300)
#   -> one softmax output per target (in label_keys order).
from keras.layers import Embedding, Dense, Flatten, LSTM, Input, Concatenate, Add, Lambda, Dropout
from keras.models import Sequential, Model
from keras import backend as K

# vocab_size = stats.shape[0]

# inputs = [Input(shape=(prob3.shape[1],)) for f in vocab_size.index]
# One scalar integer input per feature, named after the feature.
inputs = {f: Input(shape=(1,), name=f) for f in vocab_size.index}

# embeddings = [Embedding(vocab_size[f], embedding_dim, input_length=prob3.shape[1]) for f in vocab_size.index]

# `if True` selects the fixed-width embeddings; the else branch (width derived
# from the vocabulary size) is kept as an alternative but never runs.
if True:
    embedding_dim = 10 # 3 # 12 # 2 # 64 # FIXME
    embeddings = {f: Embedding(vocab_size[f], embedding_dim, input_length=1)(inputs[f]) for f in vocab_size.index}
else:
    embeddings = {f: Embedding(vocab_size[f], max(3, vocab_size[f]//15//10), input_length=1)(inputs[f]) for f in vocab_size.index}

# Each Embedding receives an integer input of shape (batch, 1) whose values
# must lie in [0, vocab_size[f]); its output has shape (batch, 1, embedding width),
# where batch is the (unspecified) batch dimension.

# dummy variable
x1= embeddings

#  flatten each feature since no sequences anyway
x1 = {f: Flatten(name="%s_flat"%f)(x1[f]) for f in vocab_size.index}

# dense layer for each feature
# x1 = {f: Dense(10, activation = 'relu', name="%s_d01"%f)(x1[f]) for f in vocab_size.index}
# x1 = {f: Dense( 3, activation = 'relu', name="%s_d02"%f)(x1[f]) for f in vocab_size.index}

# a dropout for each feature, this way, the network is more robust to dependencies on a single feature
x1 = {f: Dropout(0.3, name="%s_dropout"%f)(x1[f]) for f in vocab_size.index}

# Join the per-feature vectors (in vocab_size order) into one vector per row.
x1 = [x1[f] for f in vocab_size.index]
x1 = Concatenate()(x1)
# x1 = Flatten()(x1)
x1 = Dropout(0.3)(x1)

# Shared trunk used by every output head.
x1 = Dense(1000, activation='relu')(x1)
x1 = Dense( 300, activation='relu')(x1)

# x1 = Dense( 50, activation='relu')(x1)
# o1 = {dependent: Dense(50, activation = 'relu', name="%s_d1"%dependent)(x1) for dependent in label_keys}
# o1 = {dependent: Dense(50, activation = 'relu', name="%s_d2"%dependent)(o1[dependent]) for dependent in label_keys}

# One softmax head per target, sized by its number of classes; the list order
# (label_keys) is the order of the arrays returned by model.predict.
# outputs = [Dense(len(labels[dependent]), activation = 'softmax', name="%s_out"%dependent)(o1[dependent]) for dependent in label_keys]
outputs = [Dense(len(labels[dependent]), activation = 'softmax', name="%s_out"%dependent)(x1) for dependent in label_keys]

# Inputs as a list in vocab_size order, matching the x_train / x_test lists.
inputs = [inputs[f] for f in vocab_size.index]
model = Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
# Adam optimiser with categorical cross-entropy; accuracy is tracked as a metric.
# The commented lines are earlier optimiser / loss choices.
# model.compile('rmsprop', loss=multi_multiclass_logloss, metrics=['acc'])
# model.compile('rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.compile('adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Warm-up: fit the network to the uniform (equi-probable) targets first.
warmup_fit_options = {
    'batch_size': 32*32, # 32, # FIXME
    'epochs': 5,
    'verbose': 2,
    'validation_split': 0.2, # 0,
    'shuffle': True,
}
model.fit(x_train, y_equi, **warmup_fit_options)

In [None]:
# Predictions on the validation split (the commented line predicts on the training split).
# y_pred = model.predict(x_train, batch_size=32*32)
y_pred = model.predict(x_test,  batch_size=32*32)

In [None]:
# First five predicted values of the first output head for the first row.
y_pred[0][0,:5]

In [None]:
# evaluate on the real data
# (at this point the model has only been fitted to the equi-probable targets)
model.evaluate(x_train, y_train, batch_size = 32*32)

In [None]:
# Main training run on the real one-hot targets; the returned History object
# is kept for the loss plot.
train_fit_options = {
    'batch_size': 32*32, # 32, # FIXME
    'epochs': 30,
    # 'initial_epoch': 30,
    'verbose': 2,
    'validation_split': 0.2, # 0,
    'shuffle': True,
}
history = model.fit(x_train, y_train, **train_fit_options)

In [None]:
from matplotlib import pyplot as plt

# Training loss and validation loss per epoch, drawn on the same axes.
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name])
plt.show()

In [None]:
# Loss and accuracy on the validation split after the main training run.
model.evaluate(x_test, y_test, batch_size = 32*32)

## argmax accuracy

## Spatial comparison

In [None]:
# Restrict the per-class plots to the 'Function' target
# (use `sub_labels = labels` to plot every target).
# sub_labels = labels
sub_labels = {key: labels[key] for key in label_keys if key == 'Function'}

n_show = 1000
y_pred = model.predict(x_test)

In [None]:
# For every class of the selected targets, plot the predicted probability
# against the actual 0/1 label over the first validation rows.
for k, v1 in sub_labels.items():
    # y_pred / y_test are ordered by label_keys, so the head is looked up by
    # name; the position of k inside sub_labels is unrelated to that order.
    i = label_keys.index(k)
    # Columns of each head follow the sorted prediction_names (that is how the
    # y arrays were built), not the order in which labels[k] lists the classes.
    prefix = "%s__"%k
    class_names = [name[len(prefix):] for name in prediction_names if name.startswith(prefix)]
    y_pred2 = pd.DataFrame(y_pred[i], columns=class_names)
    y_test2 = pd.DataFrame(y_test[i], columns=class_names)
    for v2 in class_names:
        plt.figure(figsize=(20,3))
        plt.plot(y_pred2.loc[:n_show,v2], label='pred')
        #plt.plot(sum_pred, label='sum_pred', alpha=0.2)
        plt.plot(y_test2.loc[:n_show,v2], '.', label='actual')
        plt.legend(loc='best')
        plt.title("%s: %s"%(k,v2))

        axes = plt.gca()
        axes.set_ylim([-.1,1.1])
        plt.show()

In [None]:
# Sum over classes for the first row of the first head, actual vs. predicted;
# the prediction is a softmax output, so its sum is expected to be ~1.
y_test[0][0,:].sum(), y_pred[0][0,:].sum() # , y_pred[0]

## Temporal comparison

In [None]:
y_pred = model.predict(x_test)
# Index the per-head arrays by target name.
y_pred2 = dict(zip(label_keys, y_pred))
y_test2 = dict(zip(label_keys, y_test))

k2 = 'Function'
y_pred3 = y_pred2[k2]
y_test3 = y_test2[k2]

# For the first 15 validation rows: actual class vector on the left,
# predicted probabilities on the right, with a shared y-range.
panels = (
    (121, '%s. actual, argmax=%s', y_test3),
    (122, '%s. prediction, argmax=%s', y_pred3),
)
for i in range(15):
    plt.figure(figsize=(10,3))

    for subplot_code, title_format, rows in panels:
        plt.subplot(subplot_code)
        plt.bar(x=range(y_pred3.shape[1]), height=rows[i])
        plt.title(title_format%(i,np.argmax(rows[i])))
        axes = plt.gca()
        axes.set_ylim([-.1,1.1])

    # plt.title(y_test.index[i])

    plt.show()

In [None]:
# Actual class vector of validation row 6 for the selected target.
i=6
y_test3[i,:]

## Mock submission

In [None]:
# Dry run of the submission path on the first five non-holdout rows.
mock_rows = df_feat.loc[~df['is_holdout'], features].head()
x_ho = [mock_rows[name].values for name in vocab_size.index]
y_ho = model.predict(x_ho)
mock_matrix = np.concatenate(y_ho, axis=1)
df_submit = pd.DataFrame(mock_matrix, columns=prediction_names)
df_submit.shape

In [None]:
# Predicted probabilities for the mock rows.
df_submit.head()

In [None]:
# Raw target columns of the first rows of df, for comparison with the predictions.
df[target].head()

## Prepare submission

In [None]:
# Shapes of the combined frame and of its integer-encoded feature frame.
df.shape, df_feat.shape

In [None]:
# Predict every holdout row, feeding one 1-D array per feature in vocab_size order.
holdout_rows = df['is_holdout']
x_ho = [df_feat.loc[holdout_rows, name].values for name in vocab_size.index]
y_ho = model.predict(x_ho)

In [None]:
# Number of output heads and the shapes of the first two prediction arrays.
len(y_ho), y_ho[0].shape, y_ho[1].shape

In [None]:
# Join the per-head predictions side by side and label rows with the holdout ids.
holdout_ids = df_feat.index[df['is_holdout'].values]
holdout_matrix = np.concatenate(y_ho, axis=1)
df_submit = pd.DataFrame(holdout_matrix, columns=prediction_names, index=holdout_ids)
df_submit.shape

In [None]:
# First rows of the holdout predictions.
df_submit.head()

In [None]:
# Sorted predicted probabilities of 'Operating_Status__NO_LABEL' over the holdout rows
# (the commented line plots the same for 'Use__NO_LABEL').
# plt.plot(df_submit['Use__NO_LABEL'].sort_values().values)
plt.plot(df_submit['Operating_Status__NO_LABEL'].sort_values().values)
plt.show()

In [None]:
# First rows of the test frame.
test.head()

In [None]:
# Remove the 'Operating_Status__NO_LABEL' column from the submission, after
# checking that every predicted probability in it is below 0.0001.
# The column is deleted in place, so this cell can only run once per df_submit.
dropped_column = 'Operating_Status__NO_LABEL'
assert df_submit[dropped_column].lt(0.0001).all()
del df_submit[dropped_column]

In [None]:
import os

# Write the submission to a time-stamped file so earlier runs are not overwritten.
fn = 'data_out/submission_C3_%s.csv'%(time.strftime("%Y%m%d_%H%M%S"))
# Create the output directory if needed, so the write cannot fail on a
# missing folder after the model has already been trained.
os.makedirs(os.path.dirname(fn), exist_ok=True)
df_submit.to_csv(fn)

## RF