In [None]:
# Show the compute devices TensorFlow can see on this machine.
# Recipe: http://stackoverflow.com/questions/44544766/ddg#44547144
from tensorflow.python.client import device_lib

local_devices = device_lib.list_local_devices()
print(local_devices)

In [None]:
import numpy as np
import pandas as pd
import time
import seaborn as sns
from matplotlib import pyplot as plt
import keras

In [None]:
def pd_read_csv(fn:str):
    """Read a CSV file and index it by its first, unnamed column.

    The strings 'NO_LABEL' and '(blank)' are parsed as missing values.

    Parameters
    ----------
    fn : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents, indexed by the "Unnamed: 0" column.

    Raises
    ------
    AssertionError
        If the "Unnamed: 0" column contains duplicated values.
    """
    index_col = "Unnamed: 0"
    frame = pd.read_csv(fn, na_values=['NO_LABEL', '(blank)'])
    # The id column must be unique before it can serve as the index.
    has_duplicates = frame[index_col].duplicated().any()
    assert not has_duplicates
    return frame.set_index(index_col)

## Peek at train

In [None]:
# Load the training file (indexed by its first, unnamed CSV column) and show its dimensions.
train = pd_read_csv('data_in/TrainingData.csv')
train.shape

In [None]:
# Column names available in the training file.
train.columns

## Peek at test

In [None]:
# Load the test file with the same reader and show its dimensions.
test = pd_read_csv('data_in/TestData.csv')
test.shape

In [None]:
# Column names available in the test file.
test.columns

## Diff columns

In [None]:
#train.loc[134338]
# pd.isnull(train['Function']).any()
# pd.isnull(test['Function']).any()
# Pair of sets: (columns only in train, columns only in test).
set(train.columns) - set(test.columns), set(test.columns) - set(train.columns)

## Read labels

In [None]:
import yaml

# Read the target -> list-of-label-values mapping.
# safe_load only constructs plain Python objects (yaml.load without a Loader
# can execute arbitrary tags and is rejected by recent PyYAML releases), and
# the context manager guarantees the file handle is closed.
with open("labels.yml", 'r') as labels_file:
    labels = yaml.safe_load(labels_file)

In [None]:
# Flatten the labels mapping into "<target>__<label>" column names,
# e.g. 'Function': ['Aides Compensation', ...] -> 'Function__Aides Compensation'.
prediction_names = [
    "%s__%s"%(target, label_value)
    for target, label_values in labels.items()
    for label_value in label_values
]

# Sanity check on a known target/label pair, then fix a deterministic order.
assert 'Function__Aides Compensation' in prediction_names
prediction_names.sort()
prediction_names[:5]

## Calculate counts

In [None]:
# Input features: columns present in both train and test, excluding 'FTE' and 'Total'.
# Sorted so that the column order (and hence the joined key) is reproducible.
shared_columns = set(train.columns) & set(test.columns)
features = sorted(shared_columns - {'FTE', 'Total'})
features

In [None]:
# Spot-check one training row (index label 134338) across the feature columns,
# showing missing values as empty strings.
train[features].loc[134338].fillna('')

In [None]:
def calc_joined(df):
    """Build one '~'-separated key string per row from the feature columns.

    Reads the module-level ``features`` list to select columns. Missing values
    become empty strings, and spaces and double quotes are removed from every
    value before the values are joined.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features``; the values are
        expected to be strings or missing.

    Returns
    -------
    pandas.Series
        One joined key per row of ``df``, on the same index.
    """
    def _join_row(row):
        cleaned = (value.replace(' ','').replace('"','') for value in row)
        return "~".join(cleaned)

    return df[features].fillna('').apply(_join_row, axis=1)
    
# Attach the joined feature key to every training row.
# NOTE: this adds a 'joined' column to `train` in place.
train['joined'] = calc_joined(train)

In [None]:
# Number of training rows per distinct joined key.
stats = train['joined'].value_counts()

In [None]:
# The 15 most frequent joined keys.
stats.sort_values(ascending=False).head(n=15).reset_index()

In [None]:
# Re-order the counts by key (alphabetically) instead of by frequency.
stats = stats.sort_index()
stats.head()

In [None]:
# Look a key up by position, then confirm get_loc maps it back to that position.
k1=stats.index[10]
stats.index.get_loc(k1)

In [None]:
# plt.bar(x=range(stats.shape[0]), height=stats.values)
# n_pts = 500000
# plt.bar(x=range(n_pts), height=stats.iloc[:n_pts])
# Cumulative number of rows covered when keys are taken from most to least frequent.
plt.plot(stats.sort_values(ascending=False).values.cumsum())
# Title shows: total number of rows, number of distinct keys.
plt.title("%s, %s"%(stats.values.sum(), stats.shape[0]))
plt.show()

In [None]:
# Most frequent joined key.
# idxmax() reads the label straight from the index, so it does not depend on
# the column name reset_index() produces ('index' in older pandas, the index
# name in pandas >= 2.0, where the old ['index'] lookup raises KeyError).
k1 = stats.idxmax()

In [None]:
# Shape of the subset of training rows sharing key k1, next to the total row count.
train[train['joined']==k1].shape, train.shape[0]

In [None]:
# Distribution of the 'Function' target among the rows sharing key k1.
train[train['joined']==k1]['Function'].value_counts().reset_index()

In [None]:
import os

# Path of the pickle cache for the per-key label table built in the next cell.
# The file name is bumped whenever the way the table is computed changes:

# The 1st version turned out to have indices not in stats.index .. weird
# prob_fn = 'data_out/t1_probabilities_function.pkl'

# Fixing the above in the 2nd version
# prob_fn = 'data_out/t1_probabilities_function_v2.pkl'

# Set ".sort()" on features so that they're replicable
# prob_fn = 'data_out/t1_probabilities_function_v3.pkl'

# Calculate for all targets, not only Functions
prob_fn = 'data_out/t1_probabilities_function_v4.pkl'

# Shows whether a cached table already exists.
os.path.exists(prob_fn)

In [None]:
# Load the label-count table from the cache, or build it from `train` and cache it.
if os.path.exists(prob_fn):
    # NOTE(review): unpickling can execute arbitrary code; only load cache
    # files that were written by this notebook.
    print("Loading from file %s"%prob_fn)
    probabilities = pd.read_pickle(prob_fn)
else:
    # One row per prediction name, one column per distinct joined key, all zeros.
    probabilities = pd.DataFrame(
        np.zeros(shape=(len(prediction_names), stats.shape[0])),
        columns=stats.index,
        index=prediction_names
    )

    # Process the targets in a fixed (sorted) order.
    dependents = sorted(labels.keys())

    n = len(dependents)
    for i,k2 in enumerate(dependents):
        print("%s .. %s: %s / %s"%(time.ctime(), k2, i+1, n))
        # Per joined key, count the values of target k2 and prefix them with
        # "<target>__" so they line up with the rows of `probabilities`.
        # presumably unstack(0) yields prediction names x joined keys; verify.
        counts = train.groupby(['joined']).apply(lambda x: x[k2].value_counts().add_prefix('%s__'%k2)).unstack(0)
        # update() overwrites only the cells for which `counts` has a non-NaN value.
        probabilities.update(counts)

    # save -- make sure the target directory exists so the expensive result
    # is not lost to a missing-folder error.
    out_dir = os.path.dirname(prob_fn)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    probabilities.to_pickle(prob_fn)

In [None]:
# Dimensions of the count table.
probabilities.shape

In [None]:
# Row totals of the first five rows.
probabilities.head().values.sum(axis=1)

In [None]:
# pd_read_csv parses 'NO_LABEL' as NaN, so those rows contribute no counts above.
# For every target, find the columns whose counts for that target sum to zero
# and set the target's NO_LABEL row to 1 there.
for dependent in labels.keys():
    # Rows of the table that belong to this target.
    target_sub = [x for x in probabilities.index if x.startswith("%s__"%dependent)]
    # NOTE(review): if '<target>__NO_LABEL' is not already a row label, this .loc
    # assignment may add a new row -- confirm every target lists NO_LABEL.
    probabilities.loc['%s__NO_LABEL'%dependent, probabilities.loc[target_sub].sum(axis=0)==0]=1

In [None]:
# Shapes of: the whole table, the rows that are entirely NaN, the columns that sum to zero.
probabilities.shape, probabilities[pd.isnull(probabilities).all(axis=1)].shape, probabilities.loc[:,probabilities.sum(axis=0)==0].shape

In [None]:
# The table must not contain any missing value.
# `not` is used instead of `~`: bitwise inversion of a plain Python bool gives
# -1 or -2, both truthy, so the original check only worked when pandas
# happened to return a numpy bool.
assert not pd.isnull(probabilities).any().any()

In [None]:
# Inspect the second column of the count table (presumably one joined key).
probabilities.iloc[:,1]

In [None]:
# k1 = stats.index[1]
# probabilities[k1], (probabilities / probabilities.sum(axis=0))[k1]

# Turn counts into proportions separately for each target: within every column,
# the rows belonging to one target are divided by their column-wise sum.
for k1 in labels.keys():
    sub_index = [x for x in probabilities.index if x.startswith("%s__"%k1)]
    probabilities.loc[sub_index] = probabilities.loc[sub_index] / probabilities.loc[sub_index].sum(axis=0)
                                                                        
# NOTE(review): `probabilities` is rebound to its own transpose here, so this cell
# is not idempotent -- running it a second time transposes the table back.
probabilities = probabilities.transpose()
probabilities = probabilities.sort_index()
probabilities.shape

In [None]:
# Inspect the second row of the transposed table.
probabilities.iloc[1,:]

In [None]:
# Shapes of: the whole table, the rows that are entirely NaN, the rows that sum to zero.
probabilities.shape, probabilities[pd.isnull(probabilities).all(axis=1)].shape, probabilities[probabilities.sum(axis=1)==0].shape

In [None]:
# The table's row labels must be exactly the set of keys in `stats`.
# (The second and third asserts restate the first as two set differences.)
assert set(stats.index) == set(probabilities.index)
assert len(set(stats.index) - set(probabilities.index)) == 0
assert len(set(probabilities.index) - set(stats.index)) == 0

## Bring back original set of columns
These are the original feature columns that were concatenated with `~` to build the `joined` key.

In [None]:
# One row per distinct joined key (the first occurrence in `train`), holding the
# original feature values, indexed and sorted by the key.
vocabulary = train[~train.duplicated(['joined'])][features+['joined']].set_index('joined').sort_index()

In [None]:
vocabulary.head(n=2)

In [None]:
# `vocabulary` and `probabilities` must cover exactly the same set of keys.
# (The second and third asserts restate the first as two set differences.)
assert set(probabilities.index) == set(vocabulary.index)
assert len(set(probabilities.index) - set(vocabulary.index)) == 0
assert len(set(vocabulary.index) - set(probabilities.index)) == 0

In [None]:
# append "set_index" as recommended in pandas github issue 7632
# https://github.com/pandas-dev/pandas/issues/7632#issuecomment-316806258
# prob2 = probabilities.merge(vocabulary, left_index=True, right_on='joined', how='left').set_index('joined')

# Index-on-index left join: the feature columns are appended to the probability columns.
prob2 = probabilities.merge(vocabulary, left_index=True, right_index=True, how='left') # .set_index('joined')
# Compare the shapes before and after the merge.
probabilities.shape, prob2.shape # , prob2.head(n=2), prob2.head(n=2).index # , train.loc[70455]

In [None]:
prob2[features].sort_index().head(n=2)

In [None]:
# Integer-encode every feature column independently; the two prints bracket the
# step with wall-clock timestamps.
print(time.ctime())
# pd.factorize assigns 0..k-1 to the distinct values and -1 to missing values.
prob3 = prob2[features].apply(lambda x: pd.factorize(x)[0], axis=0)
prob3 = prob3 + 1 # +1 for the -1 (keras Embedding supports [0,N) )    
print(time.ctime())

In [None]:
# Overall largest and smallest code across all feature columns.
prob3.max().max(), prob3.min().min()

In [None]:
prob3.shape, probabilities.shape

In [None]:
# Largest code per feature column.
prob3.max(axis=0)

In [None]:
# The largest value in the normalised table must be exactly 1.
assert probabilities.max().max()==1

## split hold-out

In [None]:
# Inputs are the integer-encoded features; targets are the per-key probabilities.
x = prob3
y = probabilities # .fillna(0)

from sklearn.model_selection import train_test_split
# Hold out 33% of the rows; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)


## keras embedding + Dense/LSTM

In [None]:
# Per-feature number of embedding rows: largest code plus one.
vocab_size = prob3.max(axis=0) + 1 # +1 for the 0

# make array of features
# NOTE(review): x_train / x_test are rebound from DataFrames to lists of arrays
# (one array per feature), so this cell cannot be re-run without re-running the split.
x_train = [x_train[f].values for f in vocab_size.index]
x_test  = [x_test [f].values for f in vocab_size.index]

In [None]:
# Targets in sorted order; this order defines the order of the model outputs below.
label_keys = labels.keys()
label_keys = list(label_keys)
label_keys.sort()
# NOTE(review): y_train / y_test are likewise rebound from DataFrames to lists of
# arrays, one per target, each holding that target's "<target>__*" columns.
y_train = [y_train[[x for x in prediction_names if x.startswith("%s__"%f)]].values for f in label_keys]
y_test  = [y_test [[x for x in prediction_names if x.startswith("%s__"%f)]].values for f in label_keys]

In [None]:
# Number of targets, and the array shapes of the first two.
len(y_train), y_train[0].shape, y_train[1].shape

In [None]:
#vocab_size.index, prob3.columns
vocab_size

In [None]:
# Total number of prediction columns across all targets.
probabilities.shape[1]

In [None]:
from keras.layers import Embedding, Dense, Flatten, LSTM, Input, Concatenate, Add, Lambda
from keras.models import Sequential, Model
from keras import backend as K

# vocab_size = stats.shape[0]

# inputs = [Input(shape=(prob3.shape[1],)) for f in vocab_size.index]
# One single-integer input per feature column, named after the column.
inputs = {f: Input(shape=(1,), name=f) for f in vocab_size.index}

# embeddings = [Embedding(vocab_size[f], embedding_dim, input_length=prob3.shape[1]) for f in vocab_size.index]

# NOTE(review): the condition is the constant True, so the else branch
# (embedding width scaled by vocabulary size) never runs.
if True:
    embedding_dim = 3 # 12 # 2 # 64 # FIXME
    # Each feature gets its own Embedding with vocab_size[f] rows of width embedding_dim.
    embeddings = [Embedding(vocab_size[f], embedding_dim, input_length=1)(inputs[f]) for f in vocab_size.index]
else:
    embeddings = [Embedding(vocab_size[f], max(3, vocab_size[f]//15//10), input_length=1)(inputs[f]) for f in vocab_size.index]

# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, input_length, embedding_dim), where None is the batch dimension.

# Join the per-feature embeddings and flatten them into one vector per sample.
x1 = Concatenate()(embeddings)

x1 = Flatten()(x1)
#x1 = Dense( 500, activation='relu')(x1)
# x1 = Dense(  50, activation='relu')(x1)

# x1 = Dense(  50, activation='relu')(x1)
# x1 = Dense(  50, activation='relu')(x1)

# Trunk shared by all targets: two 50-unit ReLU layers.
x1 = Dense( 50, activation='relu')(x1)
x1 = Dense( 50, activation='relu')(x1)

# Per-target head: two more 50-unit ReLU layers on top of the shared trunk.
o1 = {dependent: Dense(50, activation = 'relu', name="%s_d1"%dependent)(x1) for dependent in label_keys}
o1 = {dependent: Dense(50, activation = 'relu', name="%s_d2"%dependent)(o1[dependent]) for dependent in label_keys}

# One softmax output per target, sized by that target's number of label values;
# outputs follow the order of label_keys.
outputs = [Dense(len(labels[dependent]), activation = 'softmax', name="%s_out"%dependent)(o1[dependent]) for dependent in label_keys]

# Rebind `inputs` from a dict to a list ordered like vocab_size.index.
inputs = [inputs[f] for f in vocab_size.index]
model = Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
def multi_multiclass_logloss(y_true, y_pred):
    """Log-loss style Keras loss for one model output.

    The predictions are clipped to [epsilon, 1 - epsilon] before taking the
    log. The transposed targets are then combined with the log-predictions via
    ``K.batch_dot`` (axes=[0, 1]), and the negated mean over the last axis is
    returned.

    Parameters
    ----------
    y_true : tensor
        Target values; assumes shape (batch, classes) -- TODO confirm.
    y_pred : tensor
        Predicted values with the same shape as ``y_true``.

    Returns
    -------
    tensor
        The loss tensor passed back to Keras.
    """
    # when starting to use Function and others, use this
    # return K.mean(-1*K.mean(K.batch_dot(y_true, K.log(y_pred)), axis=-1), axis=-1)
    eps = K.epsilon()
    # Keep predictions away from 0 and 1 so that the log stays finite.
    clipped = K.clip(y_pred, eps, 1 - eps)
    log_pred = K.log(clipped)
    cross = K.batch_dot(K.transpose(y_true), log_pred, axes=[0,1])
    return -1*K.mean(cross, axis=-1)

# model.compile('rmsprop', loss='categorical_crossentropy', metrics=['acc'])
# Compile with the custom loss above and the rmsprop optimizer; accuracy is tracked as a metric.
model.compile('rmsprop', loss=multi_multiclass_logloss, metrics=['acc'])

In [None]:
# test K.batch_dot
# Need to specify axes due to issue
# https://github.com/keras-team/keras/issues/9847
# Smoke-test the loss on two all-ones (32, 37) tensors and show the result's shape and value.
x_batch = K.ones(shape=(32, 37))
y_batch = K.ones(shape=(32, 37))
# xy_batch_dot = K.batch_dot(K.transpose(x_batch), y_batch, axes=[0, 1])
xy_batch_dot = multi_multiclass_logloss(x_batch, y_batch)
K.int_shape(xy_batch_dot), K.eval(xy_batch_dot)

In [None]:
# from keras.utils import to_categorical
# y_binary = to_categorical(np.argmax(y_train.values, axis=1))
# y_binary = np.argmax(y_train.values, axis=1).squeeze()

# Train on the per-feature input arrays and per-target output arrays.
# Batches of 1024 samples, 300 epochs, 20% of the training data used for
# validation, no shuffling, one log line per epoch.
model.fit(
    # pd.get_dummies(train3['x'].values),
    # # train2[list(set(train2.columns) - set(['joined']))],
    # train3['y'].values,
    x_train,
    y_train,
    batch_size=32*32, # 32, # FIXME
    epochs=300,
    verbose=2,
    validation_split = 0.2,
    shuffle=False
)

In [None]:
# Loss and metrics on the hold-out split.
model.evaluate(x_test, y_test)

## argmax accuracy

In [None]:
# x_test and y_test are lists of NumPy arrays at this point (one array per
# input feature / per target), so they are passed to the model as-is and
# compared output by output; DataFrame accessors such as .iloc or column
# lookup are no longer available on them.
y_pred = model.predict(x_test)

# For each target, flag the rows whose arg-max predicted label equals the
# arg-max actual label.
per_target_hits = [
    np.argmax(pred, axis=1) == np.argmax(actual, axis=1)
    for pred, actual in zip(y_pred, y_test)
]

# A row scores 1 only when the arg-max label is right for every target.
my_score = np.all(per_target_hits, axis=0).astype('uint8')

In [None]:
# (number of correct rows, number of rows, accuracy in whole percent).
# The array's own .sum() accumulates in a wide integer type; the built-in
# sum() adds the uint8 elements one by one and can wrap around at 255 under
# NumPy 2 scalar promotion rules.
n_correct = int(my_score.sum())
n_correct, my_score.shape[0], n_correct*100 // my_score.shape[0]

## Spatial comparison

In [None]:
# Predict once up front: the predictions do not change from one plot to the next.
y_pred = model.predict(x_test)
# Only the first target (label_keys[0]) is plotted.
y_pred2 = y_pred[0]
y_test2 = y_test[0]
# Column names of that target, selected the same way y_test[0] was assembled.
target_columns = [x for x in prediction_names if x.startswith("%s__"%label_keys[0])]
n_show = 1000

# One figure per class, limited to the first 10 classes of the plotted target.
for i in range(min(10, y_pred2.shape[1])):
    plt.figure(figsize=(20,3))
    plt.plot(y_pred2[:n_show,i], label='pred')
    #plt.plot(sum_pred, label='sum_pred', alpha=0.2)
    plt.plot(y_test2[:n_show,i], '.', label='actual')
    plt.legend(loc='best')
    plt.title(target_columns[i])
    
    # Fixed y-range with a small margin around [0, 1] so that figures are comparable.
    axes = plt.gca()
    axes.set_ylim([-.1,1.1])
    plt.show()

In [None]:
# Actual and predicted values of the first hold-out row for the first target;
# both are expected to sum to roughly 1. y_test and y_pred are lists of arrays
# (one per target), so they are indexed by target first and then by row.
y_test[0][0].sum(), y_pred[0][0].sum() # , y_pred[0]

## temporal comparison

In [None]:
# Predictions keyed by target name (model outputs follow the order of label_keys).
y_pred = model.predict(x_test)
y_pred = {label_keys[i]: y_pred[i] for i in range(len(label_keys))}

k2 = 'Function'
y_pred2 = y_pred[k2]
# Pick the actual values of the same target by name rather than by a
# hard-coded position, so that changing k2 keeps both panels in sync.
y_test2 = y_test[label_keys.index(k2)]

# Side-by-side bar charts (actual vs predicted) for the first rows of the
# hold-out split; never more rows than are available.
for i in range(min(15, y_pred2.shape[0])):
    plt.figure(figsize=(10,3))
    
    plt.subplot(121)
    plt.bar(x=range(y_pred2.shape[1]), height=y_test2[i])
    plt.title('%s. actual, argmax=%s'%(i,np.argmax(y_test2[i])))
    axes = plt.gca()
    axes.set_ylim([-.1,1.1])
    
    plt.subplot(122)
    plt.bar(x=range(y_pred2.shape[1]), height=y_pred2[i])
    plt.title('%s. prediction, argmax=%s'%(i,np.argmax(y_pred2[i])))
    axes = plt.gca()
    axes.set_ylim([-.1,1.1])
    
    # plt.title(y_test.index[i])
    
    plt.show()

In [None]:
# Raw actual values of one hold-out row for the target plotted above.
# y_test is a list of arrays, so the row is read from the selected target's
# array (y_test2) instead of through a DataFrame accessor.
i=6
y_test2[i]

## Prepare submission

In [None]:
# Attach the joined feature key to every test row; the shape is printed before
# and after to show the added column.
print(test.shape)
test['joined'] = calc_joined(test)
print(test.shape)

In [None]:
# Shape of the test set, next to the subset whose joined key also occurs in train.
test.shape, test[test['joined'].isin(train['joined'])].shape

In [None]:
test[features].shape

In [None]:
test[features].head(n=2)

In [None]:
test['joined'].head(n=2)

In [None]:
# Shape of the training rows carrying this exact Sub_Object_Description text.
train[train['Sub_Object_Description']=="Line Item that is paid with Campus' money"].shape

## RF