In [None]:
# Print the local compute devices that TensorFlow detects.
# Approach taken from: http://stackoverflow.com/questions/44544766/ddg#44547144
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
import numpy as np
import pandas as pd
import time
import seaborn as sns
from matplotlib import pyplot as plt
import keras

In [None]:
def pd_read_csv(fn:str):
    """Read a competition CSV and index it by its row-id column.

    The first, unnamed CSV column (which pandas names ``"Unnamed: 0"``) is used
    as the index. The strings ``NO_LABEL`` and ``(blank)`` are parsed as NaN.

    Parameters
    ----------
    fn : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents, indexed by the ``"Unnamed: 0"`` column.

    Raises
    ------
    AssertionError
        If the row-id column contains duplicated values.
    """
    id_col = "Unnamed: 0"
    frame = pd.read_csv(fn, na_values=['NO_LABEL', '(blank)'])

    # Row ids become the index, so they must be unique; report a few offenders on failure.
    dup_mask = frame[id_col].duplicated()
    assert not dup_mask.any(), "duplicate row ids in %s, e.g. %s" % (
        fn, frame.loc[dup_mask, id_col].head().tolist())

    return frame.set_index(id_col)

## Read data

In [None]:
# Load the labelled training rows and the unlabelled test rows, both indexed by row id.
train = pd_read_csv('data_in/TrainingData.csv')
test = pd_read_csv('data_in/TestData.csv')

In [None]:
train['Position_Type'].head()

In [None]:
# Input features: columns present in both files, minus the two excluded
# columns 'FTE' and 'Total'. Sorted so the order is reproducible.
features_raw = sorted((set(train.columns) & set(test.columns)) - {'FTE', 'Total'})
features_raw

In [None]:
# Target columns: everything the training file has that the test file lacks,
# in sorted (reproducible) order.
target = sorted(set(train.columns) - set(test.columns))
target

In [None]:
# Give the test frame every target column, filled with NaN, so that it has the
# same columns as the training frame when the two are concatenated.
for col in target:
    test[col] = np.nan

In [None]:
train.shape, test.shape

In [None]:
# Flag where each row came from, then stack the training rows on top of the
# test rows into a single frame (training rows first).
train['is_holdout'] = False
test ['is_holdout'] = True
df = pd.concat([train,test], axis=0)
df.shape

In [None]:
# Columns that are neither input features nor targets.
# Sorted like features_raw/target: iterating a set of strings has no stable
# order between kernel sessions, so an unsorted list would not be reproducible.
meta = sorted(set(df.columns) - set(features_raw) - set(target))
meta

In [None]:
df.shape, df[features_raw].shape, df[target].shape, df[meta].shape

## Text clustering

In [None]:
# NOTE(review): DictVectorizer is not used by any live code visible in this notebook.
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
import re
# Number of columns of the hashed token space (passed to FeatureHasher).
n_features = 2 ** 18

from sklearn.decomposition import TruncatedSVD
# NOTE(review): sparse_random_matrix is not called by any live code visible in this
# notebook; presumably this import fails on newer scikit-learn releases -- confirm.
from sklearn.random_projection import sparse_random_matrix
from sklearn.cluster import KMeans
import numpy as np

def tokens(doc):
    """Return a generator over the lower-cased word tokens of ``doc``.

    Tokens are maximal runs of word characters found by a simple regex. For a
    more principled approach, see CountVectorizer or TfidfVectorizer.
    """
    words = re.findall(r"\w+", doc)
    return (word.lower() for word in words)


# Cluster the distinct text values of every raw feature:
#   hashed bag of tokens -> 3-component truncated SVD -> k-means (up to 10 clusters).
hasher = FeatureHasher(n_features=n_features, input_type="string")

df_labels = {}   # feature name -> DataFrame with one row per distinct value: 'label' (cluster id), 'value'
df_centers = {}  # feature name -> SVD coordinates of the distinct values (not the k-means centres, despite the name)
for tokenize_column in features_raw: # 'Function_Description'
    print(tokenize_column)

    # One entry per distinct value; missing values are represented by ''.
    x_in = df[tokenize_column].fillna('').drop_duplicates()

    X = hasher.transform(tokens(d) for d in x_in)

    svd = TruncatedSVD(n_components=3, n_iter=17, random_state=42)
    svd.fit(X)

    # Diagnostics, if needed:
    # print(svd.explained_variance_ratio_, svd.explained_variance_ratio_.sum(), svd.singular_values_)

    Y = svd.transform(X)

    # - n_jobs is no longer passed: the parameter was removed from KMeans in scikit-learn 1.0.
    # - n_init=10 is spelled out because the library default changed in later releases.
    # - n_clusters is capped so a feature with fewer than 10 distinct values does not raise.
    kmeans = KMeans(n_clusters=min(10, len(x_in)), n_init=10, random_state=0).fit(Y)

    df_centers[tokenize_column] = Y
    df_labels[ tokenize_column] = pd.DataFrame({'label': kmeans.labels_, 'value': x_in})


len(df_labels), len(df_centers)

In [None]:
# replace the label for value="" with "-1"
# The empty string stands for "missing": give it the reserved cluster label -1
# in every per-feature label table.
for column_name, label_frame in df_labels.items():
    print(column_name)
    is_blank = label_frame['value'] == ''
    label_frame.loc[is_blank, 'label'] = -1

In [None]:
{x: len(df_labels[x]) for x in df_labels.keys()}

In [None]:
# Explicitly trigger a full garbage-collection pass.
import gc
gc.collect()

In [None]:
# append to originator features
print(df.shape)
features_clustered = [(x, '%s_clustered'%x) for x in features_raw]
for f1,f2 in features_clustered:
    print(f1,f2)

    df = df.merge(df_labels[f1], left_on=f1, right_on='value', how='left')
    del df['value']

    # now replace the numeric label above
    # with the 1st "original" label with this numeric label
    df_syn = df_labels[f1]
    df_syn = df_syn[~df_syn['label'].duplicated()]
    
    #print(df.merge(df_labels[f1], left_on=f1, right_on='value', how='left')[['Facility_or_Department', 'label']].head(n=20).tail(n=10))
    #print(df.merge(df_syn,      left_on=f1, right_on='value', how='left')[['Facility_or_Department', 'label']].head(n=20).tail(n=10))
    #import pdb
    #pdb.set_trace()
    
    #df = df.rename(columns={'label': f2})
    df = df.merge(df_syn, left_on='label', right_on='label', how='left')
    del df['label']
    df = df.rename(columns={'value': f2})
    
    
print(df.shape)

In [None]:
# Names of the clustered feature columns. Derived from features_raw rather than
# by unpacking the previous value of features_clustered, so that re-running
# this cell gives the same list instead of raising.
features_clustered = ['%s_clustered'%x for x in features_raw]

In [None]:
df.columns

## Read labels

In [None]:
import yaml
# Load the label definitions (mapping: target name -> list of its class names).
# safe_load only constructs plain data types; yaml.load without an explicit
# Loader is unsafe and is rejected by recent PyYAML releases. The with-block
# closes the file handle.
with open("labels.yml", 'r') as labels_file:
    labels = yaml.safe_load(labels_file)

In [None]:
# One prediction column per (target, class) pair, named "<target>__<class>",
# e.g. 'Function__Aides Compensation'. Kept in sorted order.
prediction_names = sorted(
    "%s__%s"%(k, v2)
    for k, v1 in labels.items()
    for v2 in v1
)

assert 'Function__Aides Compensation' in prediction_names
prediction_names[:5]

## Calculate counts

In [None]:
df[features_raw].loc[134338].fillna('')

In [None]:
df[features_clustered].loc[134338].fillna('')

In [None]:
def calc_joined(df, features_chosen):
    """Build one string key per row from the chosen feature columns.

    Missing values become '', spaces and double quotes are stripped from every
    value, and the values of a row are joined with '~' in the order given by
    ``features_chosen``. Returns a Series aligned with ``df``'s index.
    """
    def _squash(value):
        return value.replace(' ','').replace('"','')

    filled = df[features_chosen].fillna('')
    return filled.apply(lambda row: "~".join(_squash(cell) for cell in row), axis=1)
    
# Build the per-row key from the clustered features only
# (the raw-feature variant is kept below for reference).
# df['joined'] = calc_joined(df, features_raw)
df['joined'] = calc_joined(df, features_clustered)

In [None]:
# Could calculate stats on *all* df instead of just non-holdout
# but need this so that predictions.index = df_feat2.index later
# stats = df['joined'].value_counts()
# Number of non-holdout (training) rows per joined key.
stats = df['joined'][~df['is_holdout']].value_counts()

In [None]:
stats.sort_values(ascending=False).head(n=15).reset_index()

In [None]:
# Spot check: take the key at position 10 of stats and look its position up again.
k1=stats.index[10]
k1, stats.index.get_loc(k1),1 #df.loc[k1, 'joined']

In [None]:
# Cumulative row count covered by the joined keys, most frequent key first.
# The title shows "<total rows counted>, <number of distinct keys>".
# plt.bar(x=range(stats.shape[0]), height=stats.values)
# n_pts = 500000
# plt.bar(x=range(n_pts), height=stats.iloc[:n_pts])
plt.plot(stats.sort_values(ascending=False).values.cumsum())
plt.title("%s, %s"%(stats.values.sum(), stats.shape[0]))
plt.show()

In [None]:
# Most frequent joined key. Read it straight from the sorted index: the column
# name that reset_index() produces for the former index depends on the pandas
# version, so looking it up as 'index' is not portable.
k1 = stats.sort_values(ascending=False).index[0]

In [None]:
df[df['joined']==k1].shape, df.shape[0]

In [None]:
df[df['joined']==k1]['Function'].value_counts().reset_index()

In [None]:
import os

# Cache file for the probability table. The history of versions is kept for reference.

# The 1st version turned out to have indices that were not in stats.index (cause unknown).
# prob_fn = 'data_out/t1_probabilities_function.pkl'

# 2nd version: fixes the above.
# prob_fn = 'data_out/t1_probabilities_function_v2.pkl'

# 3rd version: features are sorted so that the result is reproducible.
# prob_fn = 'data_out/t1_probabilities_function_v3.pkl'

# 4th version: calculated for all targets, not only Function.
# prob_fn = 'data_out/t1_probabilities_function_v4.pkl'

# 5th version: raw + clustered features.
#prob_fn = 'data_out/t1_probabilities_function_v5.pkl'

# 6th version (current): only clustered features.
prob_fn = 'data_out/t1_probabilities_function_v6.pkl'

# Shows whether the cache file already exists.
os.path.exists(prob_fn)

In [None]:
# Build (or load from cache) the count table: one row per prediction name
# ("<target>__<class>"), one column per joined key, each cell holding the number
# of non-holdout rows with that key and that class.
if os.path.exists(prob_fn):
    # NOTE: unpickling can execute arbitrary code -- only load cache files that
    # this notebook wrote itself.
    print("Loading from file %s"%prob_fn)
    probabilities = pd.read_pickle(prob_fn)
else:
    probabilities = pd.DataFrame(
        np.zeros(shape=(len(prediction_names), stats.shape[0])),
        columns=stats.index,
        index=prediction_names
    )

    dependents = sorted(labels.keys())
    n = len(dependents)

    # Important: count over non-holdout rows only. The filter does not depend on
    # the target, so it is applied once instead of once per loop iteration.
    df_non_holdout = df[~df['is_holdout']]
    for i,k2 in enumerate(dependents):
        print("%s .. %s: %s / %s"%(time.ctime(), k2, i+1, n))
        # Per joined key: class counts of target k2, prefixed so that the row
        # labels match prediction_names; unstack(0) moves the keys to the columns.
        counts = df_non_holdout.groupby(['joined']).apply(lambda x: x[k2].value_counts().add_prefix('%s__'%k2)).unstack(0)
        probabilities.update(counts)
    del df_non_holdout

    # save
    probabilities.to_pickle(prob_fn)

In [None]:
probabilities.shape

In [None]:
probabilities.head().values.sum(axis=1)

In [None]:
# since NO_LABEL is replaced with NaN, need this
for dependent in labels.keys():
    target_sub = [x for x in probabilities.index if x.startswith("%s__"%dependent)]
    probabilities.loc['%s__NO_LABEL'%dependent, probabilities.loc[target_sub].sum(axis=0)==0]=1

In [None]:
probabilities.shape, probabilities[pd.isnull(probabilities).all(axis=1)].shape, probabilities.loc[:,probabilities.sum(axis=0)==0].shape

In [None]:
# The table must not contain missing values.
# `not` is used rather than `~`: bitwise inversion of a plain Python bool
# yields -1 or -2, both truthy, so `assert ~flag` could never fail for it.
assert not pd.isnull(probabilities).any().any(), "probabilities contains NaN"

In [None]:
probabilities.iloc[:,1]

In [None]:
# k1 = stats.index[1]
# probabilities[k1], (probabilities / probabilities.sum(axis=0))[k1]

# Turn the counts into probabilities: within each target's block of rows,
# divide every column (joined key) by its column sum.
for k1 in labels.keys():
    sub_index = [x for x in probabilities.index if x.startswith("%s__"%k1)]
    probabilities.loc[sub_index] = probabilities.loc[sub_index] / probabilities.loc[sub_index].sum(axis=0)
                                                                        
# Re-orient to one row per joined key, one column per prediction name, sorted by key.
# NOTE: this cell rebinds `probabilities` in transposed form, so running it a
# second time without rebuilding the table would operate on the wrong orientation.
probabilities = probabilities.transpose()
probabilities = probabilities.sort_index()
probabilities.shape

In [None]:
probabilities.iloc[1,:]

In [None]:
probabilities.shape, probabilities[pd.isnull(probabilities).all(axis=1)].shape, probabilities[probabilities.sum(axis=1)==0].shape

In [None]:
# The probability table must have exactly the joined keys counted in `stats`.
# A mismatch can occur when `probabilities` was loaded from a cache file built
# from different features. (The previous `len(...) >= 0` check could never
# fail, so both directions are now checked explicitly and reported.)
keys_missing_from_probabilities = set(stats.index) - set(probabilities.index)
keys_missing_from_stats = set(probabilities.index) - set(stats.index)
assert not keys_missing_from_probabilities, \
    "%s keys of stats are missing from probabilities" % len(keys_missing_from_probabilities)
assert not keys_missing_from_stats, \
    "%s keys of probabilities are missing from stats" % len(keys_missing_from_stats)

## Bring back original set of columns
These are the feature columns whose values were concatenated with `~` to form the `joined` key.

In [None]:
# Integer-encode the chosen feature columns for the embedding layers.
features_chosen = features_clustered
df_feat = df[features_chosen+['joined', 'is_holdout']].copy()

# pd.factorize codes missing values as -1; adding 1 moves every code into [0, N),
# which is the range a Keras Embedding accepts.
# The columns are replaced via df_feat[cols] = ... on purpose: assigning through
# df_feat.loc[:, cols] writes into the existing object-dtype columns on newer
# pandas, which leaves the codes as Python objects instead of an integer dtype.
df_feat[features_chosen] = df_feat[features_chosen].apply(lambda col: pd.factorize(col)[0] + 1, axis=0)

# True for every row whose joined key already occurred in an earlier row.
df_feat['duplicated'] = df_feat.duplicated(['joined'])

# One encoded feature row per joined key, taken from the non-holdout rows,
# indexed and sorted by the key.
df_feat2 = df_feat[~df_feat['is_holdout'] & ~df_feat['duplicated']].set_index('joined').sort_index()[features_chosen]

In [None]:
df_feat2.head(n=2)

In [None]:
df_feat.head(n=2)

In [None]:
df[features_chosen+['joined']].head(n=2)

In [None]:
# list(set(probabilities.index) - set(df_feat2.index))[:5]
# list(set(df_feat2.index) - set(probabilities.index))[:5]

# Debugging aid: show the holdout/duplicate flags of all rows carrying one
# hard-coded joined key.
k1 = '~RGNGOB~CONVERSIONCHARTERSCHOOLS~~~EMPLOYEEBENEFITS~~~~PurchasedServices~~~~'
# k1 in stats.index
# k1 in df_feat2.index
df_feat[['is_holdout','duplicated','joined']][df_feat['joined']==k1]
# df['joined'][df['joined']==k1].shape
# k1 in df['joined'].value_counts().index
# stats[k1]
# df['joined'][df['joined']==k1].shape
# k1 in df['joined'].value_counts().index
# stats[k1]

In [None]:
# The encoded feature table must have exactly the joined keys of the
# probability table, otherwise the index merge that follows would leave gaps.
# (The previous `len(...) >= 0` check could never fail, so both directions are
# now checked explicitly and reported.)
keys_missing_from_df_feat2 = set(probabilities.index) - set(df_feat2.index)
keys_missing_from_probabilities = set(df_feat2.index) - set(probabilities.index)
assert not keys_missing_from_df_feat2, \
    "%s keys of probabilities are missing from df_feat2" % len(keys_missing_from_df_feat2)
assert not keys_missing_from_probabilities, \
    "%s keys of df_feat2 are missing from probabilities" % len(keys_missing_from_probabilities)

In [None]:
# append "set_index" as recommended in pandas github issue 7632
# https://github.com/pandas-dev/pandas/issues/7632#issuecomment-316806258
# prob2 = probabilities.merge(vocabulary, left_index=True, right_on='joined', how='left').set_index('joined')
# Attach the encoded features to the probability table. Both frames are indexed
# by the joined key, so the merge is index-on-index and keeps that index.
prob2 = probabilities.merge(df_feat2, left_index=True, right_index=True, how='left') # .set_index('joined')
probabilities.shape, prob2.shape # , prob2.head(n=2), prob2.head(n=2).index # , train.loc[70455]

In [None]:
prob2[features_chosen].sort_index().head(n=2)

In [None]:
# Keep only the encoded feature columns (one row per joined key).
prob3 = prob2[features_chosen]

In [None]:
# Every joined key must have found its encoded features in the merge.
# `not` is used rather than `~`: bitwise inversion of a plain Python bool
# yields -1 or -2, both truthy, so `assert ~flag` could never fail for it.
assert not pd.isnull(prob3).any().any(), "prob3 contains NaN"

In [None]:
prob3.max().max(), prob3.min().min()

In [None]:
prob3.shape, probabilities.shape

In [None]:
# Embedding vocabulary size per feature: the largest code over ALL rows
# (holdout included) plus one, because the codes start at 0.
vocab_size = df_feat[features_chosen].max(axis=0) + 1 # +1 for the 0
# Side by side: vocabulary size over all rows vs. largest code among the non-holdout keys.
pd.DataFrame({'all': vocab_size, 'non-holdout': prob3.max(axis=0)})

In [None]:
# Sanity check on the normalisation: the largest value in the table is exactly 1.
assert probabilities.max().max()==1

In [None]:
df.shape, prob3.shape, probabilities.shape

## split train/test on non-holdout data

In [None]:
# Inputs: encoded features per joined key. Targets: class probabilities per joined key.
x = prob3
y = probabilities # .fillna(0)

# Hold out 33% of the joined keys for evaluation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)


## keras embedding + Dense/LSTM

In [None]:
# make array of features
x_train = [x_train[f].values for f in vocab_size.index]
x_test  = [x_test [f].values for f in vocab_size.index]

In [None]:
# Targets in sorted order; for each one, slice its "<target>__<class>" columns
# out of the probability frames. The result is one array per target.
label_keys = sorted(labels.keys())
y_train = [
    y_train[[name for name in prediction_names if name.startswith("%s__"%key)]].values
    for key in label_keys
]
y_test = [
    y_test[[name for name in prediction_names if name.startswith("%s__"%key)]].values
    for key in label_keys
]

In [None]:
len(y_train), y_train[0].shape, y_train[1].shape

In [None]:
# Model: one integer input and one embedding per feature, a shared dense trunk,
# and one dense branch with a softmax output per target.
from keras.layers import Embedding, Dense, Flatten, LSTM, Input, Concatenate, Add, Lambda
from keras.models import Sequential, Model
from keras import backend as K

# vocab_size = stats.shape[0]

# One scalar integer input per feature, named after the feature.
# inputs = [Input(shape=(prob3.shape[1],)) for f in vocab_size.index]
inputs = {f: Input(shape=(1,), name=f) for f in vocab_size.index}

# embeddings = [Embedding(vocab_size[f], embedding_dim, input_length=prob3.shape[1]) for f in vocab_size.index]

# Only the first branch runs: every feature gets the same embedding width.
# The else-branch (width scaled with the vocabulary size) is kept as an alternative.
if True:
    embedding_dim = 3 # 12 # 2 # 64 # FIXME
    embeddings = [Embedding(vocab_size[f], embedding_dim, input_length=1)(inputs[f]) for f in vocab_size.index]
else:
    embeddings = [Embedding(vocab_size[f], max(3, vocab_size[f]//15//10), input_length=1)(inputs[f]) for f in vocab_size.index]

# Each embedding takes an integer matrix of size (batch, input_length) whose
# values must be below that feature's vocab_size.
# Its output shape is (None, input_length, embedding_dim), where None is the batch dimension.

# Join all feature embeddings, then flatten to one vector per sample.
x1 = Concatenate()(embeddings)

x1 = Flatten()(x1)
#x1 = Dense( 500, activation='relu')(x1)
# x1 = Dense(  50, activation='relu')(x1)

# x1 = Dense(  50, activation='relu')(x1)
# x1 = Dense(  50, activation='relu')(x1)

# Shared trunk.
x1 = Dense( 50, activation='relu')(x1)
x1 = Dense( 50, activation='relu')(x1)

# Per-target branch: two dense layers on top of the shared trunk.
o1 = {dependent: Dense(50, activation = 'relu', name="%s_d1"%dependent)(x1) for dependent in label_keys}
o1 = {dependent: Dense(50, activation = 'relu', name="%s_d2"%dependent)(o1[dependent]) for dependent in label_keys}

# One softmax output per target, sized by the number of classes listed for it in `labels`.
outputs = [Dense(len(labels[dependent]), activation = 'softmax', name="%s_out"%dependent)(o1[dependent]) for dependent in label_keys]

# Model inputs as a list in the order of vocab_size.index (rebinds `inputs` from dict to list).
inputs = [inputs[f] for f in vocab_size.index]
model = Model(inputs=inputs, outputs=outputs)
model.summary()

In [None]:
x_train[-1].max()

In [None]:
# The same categorical cross-entropy loss and accuracy metric apply to every output.
model.compile('rmsprop', loss='categorical_crossentropy', metrics=['acc'])
#model.compile('rmsprop', loss=multi_multiclass_logloss, metrics=['acc'])

# from keras.utils import to_categorical
# y_binary = to_categorical(np.argmax(y_train.values, axis=1))
# y_binary = np.argmax(y_train.values, axis=1).squeeze()

# Train on the per-feature input arrays against the per-target probability arrays.
# validation_split=0.2 is requested together with shuffle=False.
model.fit(
    # pd.get_dummies(train3['x'].values),
    # # train2[list(set(train2.columns) - set(['joined']))],
    # train3['y'].values,
    x_train,
    y_train,
    batch_size=32*32, # 32, # FIXME
    epochs=300,
    verbose=2,
    validation_split = 0.2,
    shuffle=False
)

In [None]:
# Evaluate the compiled loss and metrics on the held-out split.
model.evaluate(x_test, y_test)

## argmax accuracy

## Spatial comparison

In [None]:
# For the first target (label_keys[0]): predicted vs. actual probability of each
# of its first classes (at most 10), over the first n_show test rows.
n_show = 1000

# Predict once: the predictions do not depend on the class being plotted.
y_pred = model.predict(x_test)
y_pred2 = y_pred[0]
y_test2 = y_test[0]

# Column names of the first target, in the same order as the columns of y_test[0].
first_target_names = [x for x in prediction_names if x.startswith("%s__"%label_keys[0])]

# Bound by the number of classes of this target, not by the width of the whole table.
for i in range(min(10, y_pred2.shape[1])):
    plt.figure(figsize=(20,3))
    plt.plot(y_pred2[:n_show,i], label='pred')
    #plt.plot(sum_pred, label='sum_pred', alpha=0.2)
    plt.plot(y_test2[:n_show,i], '.', label='actual')
    plt.legend(loc='best')
    plt.title(first_target_names[i])

    axes = plt.gca()
    axes.set_ylim([-.1,1.1])
    plt.show()

In [None]:
y_test[0][0,:].sum(), y_pred[0][0,:].sum() # , y_pred[0]

## temporal comparison

In [None]:
# For one target: actual vs. predicted class distribution of the first test rows,
# shown as side-by-side bar charts.
y_pred = model.predict(x_test)
# The model outputs are ordered like label_keys; key them by target name.
y_pred = dict(zip(label_keys, y_pred))

k2 = 'Function'
y_pred2 = y_pred[k2]
# Look the actual values up by the same target name, so that changing k2 cannot
# pair the predictions of one target with the actual values of another.
y_test2 = y_test[label_keys.index(k2)]

# At most 15 rows, fewer if the test split is smaller.
for i in range(min(15, y_pred2.shape[0])):
    plt.figure(figsize=(10,3))

    plt.subplot(121)
    plt.bar(x=range(y_pred2.shape[1]), height=y_test2[i])
    plt.title('%s. actual, argmax=%s'%(i,np.argmax(y_test2[i])))
    axes = plt.gca()
    axes.set_ylim([-.1,1.1])

    plt.subplot(122)
    plt.bar(x=range(y_pred2.shape[1]), height=y_pred2[i])
    plt.title('%s. prediction, argmax=%s'%(i,np.argmax(y_pred2[i])))
    axes = plt.gca()
    axes.set_ylim([-.1,1.1])

    # plt.title(y_test.index[i])

    plt.show()

In [None]:
i=6
y_test[0][i,:]

## Mock submission

In [None]:
# Dry run of the submission pipeline on the first few NON-holdout rows.
x_ho = df_feat[features_chosen][~df['is_holdout']].head()
# One input array per feature, in the order of vocab_size.index.
x_ho  = [x_ho [f].values for f in vocab_size.index]
y_ho = model.predict(x_ho)
# The per-target outputs are placed side by side and labelled with prediction_names.
df_submit = pd.DataFrame(np.concatenate(y_ho, axis=1), columns=prediction_names)
df_submit.shape

In [None]:
df_submit.head()

In [None]:
df[target].head()

## Prepare submission

In [None]:
df.shape, df_feat.shape

In [None]:
# Predict all holdout rows: one input array per feature, in the order of vocab_size.index.
x_ho = df_feat[features_chosen][ df['is_holdout']]
x_ho  = [x_ho [f].values for f in vocab_size.index]
y_ho = model.predict(x_ho)

In [None]:
len(y_ho), y_ho[0].shape, y_ho[1].shape

In [None]:
# Split into holdout and non-holdout rows, then show the holdout shape next to
# the shape of the holdout rows whose joined key also occurs among the non-holdout rows.
df_ho  = df[ df['is_holdout']]
df_nho = df[~df['is_holdout']]
df_ho.shape, df_ho[df_ho['joined'].isin(df_nho['joined'])].shape

In [None]:
df_ho[features_chosen].shape

In [None]:
df_ho[features_chosen].head(n=2)

In [None]:
df_ho['joined'].head(n=2)

In [None]:
df_nho[df_nho['Sub_Object_Description']=="Line Item that is paid with Campus' money"].shape

In [None]:
df['is_holdout'].index

In [None]:
# Assemble the submission: the per-target outputs side by side, one column per
# prediction name, one row per holdout row.
# NOTE(review): the row labels are taken from df_feat's index. df passes through
# column-based merges earlier in the notebook, and such merges normally return a
# fresh integer index -- confirm that these labels are the original test-file
# row ids that the submission expects.
df_submit = pd.DataFrame(np.concatenate(y_ho, axis=1), columns=prediction_names, index=df_feat[ df['is_holdout']].index)
df_submit.shape

In [None]:
df_submit.head()

In [None]:
# plt.plot(df_submit['Use__NO_LABEL'].sort_values().values)
plt.plot(df_submit['Operating_Status__NO_LABEL'].sort_values().values)
plt.show()

In [None]:
test.head()

In [None]:
# assert (df_submit['Operating_Status__NO_LABEL']==0).all()
assert (df_submit['Operating_Status__NO_LABEL']<.000001).all()
del df_submit['Operating_Status__NO_LABEL']

In [None]:
# Write the submission to a timestamped file so that earlier submissions are not overwritten.
fn = 'data_out/submission_B_%s.csv'%(time.strftime("%Y%m%d_%H%M%S"))
df_submit.to_csv(fn)