In [1]:
import argparse
import os
import random as rn

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use(['seaborn-white', 'seaborn-paper'])
sns.set_context("paper", font_scale=1.3)
import pandas as pd
import numpy as np
import sys
import tensorflow as tf
import umap
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from sklearn import metrics
from sklearn import mixture
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from scipy.optimize import linear_sum_assignment as linear_assignment
from time import time
from sklearn.model_selection import train_test_split
from transformers import (
   AutoConfig,
   AutoTokenizer,
   TFAutoModelForSequenceClassification,
   AdamW,
   glue_convert_examples_to_features
)

In [2]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(sys.argv[2])
os.environ['PYTHONHASHSEED'] = '0'
os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'

rn.seed(0)
tf.random.set_seed(0)
np.random.seed(0)

#session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1,)
#sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
#K.set_session(sess)
np.set_printoptions(threshold=sys.maxsize)

matplotlib.use('agg')

In [3]:
def eval_other_methods(x, y, names=None):
    gmm = mixture.GaussianMixture(
        covariance_type='full',
        n_components=n_clusters,
        random_state=0)
    gmm.fit(x)
    y_pred_prob = gmm.predict_proba(x)
    y_pred = y_pred_prob.argmax(1)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print("GMM clustering on raw data")
    print('=' * 80)
    print(acc)
    print(nmi)
    print(ari)
    print('=' * 80)

    y_pred = KMeans(
        n_clusters=n_clusters,
        random_state=0).fit_predict(x)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print("K-Means clustering on raw data")
    print('=' * 80)
    print(acc)
    print(nmi)
    print(ari)
    print('=' * 80)

    sc = SpectralClustering(
        n_clusters=n_clusters,
        random_state=0,
        affinity='nearest_neighbors')
    y_pred = sc.fit_predict(x)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print("Spectral Clustering on raw data")
    print('=' * 80)
    print(acc)
    print(nmi)
    print(ari)
    print('=' * 80)

    if manifold_learner == 'UMAP':
        md = float(umap_min_dist)
        hle = umap.UMAP(
            random_state=0,
            metric=umap_metric,
            n_components=umap_dim,
            n_neighbors=umap_neighbors,
            min_dist=md).fit_transform(x)
    elif manifold_learner == 'LLE':
        from sklearn.manifold import LocallyLinearEmbedding
        hle = LocallyLinearEmbedding(
            n_components=umap_dim,
            n_neighbors=umap_neighbors).fit_transform(x)
    elif manifold_learner == 'tSNE':
        method = 'exact'
        hle = TSNE(
            n_components=umap_dim,
            n_jobs=16,
            random_state=0,
            verbose=0).fit_transform(x)
    elif manifold_learner == 'isomap':
        hle = Isomap(
            n_components=umap_dim,
            n_neighbors=5,
        ).fit_transform(x)

    gmm = mixture.GaussianMixture(
        covariance_type='full',
        n_components=n_clusters,
        random_state=0)
    gmm.fit(hle)
    y_pred_prob = gmm.predict_proba(hle)
    y_pred = y_pred_prob.argmax(1)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print("GMM clustering on " +
          str(manifold_learner) + " embedding")
    print('=' * 80)
    print(acc)
    print(nmi)
    print(ari)
    print('=' * 80)

    if visualize:
        plot(hle, y, 'UMAP', names)
        y_pred_viz, _, _ = best_cluster_fit(y, y_pred)
        plot(hle, y_pred_viz, 'UMAP-predicted', names)

        return

    y_pred = KMeans(
        n_clusters=n_clusters,
        random_state=0).fit_predict(hle)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(dataset + " | K-Means " +
          str(manifold_learner) + " embedding")
    print('=' * 80)
    print(acc)
    print(nmi)
    print(ari)
    print('=' * 80)

    sc = SpectralClustering(
        n_clusters=n_clusters,
        random_state=0,
        affinity='nearest_neighbors')
    y_pred = sc.fit_predict(hle)
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(dataset + " | Spectral Clustering on " +
          str(manifold_learner) + " embedding")
    print('=' * 80)
    print(acc)
    print(nmi)
    print(ari)
    print('=' * 80)

In [4]:
def cluster_manifold_in_embedding(hl, y, label_names=None):
    # find manifold on autoencoded embedding
    if manifold_learner == 'UMAP':
        md = float(umap_min_dist)
        hle = umap.UMAP(
            random_state=0,
            metric=umap_metric,
            n_components=umap_dim,
            n_neighbors=umap_neighbors,
            min_dist=md).fit_transform(hl)
    elif manifold_learner == 'LLE':
        hle = LocallyLinearEmbedding(
            n_components=umap_dim,
            n_neighbors=umap_neighbors).fit_transform(hl)
    elif manifold_learner == 'tSNE':
        hle = TSNE(
            n_components=umap_dim,
            n_jobs=16,
            random_state=0,
            verbose=0).fit_transform(hl)
    elif manifold_learner == 'isomap':
        hle = Isomap(
            n_components=umap_dim,
            n_neighbors=5,
        ).fit_transform(hl)

    # clustering on new manifold of autoencoded embedding
    if cluster == 'GMM':
        gmm = mixture.GaussianMixture(
            covariance_type='full',
            n_components=n_clusters,
            random_state=0)
        gmm.fit(hle)
        y_pred_prob = gmm.predict_proba(hle)
        y_pred = y_pred_prob.argmax(1)
    elif cluster == 'KM':
        km = KMeans(
            init='k-means++',
            n_clusters=n_clusters,
            random_state=0,
            n_init=20)
        y_pred = km.fit_predict(hle)
    elif cluster == 'SC':
        sc = SpectralClustering(
            n_clusters=n_clusters,
            random_state=0,
            affinity='nearest_neighbors')
        y_pred = sc.fit_predict(hle)

    y_pred = np.asarray(y_pred)
    # y_pred = y_pred.reshape(len(y_pred), )
    y = np.asarray(y)
    # y = y.reshape(len(y), )
    acc = np.round(cluster_acc(y, y_pred), 5)
    nmi = np.round(metrics.normalized_mutual_info_score(y, y_pred), 5)
    ari = np.round(metrics.adjusted_rand_score(y, y_pred), 5)
    print(manifold_learner +
          " on autoencoded embedding with " + cluster + " - N2D")
    print('=' * 80)
    print(acc)
    print(nmi)
    print(ari)
    print('=' * 80)

    if visualize:
        plot(hle, y, 'n2d', label_names)
        y_pred_viz, _, _ = best_cluster_fit(y, y_pred)
        plot(hle, y_pred_viz, 'n2d-predicted', label_names)

    return y_pred, acc, nmi, ari

In [5]:
def best_cluster_fit(y_true, y_pred):
    y_true = y_true.astype(np.int64)
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1

    ind = linear_assignment(w.max() - w)
    best_fit = []
    for i in range(y_pred.size):
        for j in range(len(ind)):
            if ind[j][0] == y_pred[i]:
                best_fit.append(ind[j][1])
    return best_fit, ind, w


In [6]:
def cluster_acc(y_true, y_pred):
    _, ind, w = best_cluster_fit(y_true, y_pred)
    return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size

In [7]:
def plot(x, y, plot_id, names=None):
    viz_df = pd.DataFrame(data=x[:5000])
    viz_df['Label'] = y[:5000]
    if names is not None:
        viz_df['Label'] = viz_df['Label'].map(names)

    #viz_df.to_csv(args.save_dir + '/' + args.dataset + '.csv')
    plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=0, y=1, hue='Label', legend='full', hue_order=sorted(viz_df['Label'].unique()),
                    palette=sns.color_palette("hls", n_colors=n_clusters),
                    alpha=.5,
                    data=viz_df)
    l = plt.legend(bbox_to_anchor=(-.1, 1.00, 1.1, .5), loc="lower left", markerfirst=True,
                   mode="expand", borderaxespad=0, ncol=n_clusters + 1, handletextpad=0.01, )

    l.texts[0].set_text("")
    plt.ylabel("")
    plt.xlabel("")
    plt.tight_layout()
    #plt.savefig(args.save_dir + '/' + args.dataset +
                #'-' + plot_id + '.png', dpi=300)
    plt.clf()

In [8]:
def autoencoder(dims, act='relu'):
    n_stacks = len(dims) - 1
    x = Input(shape=(dims[0],), name='input')
    h = x
    for i in range(n_stacks - 1):
        h = Dense(dims[i + 1], activation=act, name='encoder_%d' % i)(h)
    h = Dense(dims[-1], name='encoder_%d' % (n_stacks - 1))(h)
    for i in range(n_stacks - 1, 0, -1):
        h = Dense(dims[i], activation=act, name='decoder_%d' % i)(h)
    h = Dense(dims[0], name='decoder_0')(h)

    return Model(inputs=x, outputs=h)

In [9]:
# Choose model
# @markdown >The default model is <i><b>COVID-Twitter-BERT</b></i>. You can however choose <i><b>BERT Base</i></b> or <i><b>BERT Large</i></b> to compare these models to the <i><b>COVID-Twitter-BERT</i></b>. All these three models will be initiated with a random classification layer. If you go directly to the Predict-cell after having compiled the model, you will see that it still runs the predition. However the output will be random. The training steps below will finetune this for the specific task. <br /><br /> 
model_name = 'digitalepidemiologylab/covid-twitter-bert-v2' #@param ["digitalepidemiologylab/covid-twitter-bert", "bert-large-uncased", "bert-base-uncased"]

# Initialise tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# Training Paremeters
max_seq_length = 128 #@param {type: "integer"}
train_batch_size =  8 #@param {type: "integer"} 
eval_batch_size = 8 #@param {type: "integer"}
num_labels = 15

# Loading the Training dataset
t_train = pd.read_csv("..\\raw\\cluster_input.csv", names=('text', 'label', 'topic', 'topic_label'))
#t_train = t_train.sample(n=30000)

#t_train["text"] = t_train["text"].apply(remove_contractions)
#t_train["text"] = t_train["text"].apply(clean_text)
#t_train.drop_duplicates(subset=["text"], inplace=True)
#t_train.dropna(inplace=True)

#X_train = t_train["text"][:len(df)*0.64]
#X_val = t_train["text"][len(df)*0.64:len(df)*0.8]
#X_test = t_train["text"][len(df)*0.8:]

X_train, X_test, y_train, y_test = train_test_split(t_train['text'], t_train['topic_label'], test_size=0.2, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=1) # 0.25 x 0.8 = 0.2

train_text = []
#val_text = []
test_text = []

train_label = []
#val_label = []
test_label = []

for row in X_train:
    train_text.append(str(row))
    
#for row in X_val:
    #val_text.append(str(row))
    
for row in X_test:
    test_text.append(str(row))
    
for row in y_train:
    train_label.append(int(row))

#for row in y_val:
    #val_label.append(int(row))
    
for row in y_test:
    test_label.append(int(row))

#train_text = tokenizer(train_text, max_length=max_seq_length, truncation=True, padding=True)
#val_text = tokenizer(val_text, max_length=max_seq_length, truncation=True, padding=True)
#test_text = tokenizer(test_text, max_length=max_seq_length, truncation=True, padding=True)

train = np.zeros([np.size(train_text), max_seq_length], dtype=int)
#val = np.zeros([np.size(val_text), max_seq_length], dtype=int)
test = np.zeros([np.size(test_text), max_seq_length], dtype=int)

for i in range(len(train_text)):
    tokens = np.asarray(tokenizer.encode(train_text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        train[i][j] = tokens[j]

'''
for i in range(len(val_text)):
    tokens = np.asarray(tokenizer.encode(val_text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        val[i][j] = tokens[j]
'''

for i in range(len(test_text)):
    tokens = np.asarray(tokenizer.encode(test_text[i], max_length=max_seq_length, truncation=True, padding=True))
    for j in range(np.size(tokens)):
        test[i][j] = tokens[j]


#a = tokenizer.encode(train_text[0], max_length=max_seq_length, truncation=True, padding=True)
#print(np.shape(a))


#train_text = np.asarray(train_text)
#val_text = np.asarray(val_text)
#test_text = np.asarray(test_text)

train_label = np.asarray(train_label)
#val_label = np.asarray(val_label)
test_label = np.array(test_label)

label_mapping = {0:'Masks', 1:'Vaccine', 2:'Symptoms', 3:'Quarantine', 4:'Lockdown', 5:'Education', 6:'Treatment',
                 7:'Science', 8:'Statistics', 9:'Health', 10:'Work', 11:'Legislation', 12:'Politics', 13:'Travel', 14:'Testing'}

label_names = ['Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment',
                 'Science', 'Statistics', 'Health', 'Work', 'Legislation', 'Politics', 'Travel', 'Testing']

In [10]:
global umap_dim
global umap_neighbors
global umap_min_dist
global umap_metric
global cluster
global visualize
global manifold_learner
global n_clusters

umap_dim = 2
umap_neighbors = 10
umap_min_dist = "0.00"
umap_metric = 'euclidean'
cluster = 'GMM' # options = GMM, KM and SC
visualize = True
manifold_learner = 'UMAP' # options = UMAP, LLE, tSNE, isomap
n_clusters = 15

In [15]:
optimizer = 'adam'

#shape = [train.shape[-1], 500, 500, 2000, n_clusters]
#autoencoder = autoencoder(shape)

#hidden = autoencoder.get_layer(name='encoder_%d' % (len(shape) - 2)).output
#encoder = Model(inputs=autoencoder.input, outputs=hidden)

#pretrain_time = time()

# Pretrain autoencoders before clustering
#autoencoder.compile(loss='mse', optimizer=optimizer)
#autoencoder.fit(train, train, batch_size=train_batch_size, epochs=10, verbose=0)

#pretrain_time = time() - pretrain_time

#hl = encoder.predict(train)
eval_other_methods(train, train_label, label_names)
#clusters, t_acc, t_nmi, t_ari = cluster_manifold_in_embedding(hl, y, label_names)

ValueError: too many values to unpack (expected 2)