In [None]:
from tensorflow.python.client import device_lib

# Print the devices returned by TensorFlow's local device listing.
local_devices = device_lib.list_local_devices()
print(local_devices)

In [None]:
from keras import backend as K
# NOTE(review): `_get_available_gpus` is an underscore-prefixed helper reached through
# `K.tensorflow_backend`; it is not part of a public interface and may be missing in
# other Keras versions — confirm against the installed release.
K.tensorflow_backend._get_available_gpus()

In [None]:
import keras
import tensorflow as tf

# Build a TensorFlow session with explicit per-device-type counts and hand it to Keras.
device_limits = {'GPU': 1, 'CPU': 56}
config = tf.ConfigProto(device_count=device_limits)
sess = tf.Session(config=config)
keras.backend.set_session(sess)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import plotly
import plotly.graph_objs as go
import chart_studio.plotly as py
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import seaborn as sns
from sklearn.manifold import TSNE
import shap
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import euclidean, cosine
import umap
import random
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, homogeneity_score
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Read the dataset CSV into a DataFrame and preview its first rows.
dataset_path = '../datasets/yelp_dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Histogram of the `label` column (drawn before rows with missing values are dropped below).
df['label'].hist()

In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()

In [None]:
# Pull the `processed` column (inputs) and the `label` column (targets) out as arrays.
x, y = df['processed'].values, df['label'].values


# LSTM

In [None]:
# Targets for the LSTM section; same expression as the `y` assigned in the cell above.
y = df['label'].values

In [None]:
# Split the row positions into two disjoint halves: a training half and a pool/test half.
# A permutation visits every position exactly once. Drawing positions with
# `np.random.randint` samples WITH replacement, so the same row could appear in both
# halves (evaluation rows leaking into training) while other rows were never used.
n_rows = x.shape[0]
split_at = round(0.5 * n_rows)
# Local, seeded generator: the split is the same on every run and the global
# NumPy random state is left untouched.
indices = np.random.RandomState(42).permutation(n_rows)
train_indices = indices[:split_at]
pool_indices = indices[split_at:]
df_train = df.iloc[train_indices]['text'].values
df_test = df.iloc[pool_indices]['text'].values
y_train = y[train_indices]
y_test = y[pool_indices]

In [None]:
# Fit the tokenizer on the training texts only, then encode both splits with it.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df_train)

X_train, X_test = (tokenizer.texts_to_sequences(texts) for texts in (df_train, df_test))

# One slot per entry of the fitted word index, plus one for the reserved 0 index.
vocab_size = 1 + len(tokenizer.word_index)

# Show one raw training text followed by its integer encoding.
for sample in (df_train[2], X_train[2]):
    print(sample)

In [None]:
max_features = vocab_size
maxlen = 80  # every sequence is padded or cut to this many tokens
batch_size = 16

print('Loading data...')
x_train, x_test = X_train, X_test
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train, x_test = (sequence.pad_sequences(seqs, maxlen=maxlen) for seqs in (x_train, x_test))
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
# Embedding -> single LSTM layer -> one sigmoid unit for the binary label.
model = Sequential([
    Embedding(vocab_size, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# Other optimizers / optimizer settings can be swapped in here.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print('Train...')
# The test split doubles as the validation data reported after each epoch.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=batch_size)
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# `model.predict` returns the output of the final sigmoid unit for every test row.
# It replaces `predict_proba`, which newer Keras releases no longer provide on
# Sequential models; for this model both return the same values.
# The result is the complement (1 - output) of each prediction.
classwise_uncertain = 1 - model.predict(x_test)


In [None]:
import shap

# we use the first 100 training examples as our background dataset to integrate over
explainer = shap.DeepExplainer(model, x_train[:100])

# explain the first 100 test predictions
# explaining each prediction requires 2 * background dataset size runs
shap_values = explainer.shap_values(x_test[:100])

In [None]:
# Invert the tokenizer's word index: integer index -> word.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

In [None]:
# Each column of the padded input is a token position, so the feature names are one
# label per position (summary_plot needs exactly one name per input column).
position_names = ['position %d' % pos for pos in range(x_test.shape[1])]
# `shap_values` is indexed per model output elsewhere in this notebook
# (`shap_values[0][0]` in the force plot); plot the attributions of that single output.
shap.summary_plot(shap_values[0], x_test[:100], feature_names=position_names)

In [None]:
# Display the padded test sequences.
x_test

In [None]:
# transform the indexes to words
import numpy as np

# `reverse_word_map` already maps token index -> word, so it is used directly.
# Inverting it a second time yields word -> index, and every integer lookup below
# would then fall back to "NONE".
words = reverse_word_map
num2word = dict(words)
# Decode the first 10 padded test sequences; indexes without a word (e.g. the
# padding value 0) become "NONE".
x_test_words = np.stack([
    np.array([num2word.get(token_id, "NONE") for token_id in x_test[i]])
    for i in range(10)
])

# plot the explanation of the first prediction
# Note the model is "multi-output" because it is rank-2 but only has one column
shap.force_plot(explainer.expected_value[0], shap_values[0][0], x_test_words[0], matplotlib=True)

#### Clustering data
We are going to cluster the training data 
    - using Shapley values (Shapley space)
SHAP clustering works by clustering on the Shapley values of each instance. 
This means that you cluster instances by explanation similarity.

In [None]:
# Project `shap_values_pool` onto two principal components.
# NOTE(review): `shap_values_pool` is not assigned in any cell visible here — confirm
# where it is computed before relying on a fresh-kernel run.
pca = PCA(n_components=2)
principals_pca = pca.fit_transform(shap_values_pool)

In [None]:
# Two-dimensional UMAP embedding of `shap_values_pool`, with a fixed random_state.
umap_params = dict(
    n_components=2,
    random_state=100,
    metric='euclidean',
    n_neighbors=100,
    min_dist=1,
)
u = umap.UMAP(**umap_params)
principals_umap = u.fit_transform(shap_values_pool)

In [None]:
# Two-dimensional t-SNE embedding of `shap_values_pool`; the cluster scatter plot
# further down draws these coordinates. t-SNE is stochastic, so the seed is fixed
# (same value as the UMAP cell) to get the same layout on every run.
tsne = TSNE(n_components=2, perplexity=20, random_state=100)
principals = tsne.fit_transform(shap_values_pool)

In [None]:
# Number of k-means clusters; also sizes the per-cluster lists and the colour palette below.
n_clusters = 20

In [None]:
# `n_jobs` is intentionally not passed: scikit-learn deprecated it for KMeans in 0.23
# and removed it in 1.0, where supplying it raises a TypeError.
# A fixed random_state makes the cluster assignment (and every result derived from
# it below) repeatable across runs.
kmeans = KMeans(n_clusters=n_clusters, max_iter=600, random_state=100)
kmeans.fit(shap_values_pool)


In [None]:
# Homogeneity of the k-means cluster assignment measured against `y_pool`.
homogeneity = homogeneity_score(y_pool, kmeans.labels_)
print("Homogenity score", homogeneity)

We use cosine distance instead of Euclidean distance to measure the similarity between the documents.
As the size of a document increases, the number of common words tends to increase 
even if the documents talk about different topics, which distorts a magnitude-based (Euclidean) comparison. Cosine similarity helps overcome this fundamental flaw 
and finds the similarity irrespective of size.

https://www.machinelearningplus.com/nlp/cosine-similarity/

In [None]:
# Cosine similarity (1 - cosine distance) between every instance and the centre
# of the cluster it was assigned to.
similarity_to_center = [
    1 - cosine(vector, kmeans.cluster_centers_[label])
    for vector, label in zip(shap_values_pool, kmeans.labels_)
]

In [None]:
# For each cluster, record the position and similarity of the instance that is most
# similar to the cluster centre (on ties the earliest instance is kept).
centroid_match = [None for _ in range(n_clusters)]
centroid_indices = [None for _ in range(n_clusters)]
for position, label in enumerate(kmeans.labels_):
    best_so_far = centroid_match[label]
    candidate = similarity_to_center[position]
    if best_so_far is not None and not candidate > best_so_far:
        continue
    centroid_indices[label] = position
    centroid_match[label] = candidate
        

In [None]:
# One scatter trace per cluster plus one outlined marker for the cluster's
# representative instance, drawn in the t-SNE coordinates (`principals`).
data = []
collect = dict()
hues = np.linspace(0, 255, n_clusters)
color = ['hsl(' + str(hue) + ',80%' + ',50%)' for hue in hues]
for cluster_id in np.unique(kmeans.labels_):
    member_idx = np.where(kmeans.labels_ == cluster_id)
    member_text = df_pool[member_idx]
    representative = centroid_indices[cluster_id]
    cluster_color = color[cluster_id]
    member_xy = principals[member_idx]

    members_trace = go.Scatter(
        x=member_xy[:, 0],
        y=member_xy[:, 1],
        mode='markers',
        hovertext=member_text,
        text=member_text,
        textposition='middle right',
        marker=dict(color=cluster_color, size=10),
        name='cluster ' + str(cluster_id),
    )
    representative_trace = go.Scatter(
        x=[principals[representative, 0]],
        y=[principals[representative, 1]],
        visible=True,
        mode='markers',
        marker=dict(color=cluster_color, size=15, line=dict(color='black', width=5)),
        name='centroid cluster ' + str(cluster_id),
    )
    data.append(members_trace)
    data.append(representative_trace)
    # Keep the texts of each cluster for later inspection.
    collect[cluster_id] = df_pool[member_idx]

fig = go.Figure(data=data)
fig.show()
#plotly.offline.plot(fig)

In [None]:
#url=py.plot(fig, filename='bird', sharing='public')

In [None]:
# Per cluster: position of the instance with the highest cosine similarity to its cluster centre.
centroid_indices

In [None]:
# for item in collect[11]:
#     print(item)

#### Propagate label of centroid to entire cluster

In [None]:
# Give every pool instance the label of its cluster's representative instance,
# then write the propagated labels into a full-length label vector.
y_pool_new = np.zeros(y_pool.shape)
for cluster_id in np.unique(kmeans.labels_):
    in_cluster = kmeans.labels_ == cluster_id
    center_label = y_pool[centroid_indices[cluster_id]]
    print(center_label)
    y_pool_new[in_cluster] = center_label
    print(y_pool_new[in_cluster])
y_new = np.zeros(y.shape)
y_new[pool_indices] = y_pool_new

In [None]:
# How many pool instances received a propagated label equal to `y_pool`.
compare = (y_pool_new == y_pool)
np.where(compare)[0].shape

In [None]:
# How many pool instances received a propagated label different from `y_pool`.
compare = (y_pool_new != y_pool)
np.where(compare)[0].shape

In [None]:
# Training rows for the "centroids only" experiment: the original training rows plus
# one representative row per cluster.
# `centroid_indices` are positions inside the pool arrays (they index `y_pool` and
# `shap_values_pool`), so they are translated to dataset row positions through
# `pool_indices` before being used to index `x` and `y`.
# Appending them to `pool_indices` instead would put the entire pool into the
# training set, which is the separate "entire x_pool" experiment further down.
# Clusters that never received a representative (None) are skipped.
representative_rows = pool_indices[[pos for pos in centroid_indices if pos is not None]]
train_indices_new = np.append(train_indices, representative_rows)


In [None]:
# Fit a logistic-regression model on the rows selected by `train_indices_new`.
# NOTE(review): PENALTY, C and max_iter are not assigned in any cell visible here —
# confirm where they are defined.
x_train_new, y_train_new = x[train_indices_new], y[train_indices_new]
model1 = LogisticRegression(penalty=PENALTY, C=C, max_iter=max_iter)
model1.fit(x_train_new, y_train_new)

In [None]:
# (score on the rows model1 was fitted on, score on the test split)
model1.score(x_train_new, y_train_new), model1.score(x_test, y_test)

In [None]:
# Test-split predictions of model1 and their F1 score.
predictions1 = model1.predict(x_test)
f1_score(y_true=y_test, y_pred=predictions1)

In [None]:
# Accuracy of model1's test-split predictions.
accuracy_score(y_test, predictions1)

#### Add entire x_pool back to training instead of just centroids

In [None]:
# Training rows followed by every pool row, as one index array.
train_indices_full = np.append(train_indices, pool_indices)

In [None]:
# Fit a second logistic-regression model on the training rows plus the whole pool,
# then report (score on its training rows, score on the test split).
# NOTE(review): PENALTY, C and max_iter are not assigned in any cell visible here —
# confirm where they are defined.
x_train_full, y_train_full = x[train_indices_full], y[train_indices_full]
model2 = LogisticRegression(penalty=PENALTY, C=C, max_iter=max_iter)
model2.fit(x_train_full, y_train_full)
model2.score(x_train_full, y_train_full), model2.score(x_test, y_test)

In [None]:
# Test-split predictions of model2 and their F1 score.
predictions2 = model2.predict(x_test)
f1_score(y_true=y_test, y_pred=predictions2)

In [None]:
# Accuracy of model2's test-split predictions.
accuracy_score(y_test, predictions2)

#### Compare

In [None]:
# F1 score of each set of test predictions, printed side by side.
# NOTE(review): `predictions` (the baseline) is not assigned in any cell visible
# here — confirm where it is produced.
f1_report = [
    ("Model with 20% train ", predictions),
    ("Model with 20% train + propagate center ", predictions1),
    ("Model with 20% train + 60% pool ", predictions2),
]
for caption, predicted in f1_report:
    print(caption, f1_score(y_test, predicted))

In [None]:
# Accuracy of each set of test predictions, printed side by side.
# NOTE(review): `predictions` (the baseline) is not assigned in any cell visible
# here — confirm where it is produced.
accuracy_report = [
    ("Model with 20% train ", predictions),
    ("Model with 20% train + propagate center ", predictions1),
    ("Model with 20% train + 60% pool ", predictions2),
]
for caption, predicted in accuracy_report:
    print(caption, accuracy_score(y_test, predicted))