In [1]:
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.cluster import KMeans
from visualize import *
import torch

In [2]:
path = 'trained_models/classifier/viz/bestgooglemodel.pt'
data = {}
trained_model = torch.load(path, map_location = lambda storage, loc: storage)
data['words'], data['vals'], data['labels'], data['groundtruth'] = extract(path)

In [3]:
words = [list(filter(lambda x: x!= '<pad>', a)) for a in data['words']]
words

[['<BOS>', 'hunt', 'continues', '<EOS>'],
 ['<BOS>', 'international', 'concern', '<EOS>'],
 ['<BOS>', 'hungry', '.', '<EOS>'],
 ['<BOS>', 'hot', '.', '<EOS>'],
 ['<BOS>', 'back', '.', '<EOS>'],
 ['<BOS>', 'hand', '.', '<EOS>'],
 ['<BOS>', 'arm', '.', '<EOS>'],
 ['<BOS>', 'bathroom', '.', '<EOS>'],
 ['<BOS>', 'leg', '.', '<EOS>'],
 ['<BOS>', 'cold', '.', '<EOS>'],
 ['<BOS>', 'thirsty', '.', '<EOS>'],
 ['<BOS>', 'urinate', '.', '<EOS>'],
 ['<BOS>', 'no', 'answers', '<EOS>'],
 ['<BOS>', 'amen', '.', '<EOS>'],
 ['<BOS>', 'controversial', 'appointments', '<EOS>'],
 ['<BOS>', 'boycott', 'unfair', '<EOS>'],
 ['<BOS>', 'personal', 'incomes', '<EOS>'],
 ['<BOS>', 'praetorian', 'promotion', '<EOS>'],
 ['<BOS>', 'background', '<EOS>'],
 ['<BOS>', 'facts', '<EOS>'],
 ['<BOS>', 'background', '<EOS>'],
 ['<BOS>', 'facts', '<EOS>'],
 ['<BOS>', 'trends', '<EOS>'],
 ['<BOS>', 'mosquitoes', '<EOS>'],
 ['<BOS>', 'advocate', '<EOS>'],
 ['<BOS>', 'dialogue', '<EOS>'],
 ['<BOS>', 'agriculture', '<EOS>'],
 [

In [4]:
print(len(words))

8244


In [None]:
## average word length
sum([len(a) for a in words])/len(words)

In [6]:
trained_model.keys()

dict_keys(['train_weights', 'best_valid_accuracy', 'epoch', 'vocab', 'labels', 'optimizer', 'accuracies', 'state_dict', 'test_weights'])

In [6]:
hiddens = trained_model['train_weights']['hidden']
print(hiddens.shape)

torch.Size([8244, 300])


In [7]:

linear_reducer = PCA()
linear_reducer.fit(hiddens)
linear_reducer.explained_variance_ratio_[:5]

array([ 0.64190219,  0.05555807,  0.04960111,  0.03438417,  0.02937376])

In [10]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

scores = []
best_score = -1
best_clusters = None
best_k = None
for k in range(2, 50):
    km = KMeans(n_clusters = k)
    X = km.fit_predict(hiddens)
    score = silhouette_score(hiddens, X)
    scores.append(score)
    if score > best_score:
        best_clusters = X
        best_score = score
        best_k = k
        
print(best_score)

0.490198


In [None]:
print(best_k)
print(scores)

In [12]:
len(best_clusters)

8244

In [13]:
len(trained_model['train_weights']['targets'])

8244

In [14]:
sum(best_clusters == trained_model['train_weights']['targets'])

7788

In [15]:
(trained_model['train_weights']['preds'].nonzero())


    0
    2
    3
  ⋮   
 8192
 8203
 8218
[torch.LongTensor of size 2933x1]

In [33]:
import plotly.plotly as py
import plotly.graph_objs as go


# Create random data with numpy
import numpy as np

N = 1000
random_x = np.random.randn(N)
random_y = np.random.randn(N)

# Create a trace
trace = go.Scatter(
    x = random_x,
    y = random_y,
    mode = 'markers', 
    hovertext = 250 * ['a', 'b', 'c', 'd']
)

data = [trace]

# Plot and embed in ipython notebook!
py.iplot(data)

In [9]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

perplexities = range(5, 55, 10)

reduced_data = {}

for p in perplexities:
    
    print('Training T-SNE on hidden states for perplexity:{}...'.format(p))
    reducer = TSNE(n_components = 2, perplexity = p)
    reduced_data[p] = reducer.fit_transform(hiddens)

Training T-SNE on hidden states for perplexity:5...
Training T-SNE on hidden states for perplexity:15...
Training T-SNE on hidden states for perplexity:25...
Training T-SNE on hidden states for perplexity:35...
Training T-SNE on hidden states for perplexity:45...


In [11]:
import plotly.plotly as py
import plotly.graph_objs as go


# Create random data with numpy
import numpy as np

N = 1000
random_x = np.random.randn(N)
random_y = np.random.randn(N)
random_x2 = np.random.randn(N)
random_y2 = np.random.randn(N)

# Create a trace
trace = go.Scatter(
    x = random_x,
    y = random_y,
    mode = 'markers',
    hovertext = 250 * ['a', 'b', 'c', 'd'],
    hoverinfo = 'text',
    marker = go.Marker(
                size = 4,
                color = 'blue'
    ),
    name = 'subjective'
)
trace2 = go.Scatter(
    x = random_x2,
    y = random_y2,
    mode = 'markers',
    hovertext = 250 * ['a', 'b', 'c', 'd'],
    hoverinfo = 'text',
    marker = go.Marker(
                size = 4,
                color = 'red'
    ),
    name = 'objective'
)

layout = go.Layout(
    xaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    ),
    yaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    )
)

data = [trace, trace2]

fig = go.Figure(data = data, layout = layout)

# Plot and embed in ipython notebook!
print(trace.help('hovertext'))
py.iplot(fig, filename = 'test')

Current path: []
Current parent object_names: []

With the current parents, 'hovertext' can be used as follows:

Under ('figure', 'data', 'scatter'):

    description: Sets hover text elements associated with each (x,y) pair. If a single
        string, the same string appears over all the data points. If an array of
        string, the items are mapped in order to the this trace's (x,y)
        coordinates. To be seen, trace `hoverinfo` must contain a *text* flag.
    editType: style
    role: info



None


In [35]:
p = 5
d = reduced_data[p]
subjective_inds = trained_model['train_weights']['preds'].nonzero()
subjective_inds = subjective_inds
subjective_sents = np.squeeze(d[subjective_inds])
objective_inds = (trained_model['train_weights']['preds'] == 0).nonzero()
objective_inds = objective_inds
objective_sents = np.squeeze(d[objective_inds])

texts = [' '.join(l) for l in words]
subjective_texts = [text for i, text in enumerate(texts) if torch.sum(subjective_inds == i)]
objective_texts = [text for i, text in enumerate(texts) if torch.sum(objective_inds == i)]

print("SUBJECTIVE")
print(len(subjective_texts))
print(subjective_sents.shape)
print("OBJECTIVE")
print(len(objective_texts))
print(objective_sents.shape)
subjective = go.Scatter(
                x = subjective_sents[:, 0], 
                y = subjective_sents[:, 1], 
                mode = 'markers',
                text = subjective_texts,
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'red', 
                            size = 4
                ),
                name = 'Subjective'
            )
objective = go.Scatter(
                x = objective_sents[:, 0], 
                y = objective_sents[:, 1], 
                mode = 'markers',
                text = objective_texts, 
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'blue', 
                            size = 4
                ),
                name = 'Objective'
            )

data = [subjective, objective]

layout = go.Layout(
    title = "t-SNE for perplexity: {}".format(p),
    hovermode = 'closest',
    xaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    ),
    yaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    )
)

fig = go.Figure(data = data, layout = layout)
print('here')

# Plot and embed in ipython notebook!
py.iplot(fig, filename = 'Subjectivity Representations')

KeyError: 5

In [34]:
p = 15
d = reduced_data[p]
subjective_inds = trained_model['train_weights']['preds'].nonzero()
subjective_inds = subjective_inds
subjective_sents = np.squeeze(d[subjective_inds])
objective_inds = (trained_model['train_weights']['preds'] == 0).nonzero()
objective_inds = objective_inds
objective_sents = np.squeeze(d[objective_inds])

texts = [' '.join(l) for l in words]
subjective_texts = [text for i, text in enumerate(texts) if torch.sum(subjective_inds == i)]
objective_texts = [text for i, text in enumerate(texts) if torch.sum(objective_inds == i)]

print("SUBJECTIVE")
print(len(subjective_texts))
print(subjective_sents.shape)
print("OBJECTIVE")
print(len(objective_texts))
print(objective_sents.shape)
subjective = go.Scatter(
                x = subjective_sents[:, 0], 
                y = subjective_sents[:, 1], 
                mode = 'markers',
                text = subjective_texts,
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'red', 
                            size = 4
                ),
                name = 'Subjective'
            )
objective = go.Scatter(
                x = objective_sents[:, 0], 
                y = objective_sents[:, 1], 
                mode = 'markers',
                text = objective_texts, 
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'blue', 
                            size = 4
                ),
                name = 'Objective'
            )

data = [subjective, objective]

layout = go.Layout(
    title = "t-SNE for perplexity: {}".format(p),
    hovermode = 'closest',
    xaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    ),
    yaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    )
)

fig = go.Figure(data = data, layout = layout)
print('here')

# Plot and embed in ipython notebook!
py.iplot(fig, filename = 'Subjectivity Representations')
    

KeyError: 35

In [None]:
p = 25
d = reduced_data[p]
subjective_inds = trained_model['train_weights']['preds'].nonzero()
subjective_inds = subjective_inds
subjective_sents = np.squeeze(d[subjective_inds])
objective_inds = (trained_model['train_weights']['preds'] == 0).nonzero()
objective_inds = objective_inds
objective_sents = np.squeeze(d[objective_inds])

texts = [' '.join(l) for l in words]
subjective_texts = [text for i, text in enumerate(texts) if torch.sum(subjective_inds == i)]
objective_texts = [text for i, text in enumerate(texts) if torch.sum(objective_inds == i)]

print("SUBJECTIVE")
print(len(subjective_texts))
print(subjective_sents.shape)
print("OBJECTIVE")
print(len(objective_texts))
print(objective_sents.shape)
subjective = go.Scatter(
                x = subjective_sents[:, 0], 
                y = subjective_sents[:, 1], 
                mode = 'markers',
                text = subjective_texts,
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'red', 
                            size = 4
                ),
                name = 'Subjective'
            )
objective = go.Scatter(
                x = objective_sents[:, 0], 
                y = objective_sents[:, 1], 
                mode = 'markers',
                text = objective_texts, 
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'blue', 
                            size = 4
                ),
                name = 'Objective'
            )

data = [subjective, objective]

layout = go.Layout(
    title = "t-SNE for perplexity: {}".format(p),
    hovermode = 'closest',
    xaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    ),
    yaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    )
)

fig = go.Figure(data = data, layout = layout)
print('here')

# Plot and embed in ipython notebook!
py.iplot(fig, filename = 'Subjectivity Representations')
    

In [29]:
p = 35
d = reduced_data[p]
subjective_inds = trained_model['train_weights']['preds'].nonzero()
subjective_inds = subjective_inds
subjective_sents = np.squeeze(d[subjective_inds])
objective_inds = (trained_model['train_weights']['preds'] == 0).nonzero()
objective_inds = objective_inds
objective_sents = np.squeeze(d[objective_inds])

texts = [' '.join(l) for l in words]
subjective_texts = [text for i, text in enumerate(texts) if torch.sum(subjective_inds == i)]
objective_texts = [text for i, text in enumerate(texts) if torch.sum(objective_inds == i)]

print("SUBJECTIVE")
print(len(subjective_texts))
print(subjective_sents.shape)
print("OBJECTIVE")
print(len(objective_texts))
print(objective_sents.shape)
subjective = go.Scatter(
                x = subjective_sents[:, 0], 
                y = subjective_sents[:, 1], 
                mode = 'markers',
                text = subjective_texts,
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'red', 
                            size = 4
                ),
                name = 'Subjective'
            )
objective = go.Scatter(
                x = objective_sents[:, 0], 
                y = objective_sents[:, 1], 
                mode = 'markers',
                text = objective_texts, 
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'blue', 
                            size = 4
                ),
                name = 'Objective'
            )

data = [subjective, objective]

layout = go.Layout(
    title = "t-SNE for perplexity: {}".format(p),
    hovermode = 'closest',
    xaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    ),
    yaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    )
)

fig = go.Figure(data = data, layout = layout)
print('here')

# Plot and embed in ipython notebook!
py.iplot(fig, filename = 'Subjectivity Representations')
    

SUBJECTIVE
2933
(2933, 2)
OBJECTIVE
5311
(5311, 2)
here


In [32]:
p = 45
d = reduced_data[p]
subjective_inds = trained_model['train_weights']['preds'].nonzero()
subjective_inds = subjective_inds
subjective_sents = np.squeeze(d[subjective_inds])
objective_inds = (trained_model['train_weights']['preds'] == 0).nonzero()
objective_inds = objective_inds
objective_sents = np.squeeze(d[objective_inds])

texts = [' '.join(l) for l in words]
subjective_texts = [text for i, text in enumerate(texts) if torch.sum(subjective_inds == i)]
objective_texts = [text for i, text in enumerate(texts) if torch.sum(objective_inds == i)]

print("SUBJECTIVE")
print(len(subjective_texts))
print(subjective_sents.shape)
print("OBJECTIVE")
print(len(objective_texts))
print(objective_sents.shape)
subjective = go.Scatter(
                x = subjective_sents[:, 0], 
                y = subjective_sents[:, 1], 
                mode = 'markers',
                text = subjective_texts,
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'red', 
                            size = 4
                ),
                name = 'Subjective'
            )
objective = go.Scatter(
                x = objective_sents[:, 0], 
                y = objective_sents[:, 1], 
                mode = 'markers',
                text = objective_texts, 
                hoverinfo = text, 
                marker = go.Marker(
                            color = 'blue', 
                            size = 4
                ),
                name = 'Objective'
            )

data = [subjective, objective]

layout = go.Layout(
    title = "t-SNE for perplexity: {}".format(p),
    hovermode = 'closest',
    xaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    ),
    yaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    )
)

fig = go.Figure(data = data, layout = layout)
print('here')

# Plot and embed in ipython notebook!
py.iplot(fig, filename = 'Subjectivity Representations')
    

SUBJECTIVE
2933
(2933, 2)
OBJECTIVE
5311
(5311, 2)
here


In [38]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

perplexities = range(55,75, 10)

reduced_data = {}

for p in perplexities:
    
    print('Training T-SNE on hidden states for perplexity:{}...'.format(p))
    reducer = TSNE(n_components = 3, perplexity = p)
    reduced_data[p] = reducer.fit_transform(hiddens)

Training T-SNE on hidden states for perplexity:55...
Training T-SNE on hidden states for perplexity:65...


In [42]:
p = 55
d = reduced_data[p]
subjective_inds = trained_model['train_weights']['preds'].nonzero()
subjective_inds = subjective_inds
subjective_sents = np.squeeze(d[subjective_inds])
objective_inds = (trained_model['train_weights']['preds'] == 0).nonzero()
objective_inds = objective_inds
objective_sents = np.squeeze(d[objective_inds])

texts = [' '.join(l) for l in words]
subjective_texts = [text for i, text in enumerate(texts) if torch.sum(subjective_inds == i)]
objective_texts = [text for i, text in enumerate(texts) if torch.sum(objective_inds == i)]

print("SUBJECTIVE")
print(len(subjective_texts))
print(subjective_sents.shape)
print("OBJECTIVE")
print(len(objective_texts))
print(objective_sents.shape)
subjective = go.Scatter3d(
                x = subjective_sents[:, 0], 
                y = subjective_sents[:, 1], 
                z = subjective_sents[:, 2], 
                mode = 'markers',
                text = subjective_texts,
                hoverinfo = 'text', 
                marker = go.Marker(
                            color = 'red', 
                            size = 2
                ),
                name = 'Subjective'
            )
objective = go.Scatter3d(
                x = objective_sents[:, 0], 
                y = objective_sents[:, 1], 
                z = objective_sents[:, 2], 
                mode = 'markers',
                text = objective_texts, 
                hoverinfo = 'text', 
                marker = go.Marker(
                            color = 'blue', 
                            size = 2
                ),
                name = 'Objective'
            )

data = [subjective, objective]

layout = go.Layout(
    title = "t-SNE for perplexity: {}".format(p),
    hovermode = 'closest',
    xaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    ),
    yaxis=dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    )
)

fig = go.Figure(data = data, layout = layout)
print('here')

# Plot and embed in ipython notebook!
py.iplot(fig, filename = 'Subjectivity Representations')

SUBJECTIVE
2933
(2933, 3)
OBJECTIVE
5311
(5311, 3)
here
