# French documents classification
The goal is to extend the topic classification engine (Tessie) so that it can make predictions on French documents. Since French training data are rather scarce, we leverage English data by using bilingual word vector representations. 

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import torch
import fasttext
from collections import Counter

In [2]:
from model import MLP
from FastText import FastVector
from eval import net_evaluation, predict

In [3]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)

from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# need plotly account to plot on your own
plotly.tools.set_credentials_file(username='', api_key='')

# 1 Dataset --eddy

In [4]:
# Locations of the eddy train/test splits (tab-separated files).
en_train_path = "../dataset/eddy_data/en_train.tsv"
en_test_path = "../dataset/eddy_data/en_test.tsv"
fr_train_path = "../dataset/eddy_data/fr_train.tsv"
fr_test_path = "../dataset/eddy_data/fr_test.tsv"

# Read the four raw splits with identical reader settings, in the same order
# as the paths above.
en_train, en_test, fr_train, fr_test = (
    pd.read_csv(tsv_path, sep='\t', encoding='utf-8')
    for tsv_path in (en_train_path, en_test_path, fr_train_path, fr_test_path)
)

# Preview the first French training rows.
fr_train.head()

Unnamed: 0,topic_id,extracted_content
0,__label__1502408,"Des petits refuges pod, une yourte panoramique..."
1,__label__1501374,"DSI, vous bloquez sans doute les acces aux sys..."
2,__label__1502344,Samsung vient d'annoncer qu'il va commencer la...
3,__label__1342797,+33 (0)1 43 87 06 77 info@pixisoft.com +33 (0)...
4,__label__1500181,"1 Au Quebec, la formation technique en soins i..."


In [5]:
# Per-language sample counts and number of distinct topic ids.
dataset_stats = pd.DataFrame.from_dict({
    'Train + Valid': {"English": len(en_train), "French": len(fr_train)},
    'Test': {"English": len(en_test), "French": len(fr_test)},
    'Topics': {"English": len(set(en_train["topic_id"])),
               "French": len(set(fr_train["topic_id"]))},
})

# Pin the column order used for display.
dataset_stats = dataset_stats[['Train + Valid', 'Test', "Topics"]]
dataset_stats

Unnamed: 0,Train + Valid,Test,Topics
English,57616,14405,923
French,11845,7898,923


In [6]:
# Lookup tables built from the topic catalogue: id -> name, id -> theme, and
# the reverse name -> id mapping.
df_topics = pd.read_csv("../dataset/eddy_topics.csv")
topic_ids = df_topics.topic_id
id2name = pd.Series(df_topics.topic_name.values, index=topic_ids).to_dict()
id2theme = pd.Series(df_topics.theme_name.values, index=topic_ids).to_dict()
name2id = {name: topic_id for (topic_id, name) in id2name.items()}

# Distinct French training labels; the first 9 characters (the "__label__"
# prefix visible in the data preview) are dropped to get the numeric topic id.
eddy_labels = [int(raw_label[9:]) for raw_label in set(fr_train["topic_id"])]
eddy_themes = [id2theme[topic_id] for topic_id in eddy_labels]

# Number of topics per theme, shown as a pie chart.
theme_counts = Counter(eddy_themes)
labels = list(theme_counts.keys())
values = list(theme_counts.values())
trace = go.Pie(labels=labels, values=values, textinfo='value')
iplot([trace], filename='eddy themes')

In [7]:
# Histogram of how many English training documents each topic has.
en_train_count = en_train.groupby('topic_id').count()
en_docs_per_topic = np.array(en_train_count['extracted_content'])
data = [go.Histogram(x=en_docs_per_topic)]
layout = go.Layout(
    title="Distribution of number of English training samples",
    xaxis=dict(title='num of training samples'),
    yaxis=dict(title='label count'),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='en_train_hist')

In [8]:
# Histogram of how many French training documents each topic has.
fr_train_count = fr_train.groupby('topic_id').count()
fr_docs_per_topic = np.array(fr_train_count['extracted_content'])
data = [go.Histogram(x=fr_docs_per_topic, marker=dict(color="lightsalmon"))]
layout = go.Layout(
    title="Distribution of number of French training samples",
    xaxis=dict(title='num of training samples'),
    yaxis=dict(title='label count'),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='fr_train_hist')

# 2 FastText classifier (Baselines)

## 2.1 English data only
- trainset: en_train
- testset: en_test

In [9]:
# Settings recorded for this saved model: dim = 100, word_ngrams = 1,
# lr = 0.05, lr_update_rate = 100, minn = 2, maxn = 5, epoch = 300
classifier_en = fasttext.load_model('best_model_en.bin')

# Score the English test file at k=1 and k=10; the recall of each run is
# reported (as a percentage) under the "acc" labels below.
en_top1_result = classifier_en.test('../dataset/eddy_data/en_test.tsv', k=1)
print("top1_acc:", 100*en_top1_result.recall)
en_top10_result = classifier_en.test('../dataset/eddy_data/en_test.tsv', k=10)
print("top10_acc:", 100*en_top10_result.recall)

top1_acc: 62.44359597362027
top10_acc: 88.20548420687261


## 2.2 French data only
- trainset: fr_train
- testset: fr_test

In [10]:
# Settings recorded for this saved model: dim = 100, word_ngrams = 1,
# lr = 0.3, lr_update_rate = 100, minn = 0, maxn = 0, epoch = 300
classifier_fr = fasttext.load_model('best_model_fr.bin')

# Recall at k=1 and k=10 on the French test file, as percentages. These two
# values are reused by the summary chart at the end of the notebook.
fr_top1_result = classifier_fr.test('../dataset/eddy_data/fr_test.tsv', k=1)
fasttext_1 = 100*fr_top1_result.recall
fr_top10_result = classifier_fr.test('../dataset/eddy_data/fr_test.tsv', k=10)
fasttext_10 = 100*fr_top10_result.recall

print("top1_acc:", fasttext_1)
print("top10_acc:", fasttext_10)

top1_acc: 22.0688781970119
top10_acc: 47.04988604710053


# 3 Data Preprocessing

## 3.1 Bilingual Word Embedding --MUSE iterative Procrustes alignment

Tools_reviews: https://bombora.atlassian.net/browse/DS-1557 

MUSE: https://github.com/facebookresearch/MUSE.git

In [11]:
# Read the English/French word pairs used for the embedding plots: one pair
# per line, the two words separated by a single space.
# The encoding is explicit because the French side contains accented
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open("./embedding/en2fr.txt", "r", encoding="utf-8") as pairs_file:
    lines = pairs_file.readlines()
# Blank lines (e.g. a trailing newline at end of file) are skipped; they would
# otherwise produce a 1-tuple that cannot be unpacked into (en_word, fr_word).
bilingual_pairs = [tuple(line.rstrip('\n').split(' ')) for line in lines if line.strip()]

In [12]:
def display_wordembedding_tsnescatterplot(en_dict, fr_dict, bilingual_pairs, plot_title):
    """Plot a 2-D t-SNE projection of translated word pairs.

    For every ``(en_word, fr_word)`` pair the English vector is looked up in
    ``en_dict`` and the French vector in ``fr_dict``. All vectors are projected
    together with t-SNE and drawn as two labelled scatter traces.

    Args:
        en_dict: Mapping-like object; ``en_dict[word]`` must return a NumPy
            vector (anything exposing ``reshape``).
        fr_dict: Same as ``en_dict`` for French words.
        bilingual_pairs: Iterable of ``(en_word, fr_word)`` tuples.
        plot_title: Figure title; also passed to ``iplot`` as the filename.

    Raises:
        ValueError: If ``bilingual_pairs`` yields no pairs.
    """
    en_word_labels = []
    fr_word_labels = []
    # Rows are collected in a list and stacked once. Calling np.append inside
    # the loop copies the whole matrix on every iteration. reshape(1, -1)
    # avoids hard-coding the embedding dimension (previously fixed at 300).
    rows = []
    for en_word, fr_word in bilingual_pairs:
        en_word_labels.append(en_word)
        rows.append(en_dict[en_word].reshape(1, -1))
        fr_word_labels.append(fr_word)
        rows.append(fr_dict[fr_word].reshape(1, -1))

    if not rows:
        raise ValueError("bilingual_pairs is empty; nothing to plot")
    word_vectors = np.concatenate(rows, axis=0)

    # Project every vector to 2-D in one fit; the fixed seed keeps the layout
    # reproducible between runs.
    tsne_matrix = TSNE(n_components=2, random_state=0).fit_transform(word_vectors)
    # Rows were interleaved EN, FR, EN, FR, ... so even rows are English and
    # odd rows are French.
    EN = tsne_matrix[::2, :]
    FR = tsne_matrix[1::2, :]

    # NOTE(review): "lightnavy" is not a CSS named colour; confirm how the
    # installed plotly version handles it (e.g. replace with "navy").
    trace_comp0 = go.Scatter(x=EN[:, 0], y=EN[:, 1], text=en_word_labels, mode='markers+text',
                             marker=dict(size=10, line=dict(width=1), color="lightnavy"),
                             name='English')

    trace_comp1 = go.Scatter(x=FR[:, 0], y=FR[:, 1], text=fr_word_labels, mode='markers+text',
                             marker=dict(size=10, line=dict(width=1), color="lightsalmon"),
                             name='French')

    data_comp = [trace_comp0, trace_comp1]
    layout_comp = go.Layout(title=plot_title, hovermode='closest')
    fig_comp = go.Figure(data=data_comp, layout=layout_comp)
    iplot(fig_comp, filename=plot_title)

In [13]:
# Monolingual word embeddings trained on Wikipedia using the fastText
# skip-gram model. One vector file is loaded per language; FastVector comes
# from the project-local FastText module imported above.
en_dictionary_mono = FastVector("./embedding/wiki.en.vec")
fr_dictionary_mono = FastVector("./embedding/wiki.fr.vec")

reading word vectors from ./embedding/wiki.en.vec
reading word vectors from ./embedding/wiki.fr.vec


In [14]:
# Project the translation pairs using the monolingual vectors.
display_wordembedding_tsnescatterplot(
    en_dict=en_dictionary_mono,
    fr_dict=fr_dictionary_mono,
    bilingual_pairs=bilingual_pairs,
    plot_title="Monolingual embedding space",
)

In [15]:
# Load the bilingual word embeddings produced by MUSE (supervised), one
# vector file per language.
en_dictionary = FastVector("./embedding/muse2-en.txt")
fr_dictionary = FastVector("./embedding/muse2-fr.txt")

reading word vectors from ./embedding/muse2-en.txt
reading word vectors from ./embedding/muse2-fr.txt


In [16]:
# Project the same translation pairs using the MUSE vectors.
display_wordembedding_tsnescatterplot(
    en_dict=en_dictionary,
    fr_dict=fr_dictionary,
    bilingual_pairs=bilingual_pairs,
    plot_title="Bilingual embedding space",
)

## 3.2 Document Embedding
Reference: An Autoencoder Approach to Learning Bilingual Word Representations, https://arxiv.org/abs/1402.1454

Let $W^e$ and $W^f$ be the bilingual word embedding matrices of English and French. Given a document $d$ written in language $l \in \{e, f\}$ and containing $m$ words $z_1, z_2, \cdots, z_m$, we represent it as the tf-idf weighted sum of its words' representations, where $W^l_{z_i}$ denotes the embedding of word $z_i$ in $W^l$:

<center>$$\phi(d) = \sum_{i=1}^{m} \text{tf-idf}(z_i)\cdot W^l_{z_i}$$</center>

# 4 MLP (Multi-layer Perceptron) classifier

In [17]:
# Hyper-parameter settings shared by every MLP built in this notebook.
# n_mlp_layer, n_hidden and dropout are passed to the MLP constructor;
# batch_size is passed to net_evaluation / predict.
n_mlp_layer=1
n_hidden=2048
dropout = 0.8
batch_size = 124
# The remaining values are not referenced by any cell visible here;
# presumably they record the training configuration - TODO confirm.
learning_rate = 0.0001
learning_rate_lambda = [lambda epoch: 0.98 ** epoch]
optimizer = "RMSprop"
max_epochs = 1000

In [24]:
# Every .dat file holds two consecutive pickled objects: the labels first,
# then the document-embedding matrix. The tuple below is evaluated left to
# right, so the read order is preserved.

# French test set embedded with the monolingual word embedding.
with open("./preprocessed/fr_test_mono.dat", 'rb') as f0:
    mono_fr_test_y, mono_fr_test_x = pickle.load(f0), pickle.load(f0)

# French and English test sets from the muse2 preprocessed files.
with open("./preprocessed/fr_test_muse2.dat", 'rb') as f1:
    fr_test_y, fr_test_x = pickle.load(f1), pickle.load(f1)
with open("./preprocessed/en_test_muse2.dat", 'rb') as f2:
    en_test_y, en_test_x = pickle.load(f2), pickle.load(f2)

In [25]:
# Build the classifier on CPU. The number of classes and the input width are
# both derived from the loaded French test set.
net = MLP(
    n_mlp_layer=n_mlp_layer,
    n_hidden=n_hidden,
    dropout=dropout,
    n_class=len(set(fr_test_y)),
    n_emb=fr_test_x.shape[1],
    device="cpu",
)
# Display the layer stack.
net.main

Sequential(
  (0): Linear(in_features=300, out_features=2048, bias=True)
  (1): Dropout(p=0.8)
  (2): Tanh()
  (3): Linear(in_features=2048, out_features=923, bias=True)
  (4): LogSoftmax()
)

## 4.1 French data only
- trainset: fr_train
- testset: fr_test
- embedding: fastText Wikipedia embeddings (monolingual)

In [26]:
# Restore the weights trained with monolingual French embeddings. The
# map_location callable returns the storage unchanged, i.e. left on CPU.
mono_fr_state = torch.load('./checkpoints/mono_fr_98_2048_best.net',
                           map_location=lambda storage, loc: storage)
net.load_state_dict(mono_fr_state)

# net_evaluation's result is unpacked as (top-1, top-10) accuracy; both are
# reused by the summary chart.
monomlp_1, monomlp_10 = net_evaluation(net, mono_fr_test_x, mono_fr_test_y,
                                       batch_size=batch_size)
print("top1_acc: ", monomlp_1)
print("top10_acc: ", monomlp_10)

top1_acc:  36.37629779691061
top10_acc:  63.066599139022536


## 4.2 co-train French and English data
- trainset: en_train + fr_train
- testset: en_test, fr_test
- embedding: MUSE supervised (bilingual)

In [27]:
# Restore the co-trained (English + French) weights; the map_location
# callable returns the storage unchanged, i.e. left on CPU.
cotrain_state = torch.load('./checkpoints/00muse2_98_2048_best.net',
                           map_location=lambda storage, loc: storage)
net.load_state_dict(cotrain_state)

# French test set first ...
print("Evaluate on french test set:")
mlp_1, mlp_10 = net_evaluation(net, fr_test_x, fr_test_y, batch_size=batch_size)
print("top1_acc: ", mlp_1)
print("top10_acc: ", mlp_10)
print("\n")

# ... then the English test set with the same network.
print("Evaluate on english test set:")
mlp_1_en, mlp_10_en = net_evaluation(net, en_test_x, en_test_y, batch_size=batch_size)
print("top1_acc: ", mlp_1_en)
print("top10_acc: ", mlp_10_en)

Evaluate on french test set:
top1_acc:  37.27525955938212
top10_acc:  63.39579640415295


Evaluate on english test set:
top1_acc:  65.83825060742797
top10_acc:  89.65636931620965


### 4.2.1 Error Analysis

In [28]:
def get_acc_stats(y_test, y_pred):
    """Return the per-label accuracy of ``y_pred`` against ``y_test``.

    Args:
        y_test: Indexable sequence of true labels. Labels must be hashable
            and mutually sortable.
        y_pred: Indexable sequence of predicted labels, aligned with
            ``y_test`` position by position.

    Returns:
        A NumPy array with one entry per distinct label in ``y_test``, ordered
        by sorted label. Each entry is the fraction of that label's samples
        that were predicted correctly.

    Raises:
        IndexError: If ``y_pred`` is shorter than ``y_test``.
    """
    totals = Counter()
    hits = Counter()
    # One pass over the samples instead of rescanning them for every label.
    for i, truth in enumerate(y_test):
        totals[truth] += 1
        if y_pred[i] == truth:
            hits[truth] += 1
    # Every label in `totals` was seen at least once, so the division is safe.
    return np.array([hits[label] / totals[label] for label in sorted(totals)])

In [29]:
# Predict both test sets with the current network, then compute the accuracy
# of every label separately (French first, English second).
fr_predicted, en_predicted = (
    predict(net, fr_test_x, batch_size=batch_size),
    predict(net, en_test_x, batch_size=batch_size),
)
fr_acc_stats, en_acc_stats = (
    get_acc_stats(fr_test_y, fr_predicted),
    get_acc_stats(en_test_y, en_predicted),
)

# Overlay the two per-label accuracy distributions.
trace1 = go.Histogram(x=en_acc_stats, opacity=0.75, name="English")
trace2 = go.Histogram(x=fr_acc_stats, opacity=0.75, name="French",
                      marker={"color": "salmon"})

data = [trace1, trace2]
layout = go.Layout(
    title="Distribution of accuracies of English and French testing samples",
    barmode='overlay',
    xaxis=dict(title='accuracy'),
    yaxis=dict(title='label count'),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='testing acc histogram')

In [31]:
# French topics whose per-label accuracy is below 10%.
# get_acc_stats reports accuracies in sorted-label order, so the encoded label
# is recovered from that order instead of assuming position == label (which
# only holds when the test labels are exactly 0..n-1 with no gaps).
fr_label_order = sorted(set(fr_test_y))
bad_labels = [fr_label_order[idx] for idx, acc in enumerate(fr_acc_stats) if acc < 0.1]

# label_encoder maps raw label string -> encoded label; invert it to decode.
with open('label_encoder.pkl', 'rb') as handle:
    label_encoder = pickle.load(handle)

label_decoder = {encoded: raw for raw, encoded in label_encoder.items()}
bad_labels = [label_decoder[bl] for bl in bad_labels]
#bad_labels = [int(bl[9:]) for bl in bad_labels]

#with open('new_bad_labels.pkl', 'wb') as fp:
#    pickle.dump(bad_labels, fp)

# Drop the first 9 characters of each raw label and look up the topic name.
bad_labels_name = [id2name[int(bl[9:])] for bl in bad_labels]
bad_labels_name

['Simple Object Access Protocol (SOAP)',
 'Hardware',
 'Internationalization (i18n)',
 'Visual Solutions',
 'Speech Recognition',
 'EtherNet/IP Network',
 'Apache HBase',
 'VPN',
 'E-Signature Software',
 'IT as a Service (ITaaS)',
 'UPS Systems',
 'Oracle Hyperion',
 'Reporting Software',
 'Retina Network Security Scanner',
 'T-Systems',
 'HP Jet Intelligence',
 'Data Mining',
 'Cloud VC',
 'OLAP',
 'Google Forms',
 'Remote Network Management',
 '4K (Display)',
 'Source Code Analysis',
 'C++',
 'Eclipse IDE',
 'Toner Cartidge',
 'eDiscovery Software',
 'Used Hardware',
 'Robot Locomotion',
 'Website Translation',
 'Remote Networking',
 'Software Developers',
 'jQuery',
 'Business Continuity',
 'Linux',
 'Beacon Technology',
 'Digital Assessment',
 'Fraud Protection',
 'Iterative Software Development',
 'C#',
 'Data Networks',
 'Microsoft Team Foundation Server (TFS)',
 'Classification',
 'High Performance Computing',
 'Outlook.com',
 'Application Delivery',
 'HP SureSupply',
 'Cloud O

In [32]:
# English training volume for the topics flagged as bad on the French test set.
en_train_bad = en_train[en_train["topic_id"].isin(bad_labels)]
en_bad_docs_per_topic = en_train_bad.groupby('topic_id').count()['extracted_content']
data = [go.Histogram(x=np.array(en_bad_docs_per_topic))]
layout = go.Layout(
    title="Distribution of number of bad English training samples",
    xaxis=dict(title='num of training samples'),
    yaxis=dict(title='label count'),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='en_train_bad_hist')

In [33]:
# French training volume for the topics flagged as bad on the French test set.
fr_train_bad = fr_train[fr_train["topic_id"].isin(bad_labels)]
fr_bad_docs_per_topic = fr_train_bad.groupby('topic_id').count()['extracted_content']
data = [go.Histogram(x=np.array(fr_bad_docs_per_topic),
                     marker=dict(color="lightsalmon"))]
layout = go.Layout(
    title="Distribution of number of bad French training samples",
    xaxis=dict(title='num of training samples'),
    yaxis=dict(title='label count'),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='fr_train_bad_hist')

In [34]:
# Predict the French test documents with the fastText baseline (k=1) and keep
# the first element of each per-document result.
fr_test_texts = fr_test["extracted_content"].values
fasttext_pred = classifier_fr.predict(fr_test_texts, k=1)
fasttext_pred = [doc_result[0] for doc_result in fasttext_pred]

# Per-label accuracy against the raw topic ids of the test file.
fasttext_acc_stats = get_acc_stats(fr_test["topic_id"].values, fasttext_pred)

data = [go.Histogram(x=fasttext_acc_stats, marker=dict(color="darksalmon"))]
layout = go.Layout(
    title="Distribution of accuracies of fastText French testing samples",
    barmode='overlay',
    xaxis=dict(title='accuracy'),
    yaxis=dict(title='label count'),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='fasttext testing acc histogram')

In [35]:
# get_acc_stats orders its output by sorted label, so the same ordering maps
# a position in fasttext_acc_stats back to its label.
label_squence = sorted(set(fr_test["topic_id"].values))

# Labels on which fastText scores below 5% (the MLP cut-off above is 10%).
bad_labels_fasttext = [
    label_squence[idx]
    for idx, a in enumerate(fasttext_acc_stats)
    if a < 0.05
]

# Topics that both the MLP and fastText handle badly, shown by name.
bad_bad_labels = set(bad_labels) & set(bad_labels_fasttext)
bad_bad_labels_name = [id2name[int(bl[9:])] for bl in bad_bad_labels]
bad_bad_labels_name

['HP Jet Intelligence',
 'Security Tools',
 'Hardware',
 'Application Management',
 'Document Translation',
 'Remote Network Management',
 'Gigabit Services',
 'Enterprise Database',
 'Toner Cartidge',
 'IT Management',
 'Application Delivery',
 'Web-Scale IT',
 'Used Hardware',
 'Bandwidth',
 'Low Latency Ethernet',
 'Retina Network Security Scanner',
 'Remote Networking',
 'High Performance Computing',
 'Ethernet Networking Solutions',
 'Multi-Touch Displays',
 'eDiscovery Software',
 'Intrusion Prevention',
 'Data Networks',
 'IT as a Service (ITaaS)',
 'Cloud VC',
 'Iterative Software Development',
 'Testing and Analysis',
 'Partner Management Software']

## 4.3 Loop in brasilia data 

### 4.3.1 brasilia dataset

In [36]:
# Locations of the brasilia train/test splits (tab-separated files).
en_train_b_path = "../dataset/brasilia_data/en_train_b.tsv"
en_test_b_path = "../dataset/brasilia_data/en_test_b.tsv"
fr_train_b_path = "../dataset/brasilia_data/fr_train_b.tsv"
fr_test_b_path = "../dataset/brasilia_data/fr_test_b.tsv"

# Read the four raw brasilia splits with identical reader settings, in the
# same order as the paths above.
en_train_b, en_test_b, fr_train_b, fr_test_b = (
    pd.read_csv(tsv_path, sep='\t', encoding='utf-8')
    for tsv_path in (en_train_b_path, en_test_b_path, fr_train_b_path, fr_test_b_path)
)

In [37]:
# Per-language sample counts and number of distinct topic ids (brasilia).
dataset_stats_b = pd.DataFrame.from_dict({
    'Train + Valid': {"English": len(en_train_b), "French": len(fr_train_b)},
    'Test': {"English": len(en_test_b), "French": len(fr_test_b)},
    'Topics': {"English": len(set(en_train_b["topic_id"])),
               "French": len(set(fr_train_b["topic_id"]))},
})

# Pin the column order used for display.
dataset_stats_b = dataset_stats_b[['Train + Valid', 'Test', "Topics"]]
dataset_stats_b

Unnamed: 0,Train + Valid,Test,Topics
English,70236,17559,1028
French,5550,3701,1028


In [38]:
# Distinct French brasilia labels with their first 9 characters dropped,
# mapped to themes through id2theme.
brasilia_labels = [int(raw_label[9:]) for raw_label in set(fr_train_b["topic_id"])]
brasilia_themes = [id2theme[topic_id] for topic_id in brasilia_labels]

# Number of topics per theme, shown as a pie chart.
brasilia_theme_counts = Counter(brasilia_themes)
labels = list(brasilia_theme_counts.keys())
values = list(brasilia_theme_counts.values())
trace = go.Pie(labels=labels, values=values, textinfo='value')
iplot([trace], filename='brasilia themes')

### 4.3.2 Use MLP model to train brasilia and eddy data together
- trainset: brasilia_en_train + brasilia_fr_train + eddy_en_train + eddy_fr_train
- testset: brasilia_en_test + eddy_en_test, brasilia_fr_test + eddy_fr_test
- embedding: MUSE supervised

In [39]:
# Preprocessed eddy + brasilia test data. Each file holds the labels followed
# by the embedding matrix; the tuple is evaluated left to right, so the read
# order is preserved.
with open("./preprocessed/en_test_muse2_wb.dat", 'rb') as f3:
    wb_en_test_y, wb_en_test_x = pickle.load(f3), pickle.load(f3)
with open("./preprocessed/fr_test_muse2_wb.dat", 'rb') as f4:
    wb_fr_test_y, wb_fr_test_x = pickle.load(f4), pickle.load(f4)

In [40]:
# Rebuild the network for the combined eddy + brasilia label set; sizes are
# derived from the combined French test set.
net = MLP(
    n_mlp_layer=n_mlp_layer,
    n_hidden=n_hidden,
    dropout=dropout,
    n_class=len(set(wb_fr_test_y)),
    n_emb=wb_fr_test_x.shape[1],
    device="cpu",
)
# The map_location callable returns the storage unchanged, i.e. left on CPU.
wb_state = torch.load('./checkpoints/muse2_98_2048_best.net',
                      map_location=lambda storage, loc: storage)
net.load_state_dict(wb_state)

# French test set first ...
print("Evaluate on french test set:")
wbmlp_1, wbmlp_10 = net_evaluation(net, wb_fr_test_x, wb_fr_test_y, batch_size=batch_size)
print("top1_acc: ", wbmlp_1)
print("top10_acc: ", wbmlp_10)
print("\n")

# ... then the English test set with the same network.
print("Evaluate on english test set:")
wbmlp_1_en, wbmlp_10_en = net_evaluation(net, wb_en_test_x, wb_en_test_y, batch_size=batch_size)
print("top1_acc: ", wbmlp_1_en)
print("top10_acc: ", wbmlp_10_en)


Evaluate on french test set:
top1_acc:  35.11509612897664
top10_acc:  61.60875937580826


Evaluate on english test set:
top1_acc:  68.63033412589162
top10_acc:  90.73645351019898


# Summary

In [41]:
# One bar group per model, in the order the models were evaluated above.
x = ['fastText', 'monofrench_MLP_eddy', 'MLP_eddy', 'MLP_eddy_and_brasilia']
top1 = [round(score, 2) for score in (fasttext_1, monomlp_1, mlp_1, wbmlp_1)]
top10 = [round(score, 2) for score in (fasttext_10, monomlp_10, mlp_10, wbmlp_10)]

# Both traces share the same outline and opacity; only the fill colour differs.
bar_outline = dict(color='brown', width=1.5)
trace1 = go.Bar(x=x, y=top1, text=top1, textposition='auto', name="Top1_acc",
                marker=dict(color='darkorange', line=bar_outline), opacity=0.75)
trace2 = go.Bar(x=x, y=top10, text=top10, textposition='auto', name="Top10_acc",
                marker=dict(color='peachpuff', line=bar_outline), opacity=0.75)

data = [trace1, trace2]
layout = go.Layout(title='Summary of Performance on French testset ')
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='performance_summary')

# After screening data

In [42]:
# Test data from the "preprocessed_srn" directory. This rebinds fr_test_* and
# en_test_*, replacing the values loaded earlier in the notebook.
# Each file holds the labels followed by the embedding matrix; the tuple is
# evaluated left to right, so the read order is preserved.
with open("./preprocessed_srn/fr_test_muse2.dat", 'rb') as f1:
    fr_test_y, fr_test_x = pickle.load(f1), pickle.load(f1)
with open("./preprocessed_srn/en_test_muse2.dat", 'rb') as f2:
    en_test_y, en_test_x = pickle.load(f2), pickle.load(f2)

In [43]:
# Rebuild the classifier on CPU; the number of classes and the input width are
# derived from the currently loaded French test set.
net = MLP(
    n_mlp_layer=n_mlp_layer,
    n_hidden=n_hidden,
    dropout=dropout,
    n_class=len(set(fr_test_y)),
    n_emb=fr_test_x.shape[1],
    device="cpu",
)
# Display the layer stack.
net.main

Sequential(
  (0): Linear(in_features=300, out_features=2048, bias=True)
  (1): Dropout(p=0.8)
  (2): Tanh()
  (3): Linear(in_features=2048, out_features=783, bias=True)
  (4): LogSoftmax()
)

In [44]:
# Restore the weights from the srn checkpoint; the map_location callable
# returns the storage unchanged, i.e. left on CPU.
srn_state = torch.load('./checkpoints/srn_2048_best.net',
                       map_location=lambda storage, loc: storage)
net.load_state_dict(srn_state)

# French test set first. Note that mlp_1 / mlp_10 are rebound here.
print("Evaluate on french test set:")
mlp_1, mlp_10 = net_evaluation(net, fr_test_x, fr_test_y, batch_size=batch_size)
print("top1_acc: ", mlp_1)
print("top10_acc: ", mlp_10)
print("\n")

# ... then the English test set with the same network.
print("Evaluate on english test set:")
mlp_1_en, mlp_10_en = net_evaluation(net, en_test_x, en_test_y, batch_size=batch_size)
print("top1_acc: ", mlp_1_en)
print("top10_acc: ", mlp_10_en)

Evaluate on french test set:
top1_acc:  44.01431359791802
top10_acc:  72.62524398178269


Evaluate on english test set:
top1_acc:  68.16326530612244
top10_acc:  90.93061224489796


In [45]:
# Predict both test sets with the current network, then compute the accuracy
# of every label separately (French first, English second).
fr_predicted, en_predicted = (
    predict(net, fr_test_x, batch_size=batch_size),
    predict(net, en_test_x, batch_size=batch_size),
)
fr_acc_stats, en_acc_stats = (
    get_acc_stats(fr_test_y, fr_predicted),
    get_acc_stats(en_test_y, en_predicted),
)

# Overlay the two per-label accuracy distributions.
trace1 = go.Histogram(x=en_acc_stats, opacity=0.75, name="English")
trace2 = go.Histogram(x=fr_acc_stats, opacity=0.75, name="French",
                      marker={"color": "salmon"})

data = [trace1, trace2]
layout = go.Layout(
    title="Distribution of accuracies of English and French testing samples",
    barmode='overlay',
    xaxis=dict(title='accuracy'),
    yaxis=dict(title='label count'),
)
figure = go.Figure(data=data, layout=layout)
iplot(figure, filename='testing acc histogram')

In [46]:
# French topics whose per-label accuracy is below 10%.
# get_acc_stats reports accuracies in sorted-label order, so the encoded label
# is recovered from that order instead of assuming position == label (which
# only holds when the test labels are exactly 0..n-1 with no gaps).
fr_label_order = sorted(set(fr_test_y))
bad_labels = [fr_label_order[idx] for idx, acc in enumerate(fr_acc_stats) if acc < 0.1]

# label_encoder maps raw label string -> encoded label; invert it to decode.
# NOTE(review): this is the same 'label_encoder.pkl' path that the earlier
# 923-class analysis reads, while the network here has 783 outputs - confirm
# the file matches the encoding used for the screened data.
with open('label_encoder.pkl', 'rb') as handle:
    label_encoder = pickle.load(handle)

label_decoder = {encoded: raw for raw, encoded in label_encoder.items()}
bad_labels = [label_decoder[bl] for bl in bad_labels]
#bad_labels = [int(bl[9:]) for bl in bad_labels]

#with open('new_bad_labels.pkl', 'wb') as fp:
#    pickle.dump(bad_labels, fp)

# Drop the first 9 characters of each raw label and look up the topic name.
bad_labels_name = [id2name[int(bl[9:])] for bl in bad_labels]
bad_labels_name

['Web Application Security',
 'Vertical Applications',
 'Speech Recognition',
 'Identity Proofing',
 'Data Deduplication',
 'Knowledge-Based Authentication (KBA)',
 'Document Security',
 'Enterprise Labeling Software',
 'Source Code Analysis',
 'Terminal / Network Computers',
 'Apache Spark',
 'Enterprise Contact Center',
 'Drupal',
 'Grid Computing',
 'Enterprise Information Integration / Metadata Management',
 'Security Storage',
 'Secure Sockets Layer (SSL)',
 'Software Developers',
 'External Hard Drive',
 'Integrated Development Environments',
 'Robotic Sensing',
 'Contact Center Outsourcing',
 'Amazon Kinesis',
 'Joomla',
 'Google Slides',
 'Application Software',
 'Cyber Security Framework',
 'Wireless Application Protocol (WAP)',
 'Computer Aided Engineering',
 'Collaboration Portal',
 'Cyber Risk Management',
 'Virtual Servers',
 'B2B Secure Collaboration',
 'Desktop as a Service (DaaS)',
 'High Speed Networks',
 'Customer Self Service',
 'Geographic Information Systems',
 'So