In [1]:
import importlib
import os
from joblib import dump, load

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split

import spacy

import ml.prepare as mlprepare

data_path = "..\\data\\stackoverflow\\"
np.set_printoptions(precision=2, suppress=True)
pd.options.display.float_format = "{:,.2f}".format

In [2]:
def filter_top_tags(df, column, count=10, default=None):
    top_tags = mlprepare.prepare_multi_label(df,'Tags')[:count]
    ret = df[column].apply(lambda cell: [x for x in cell if x in top_tags])
    if default:
        ret = ret.apply(lambda cell: cell if len(cell)>0 else ['other'])
    return ret

def nltk_post_tokens(row):
#     title_tokens = tokenizer.tokenize(row['Title'].lower())
#     title_tokens = [ stemmer.stem(w) for w in tokenizer.tokenize(row['Title'].lower()) if w not in nltk_stop_words ] 
#     body_tokens = [ stemmer.stem(w) for w in tokenizer.tokenize(row['bs4'].lower()) if w not in nltk_stop_words ]
    post_tags = [ tag.name for tag in BeautifulSoup(row['Tags'], 'html.parser').find_all() if tag.name in valid_tags ]
#     return title_tokens, body_tokens, post_tags
    return post_tags

In [3]:
df = pd.read_pickle(data_path + 'bs4.pkl')
# df_base = pd.read_pickle(data_path + 'top50_tags.pkl')

In [4]:
tags_filename = "Stackoverflow_top_tags.csv"
df_tags = pd.read_csv(data_path + tags_filename)
valid_tags = df_tags['TagName'].tolist()

df['Tags'] = df.apply(nltk_post_tokens, axis=1)
df['Tags'] = filter_top_tags(df,'Tags', count=50, default='other')
df = df[['Title','bs4','Tags']]
df

Unnamed: 0,Title,bs4,Tags
0,How to determine the smallest common divisor o...,I was asked the following question during a jo...,[string]
1,How to use a trait object that contains a meth...,What's the correct way to use a trait object t...,[other]
2,Why a named function declaration isn't hoisted...,is putting the function declaration beside the...,[javascript]
3,Getting value of input causing flickering in R...,I am attempting to track the value of two logi...,"[javascript, css, reactjs]"
4,How to assign a vector of atomic types?,How can I assign the members of a vector with ...,[c++]
...,...,...,...
49995,NullReferenceException in SimpleInjector.Conta...,I'm still in the process of getting into WPF a...,[c#]
49996,How can I add grid lines to a catplot in seaborn?,How can I add grid lines (vertically and horiz...,[python]
49997,Standard deviation only for data that meets tw...,I am trying to calculate the standard deviatio...,[excel]
49998,How to properly raise exception in TensorFlow ...,I want to raise a exception dependent on the ...,"[python, python-3.x]"


In [5]:
# categories = df.groupby('Tags').size().sort_values(ascending=True)
categories = mlprepare.prepare_multi_label(df,'Tags')
categories[:10]

python        8882
other         6839
javascript    5558
r             4045
pandas        3300
java          2917
c++           2688
c#            2632
python-3.x    2173
sql           2032
dtype: int64

In [6]:
df = df[['bs4','Tags']]
df

Unnamed: 0,bs4,Tags
0,I was asked the following question during a jo...,[string]
1,What's the correct way to use a trait object t...,[other]
2,is putting the function declaration beside the...,[javascript]
3,I am attempting to track the value of two logi...,"[javascript, css, reactjs]"
4,How can I assign the members of a vector with ...,[c++]
...,...,...
49995,I'm still in the process of getting into WPF a...,[c#]
49996,How can I add grid lines (vertically and horiz...,[python]
49997,I am trying to calculate the standard deviatio...,[excel]
49998,I want to raise a exception dependent on the ...,"[python, python-3.x]"


In [7]:
data_train, data_test = train_test_split(df, test_size=.1, random_state=31)

In [8]:
df.shape

(50000, 2)

In [9]:
data_test.shape

(5000, 2)

In [27]:
nlp = spacy.load("en_core_web_sm")
# nlp = spacy.load("en")

In [12]:
nlp.max_length=2000000
# doc_title = nlp(df_base['Title'][:10000].str.cat(sep='. '))

# doc_body = nlp(df_base['bs4'][:3000].str.cat(sep='. '))

In [10]:
for label in categories.index[:10]:
    textcat.add_label(label)

NameError: name 'textcat' is not defined

In [11]:
data_train_spacy = list(zip(
    df.bs4,
    df.Tags.apply(
        lambda cat: {'cats': { c: float(c == cat)
                             for c in categories.index[:10]}})
))

In [12]:
data_train_spacy

[("I was asked the following question during a job interview and was stumped by it. Part of the problem I had is making up my mind about what problem I was solving. At first I didn't think the question was internally consistent but then I realized it is asking you to solve two different things - the first task is to figure out whether one string contains a multiple of another string. But the second task is to find a smaller unit of division within both strings. It's a bit more clear to me now with the pressure of the interview room behind me but I'm still not sure what the ideal algorithm would be here. Any suggestions?",
  {'cats': {'python': 0.0,
    'other': 0.0,
    'javascript': 0.0,
    'r': 0.0,
    'pandas': 0.0,
    'java': 0.0,
    'c++': 0.0,
    'c#': 0.0,
    'python-3.x': 0.0,
    'sql': 0.0}}),
 ("What's the correct way to use a trait object that contains a method that returns a reference to ?   The following code gives the compiler error Now, we can resolve this in at l

In [13]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']

NameError: name 'nlp' is not defined

In [14]:
from spacy.util import minibatch
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")

if 'textcat_multilabel' not in nlp.pipe_names:
    textcat_multilabel = nlp.create_pipe('textcat_multilabel')
    nlp.add_pipe('textcat_multilabel', last=True)
else:
    textcat_multilabel = nlp.get_pipe('textcat_multilabel')
    
for label in categories.index:
    textcat_multilabel.add_label(label)


# with nlp.disable_pipes(*other_pipes):
with nlp.select_pipes(enable="textcat_multilabel"):

#     optimizer = nlp.begin_training()
#     optimizer = nlp.create_optimizer()
    optimizer = nlp.resume_training()
    for i in range(5):
        print('Epoch %d' % i)
        losses = {}
#         batches = minibatch(data_train_spacy, size=128)
        batches = minibatch(data_train_spacy, size=8)
        for batch in batches:
#             texts, annotations = zip(*batch)
#             nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
#                        losses=losses)
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.2, sgd=optimizer, losses=losses)
#         with textcat_multilabel.model.use_params(optimizer.averages):
#             docs = [nlp.tokenizer(h) for h in data_test.bs4]
#             test_pred = np.array(
#                 [sorted(doc.cats.items(), key=lambda x: -x[1])[0][0]
#                  for doc in textcat_multilabel.pipe(docs)])
#             print('Test Acc: %.4f' %
#                   (pd.Series(test_pred == data_test.category.values).sum() / data_test.shape[0]))

Epoch 0


ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset

In [54]:

for doc in list(textcat_multilabel.pipe(docs)):
    print(doc)


ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset

In [47]:
for doc in textcat.pipe(docs):
    print(doc)

ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset

In [44]:
docs = [nlp.tokenizer(h) for h in data_test.bs4]

In [46]:
textcat.pipe(docs)

<generator at 0x29d60678d68>