Useful stuff
---

In [1]:
class AttributeDict(dict):
    """Like dict but with attribute access and setting"""
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt


# Directory containing all train and test csv
dataset_dir = os.path.join(os.curdir, 'datasets', 'comments')

DEFAULT_COLUMNS = ["sentence", "class", "class_idx"]
dataset = AttributeDict(
    train=pd.DataFrame(columns=DEFAULT_COLUMNS),
    test=pd.DataFrame(columns=DEFAULT_COLUMNS)
)

classes = {
        "Donald Trump" : "Donald-Trump-%s.csv",
        "Joe Biden" : "Joe-Biden-%s.csv"
        }


for typ in dataset.keys(): # train, test
    for class_idx, (class_name, class_path) in enumerate(classes.items()): # Donald Trump, Joe Biden
        df = pd.read_csv(os.path.join(dataset_dir, class_path%typ), index_col=0)
        df["class"] =  class_name # Get the name (Donald Trump or Joe Biden)
        df["class_idx"] = class_idx # 0 -> Donald Trump, 1 -> Joe Biden
        df.columns = DEFAULT_COLUMNS # Force columns name
        dataset[typ] = dataset[typ].append(df, ignore_index=True) # Append train from Donald and Joe together (the same for test)

assert dataset.train.shape[1:] == (3,), "dataframe does not contain 3 columns named %s"%DEFAULT_COLUMNS
assert dataset.test.shape[1:] == (3,), "dataframe does not contain 3 columns named %s"%DEFAULT_COLUMNS

print(f'Number of words in our train dataset : {dataset.train.sentence.apply(lambda x:len(x.split(" "))).sum():.3e}')

Number of words in our train dataset : 7.084e+05


Preprocess all sentences
---

- Retrieve entities in the text using spacy `en_core_web_sm` and merged all words making the entity
- Clean the text using custom filters such as : 
    - lower cases
    - strip non alpha numeric values
    - strip punctuation 
    - strip multiple whitespaces
    - finally taking the lemma of the word

In [3]:
from functools import partial
from typing import Sequence, List
from spacy.tokens.doc import Doc
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags       # strip html tags
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation, strip_non_alphanum
from nltk.stem import WordNetLemmatizer 

lemmatize = WordNetLemmatizer().lemmatize

def tokenize(sentence : str, spacy_model, to_merge_entities:Sequence[str]=["GPE", "LOC", "PERSON"]) -> Doc:
    doc = spacy_model(sentence)

    # Retrieve entities to merge
    ent_to_split = {ent.text:ent.text.split(' ') for ent in doc.ents if ent.label_ in to_merge_entities}

    # Set all entities lemma to the merged name
    for complete_entity, splitted_entity in ent_to_split.items():
        merged_entity = "_".join(splitted_entity)
        sentence.replace(complete_entity, merged_entity)

    CUSTOM_FILTERS = [lambda x: x.lower(),
                      strip_non_alphanum,
                      strip_punctuation,
                      #remove_stopwords,
                      strip_multiple_whitespaces]
    parsed_line = preprocess_string(sentence, CUSTOM_FILTERS)
    parsed_line = [lemmatize(x) for x in parsed_line]

    return parsed_line

def filter_tokens(tokens:Sequence[str], to_avoid:Sequence[str]) -> List[str]:
    returned_tokens = set()
    for token in tokens:
        if len(token)>1 and (token not in to_avoid):
            returned_tokens.add(token)
    return list(returned_tokens)



def sent_preprocess(sentence:str, spacy_model, to_avoid:Sequence[str]) -> List[str]:
    pipe = (partial(tokenize, spacy_model=spacy_model),
            partial(filter_tokens, to_avoid=to_avoid))
    x = sentence
    for f in pipe:
        x = f(x)
    return x


def df_preprocess(df:pd.DataFrame, column:str, inplace:bool=False, spacy_model_name:str="en_core_web_sm") -> pd.DataFrame:
    import spacy
    import string
    from functools import partial

    try:
        nlp = spacy.load(spacy_model_name)
    except OSError as os_error:
        import sys
        import warnings
        import subprocess
        warnings.warn(f"spacy model {spacy_model_name} was not yet installed. Install it now.", ResourceWarning)
        subprocess.check_call([sys.executable, "-m", "spacy", "download", spacy_model_name])
        df_preprocess(df=df, column=column, inplace=inplace, spacy_model_name=spacy_model_name)

    to_avoid = list(string.punctuation+' ') + ['\n', '\t']

    try: # Speedup preprocess with parallelization
        from pandarallel import pandarallel
        pandarallel.initialize(progress_bar=False, verbose=0)
        preprocessed_column = df[column].parallel_apply(partial(sent_preprocess, spacy_model=nlp, to_avoid=to_avoid))
    except AttributeError: # not parallelized
        preprocessed_column = df[column].apply(partial(sent_preprocess, spacy_model=nlp, to_avoid=to_avoid))


    preprocessed_df = df if inplace else df.copy(deep=True)

    preprocessed_df["preprocessed"] = preprocessed_column

    return preprocessed_df

word_set = {
    "Donald Trump" : set(),
    "Joe Biden" : set()
}

def __make_docs(df_row):
    global word_set
    tokens = df_row.preprocessed
    for token in tokens:
        word_set[df_row["class"]].add(token)
    doc = TaggedDocument(words=tokens, tags=tokens)
    return doc

def df_make_doc(df:pd.DataFrame, inplace:bool=False) -> pd.DataFrame:

    preprocessed_column = df.apply(__make_docs, axis=1)

    preprocessed_df = df if inplace else df.copy(deep=True)

    preprocessed_df["doc"] = preprocessed_column


    return preprocessed_df


for typ in ("train", "test"):
    df_preprocess(dataset[typ], "sentence", inplace=True) # Preprocess strings
    df_make_doc(dataset[typ], inplace=True) # Build TaggedDocument from tokenized strings

In [4]:
print(dataset.train.tail(1))

print(list(word_set["Joe Biden"])[:5])

                       sentence      class class_idx  \
22569  Go Joe, TAKE TRUMP DOWN!  Joe Biden         1   

                       preprocessed  \
22569  [joe, trump, go, take, down]   

                                                     doc  
22569  ([joe, trump, go, take, down], [joe, trump, go...  
['2800', 'pompeo', 'binary', 'decimal', 'consultation']


In [5]:
# import re
# import nltk
# from nltk.stem import WordNetLemmatizer
# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=False)
#
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#
# # Used to know which class (Donald, Joe) each word belongs to
# word_set = {
#     "Donald Trump" : set(),
#     "Joe Biden" : set()
# }
#
# def clean_text(text):
#     text = re.sub(r'\|\|\|', r' ', text)
#     text = text.lower()
#     return text
#
# def tokenize_text(text):
#     tokens = []
#     for sent in nltk.sent_tokenize(text):
#         for word in nltk.word_tokenize(sent):
#             if len(word) < 2:
#                 continue
#             tokens.append(word.lower())
#     return tokens
#
# def make_docs(df_row):
#     tokens = tokenize_text(df_row.sentence)
#     for token in tokens:
#         word_set[df_row["class"]].add(token)
#     doc = TaggedDocument(words=tokens, tags=tokens)
#     return doc
#
# # Apply clean text on both train and test dataset
# for typ in ("train", "test"):
#     dataset[typ].sentence = dataset[typ].sentence.parallel_apply(clean_text)
#
# # Apply tokenize and Tagged Documents for both train and test dataset
# for typ in ("train", "test"):
#     dataset[typ]["doc"] = dataset[typ].apply(make_docs, axis=1)
#

Build the Doc2Vec model
---

In [6]:
from tqdm.notebook import tqdm
from multiprocessing import cpu_count

from gensim.models.doc2vec import Doc2Vec

EMBEDDINGS_DIMS = 100
N_WORKERS = cpu_count()
MIN_COUNT = 2
WINDOW = 2
EPOCHS = 30
MODEL_NAME = f"doc2vec-dims_{EMBEDDINGS_DIMS}-min_count_{MIN_COUNT}-window_{WINDOW}-epochs_{EPOCHS}"

doc2vec_model = Doc2Vec(
    vector_size=EMBEDDINGS_DIMS,
    min_count=MIN_COUNT,
    workers=N_WORKERS,
    epochs=EPOCHS,
    compute_loss=True,
    )

doc2vec_model.build_vocab(dataset.train.doc.values)

In [7]:
from gensim.models.callbacks import CallbackAny2Vec, DiffMetric, ConvergenceMetric

class PlotLogger(CallbackAny2Vec):
    def __init__(self, model):
        self.p = tqdm(total=model.epochs)

    def on_epoch_end(self, model):
        self.p.update(1)
        # self.p.set_postfix_str(f"Loss {model.get_latest_training_loss():.3e}")

doc2vec_model.train(dataset.train.doc.values,
                   total_examples=doc2vec_model.corpus_count,
                   epochs=doc2vec_model.epochs,
                   callbacks=[PlotLogger(doc2vec_model)]
                   )

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))

In [8]:
models_path = os.path.join(os.curdir, 'models')
if not os.path.exists(models_path):
    os.mkdir(models_path)

doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
doc2vec_model.save(os.path.join(models_path, MODEL_NAME+".model"))

TypeError: cannot pickle '_thread.lock' object

## Plot the Doc2Vec vectors in 2D using t-SNE

In [None]:
from sklearn.manifold import TSNE

vec_tsne = TSNE(n_components=2).fit_transform(doc2vec_model.wv.vectors)
vocabulary = list(doc2vec_model.wv.vocab)

In [None]:
from multiprocessing import Pool

def get_class_word(word):
    n = 0
    if word in word_set["Donald Trump"]:
        n += 1
    if word in word_set["Joe Biden"]:
        n += 2
    return n

def get_color_word(class_word:int, color_dict=("#000000", "#FF0000", "#0000FF", "#00FF00")):
#def get_color_word(class_word:int, color_dict=("#000000", "red", "blue", "green")):
    return color_dict[class_word]

def get_class_name_word(class_word, classes=("", "Donald Trump", "Joe Biden", "Biden and Trump")):
    return classes[class_word]

with Pool() as pool:
    class_word = pool.map(get_class_word, vocabulary)
    color_word = pool.map(get_color_word, class_word)
    class_name_word = pool.map(get_class_name_word, class_word)

In [None]:
from bokeh.plotting import figure, output_notebook, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.io import curdoc
import numpy as np

plot_dict = AttributeDict(
    x = vec_tsne[:,0],
    y = vec_tsne[:,1],
    fill_color = np.array(color_word),
    legend_label = np.array(class_name_word),
    word = np.array(vocabulary),
    class_name_word = np.array(class_name_word)

)
source = ColumnDataSource(plot_dict)

TITLE = "Plot of Doc2Vec trained vectors in 2D using t-SNE"
TOOLS = "hover,pan,wheel_zoom,box_zoom,tap,reset,save"
p = figure(tools=TOOLS,
           title=TITLE,
           toolbar_location="above",
           plot_width=800,
           plot_height=800)
p.xaxis.axis_label = "t-SNE variable 1"
p.yaxis.axis_label = "t-SNE variable 2"
output_notebook()

curdoc().theme = "light_minimal"
#output_file("doc2vec.html")

# Show info per circle (pan)
p.hover.tooltips = [
    ("word", "@word"),
    ("class","@class_name_word")
]

# display all circles
color_dict=("#000000", "#FF0000", "#0000FF", "#00FF00")

p.circle(x="x",
         y="y",
         radius=.4,
         selection_color="black",
         fill_alpha=.5,
         line_alpha=.6,
         fill_color="fill_color",
         legend_group='class_name_word',
         source=source)

p.legend.location = "top_left"
p.legend.click_policy="hide"
show(p)

# BERT model

In [None]:
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')

encoded_sentences = embedder.encode(dataset.train.sentence.values)

## Get the t-SNE of the BERT model

In [None]:
vec_tsne_bert = TSNE(n_components=2).fit_transform(encoded_sentences)
vocabulary = dataset.train.sentence.values

In [None]:
plot_dict = AttributeDict(
    x = vec_tsne_bert[:,0],
    y = vec_tsne_bert[:,1],
    sentence = np.array(vocabulary),
    class_name_word = np.array(class_name_word)

)
source = ColumnDataSource(plot_dict)

TITLE = "Plot of BERT trained vectors in 2D using t-SNE"
TOOLS = "hover,pan,wheel_zoom,box_zoom,tap,reset,save"
p = figure(tools=TOOLS,
           title=TITLE,
           toolbar_location="above",
           plot_width=800,
           plot_height=800)
p.xaxis.axis_label = "t-SNE variable 1"
p.yaxis.axis_label = "t-SNE variable 2"

curdoc().theme = "light_minimal"
output_notebook()
#output_file("doc2vec.html")

# Show info per circle (pan)
p.hover.tooltips = [
    ("sentence", "@sentence"),
]

# display all circles
color_dict=("#000000", "#FF0000", "#0000FF", "#00FF00")

p.circle(x="x",
         y="y",
         radius=.4,
         selection_color="black",
         fill_alpha=.5,
         line_alpha=.6,
         source=source)

p.legend.location = "top_left"
p.legend.click_policy="hide"
show(p)

In [None]:
plot_dict = AttributeDict(
    x = vec_tsne_bert[:,0],
    y = vec_tsne_bert[:,1],
    sentence = np.array(vocabulary),
    class_name_word = np.array(class_name_word)

)
source = ColumnDataSource(plot_dict)

TITLE = "Plot of BERT trained vectors in 2D using t-SNE"
TOOLS = "hover,pan,wheel_zoom,box_zoom,tap,reset,save"
p = figure(tools=TOOLS,
           title=TITLE,
           toolbar_location="above",
           plot_width=800,
           plot_height=800)
p.xaxis.axis_label = "t-SNE variable 1"
p.yaxis.axis_label = "t-SNE variable 2"

curdoc().theme = "light_minimal"
output_notebook()
#output_file("doc2vec.html")

# Show info per circle (pan)
p.hover.tooltips = [
    ("sentence", "@sentence"),
]

# display all circles
color_dict=("#000000", "#FF0000", "#0000FF", "#00FF00")

p.circle(x="x",
         y="y",
         radius=.4,
         selection_color="black",
         fill_alpha=.5,
         line_alpha=.6,
         source=source)

p.legend.location = "top_left"
p.legend.click_policy="hide"
show(p)

In [20]:
plot_dict = AttributeDict(
    x = vec_tsne_bert[:,0],
    y = vec_tsne_bert[:,1],
    sentence = np.array(vocabulary),
    class_name_word = np.array(class_name_word)

)
source = ColumnDataSource(plot_dict)

TITLE = "Plot of BERT trained vectors in 2D using t-SNE"
TOOLS = "hover,pan,wheel_zoom,box_zoom,tap,reset,save"
p = figure(tools=TOOLS,
           title=TITLE,
           toolbar_location="above",
           plot_width=800,
           plot_height=800)
p.xaxis.axis_label = "t-SNE variable 1"
p.yaxis.axis_label = "t-SNE variable 2"

curdoc().theme = "light_minimal"
output_notebook()
#output_file("doc2vec.html")

# Show info per circle (pan)
p.hover.tooltips = [
    ("sentence", "@sentence"),
]

# display all circles
color_dict=("#000000", "#FF0000", "#0000FF", "#00FF00")

p.circle(x="x",
         y="y",
         radius=.4,
         selection_color="black",
         fill_alpha=.5,
         line_alpha=.6,
         source=source)

p.legend.location = "top_left"
p.legend.click_policy="hide"
show(p)

[ 5.02741873e-01  5.42350531e-01 -4.54913348e-01  3.62396926e-01
  1.03594136e+00  8.52222681e-01  1.23655945e-01 -7.70760417e-01
  1.26155245e+00 -1.57635972e-01  9.47151780e-02  1.36731720e+00
  4.60520744e-01  8.51923525e-01  6.29401267e-01  4.63805705e-01
 -3.32578868e-02 -1.93109792e-02 -2.08698824e-01  2.22453013e-01
  2.05459315e-02  3.56784552e-01  2.52761096e-01  1.25503314e+00
 -4.37452257e-01  2.11342469e-01  4.83110070e-01  5.16365707e-01
 -6.43678725e-01  2.98054487e-01 -1.55116749e+00 -1.73278544e-02
 -2.15552464e-01 -4.89240438e-01 -1.01642676e-01  3.72244865e-01
  7.41003692e-01 -2.16345596e+00  8.84128571e-01 -5.11813343e-01
 -1.54283655e+00  2.98755020e-01  5.73809087e-01  5.89497626e-01
  8.25649679e-01 -1.51504651e-01 -1.53219506e-01  2.69360900e-01
  7.69193649e-01  6.66561484e-01  1.80424303e-01  9.96109009e-01
 -1.37816638e-01  1.13392308e-01 -6.31846726e-01  8.12728465e-01
 -2.81687289e-01 -4.95755881e-01  4.92992193e-01 -2.54587948e-01
  5.16135812e-01 -3.63849