## Build topic model

In [1]:
import glob
import random
RND = 12345
random.seed(RND)

import numpy as np
import pandas as pd

In [2]:
n_features = 5000 # number of most common words
n_topics = 150 # number of topics
n_top_words = 60 # number of characteristic words per topic (viz purposes only)
max_df = 0.5 # maximum document frequency
min_df = 100 # minimum document frequency
max_books = 1000 # include all texts
chunk_size = 300

In [3]:
class Chunker(object):
    def __init__(self, filenames, max_chars_per_book=None,
                 chunk_size=None):
        self.max_chars_per_book = max_chars_per_book
        self.chunk_size = chunk_size
        self.filenames = filenames

    def __iter__(self):
        for filename in self.filenames:
            with open(filename) as f:
                text = f.read()
                if self.max_chars_per_book:
                    text = text[:self.max_chars_per_book]
            
            tokens = text.split()
            
            for i in range(0, len(tokens), self.chunk_size):
                yield tokens[i:i + self.chunk_size]

In [4]:
path = 'plain_text_train_small/*.txt'
filenames = sorted(list(glob.glob(path)))
print(len(filenames))

105180


In [5]:
chunker = Chunker(filenames, chunk_size=chunk_size)

In [6]:
import os
from sklearn.feature_extraction.text import CountVectorizer

def identity(x):
    return x

vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                             max_features=n_features,
                             analyzer=identity)
X = vectorizer.fit_transform(chunker)

bow_example = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
bow_example.sample(10)

Unnamed: 0,abandon,abandoned,abdomen,abilities,ability,able,absence,absent,absolute,absorbed,...,yelp,yelped,yesterday,young,younger,youngest,youth,zombie,zombies,zone
160679,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22403,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
395306,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
414557,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
344365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
140782,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
514590,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
37266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
167624,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
376694,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_ = transformer.fit_transform(X)

In [8]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=n_topics,
          random_state=RND,
          verbose=1, max_iter=50).fit(X_)

violation: 1.0
violation: 0.3360977780907724
violation: 0.4229664148445934
violation: 0.39871701680488925
violation: 0.3407255138511432
violation: 0.28476304037449524
violation: 0.2195604364746167
violation: 0.1695373436435552
violation: 0.13577783168115398
violation: 0.10879467049525843
violation: 0.08654912732903063
violation: 0.06991817990665329
violation: 0.05893121278694701
violation: 0.051998089623780386
violation: 0.04705519634224244
violation: 0.04333611242173589
violation: 0.04035589173499745
violation: 0.037171745286061826
violation: 0.033553526636157016
violation: 0.03026251548367681
violation: 0.02747553281120865
violation: 0.02483413088896898
violation: 0.022300420972972208
violation: 0.0202851619900907
violation: 0.018765237437487616
violation: 0.017545637588828933
violation: 0.016526853162000053
violation: 0.01561370055924045
violation: 0.014786751094151635
violation: 0.013949176903767532
violation: 0.013099993423336023
violation: 0.012183611464906388
violation: 0.011329

In [9]:
import os
import shutil
from wordcloud import WordCloud
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

def top_words(model, feature_names, n_top_words):
    try:
        shutil.rmtree('clouds')
    except:
        pass
    os.mkdir('clouds')

    for topic_idx, topic in enumerate(model.components_):
        print('.', end='')
        topic[np.isnan(topic)] = 0
        
        words = [feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]
        weights = [topic[i] for i in topic.argsort()[:-n_top_words-1:-1]]

        freqs = {wo: we for wo, we in zip(words, weights)}
        wordcloud = WordCloud(normalize_plurals=False,
                              background_color='white',
                              colormap='inferno_r',
                              width=800,
                              height=400)
        wordcloud = wordcloud.generate_from_frequencies(freqs)
        wordcloud.to_file('clouds/'+str(topic_idx) + '.tiff')

feature_names = vectorizer.get_feature_names()
info = top_words(nmf, feature_names, n_topics)


Bad key "text.kerning_factor" on line 4 in
/Users/mikekestemont/anaconda3/envs/n36/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


......................................................................................................................................................

In [17]:
chunks = Chunker(filenames, chunk_size=chunk_size, max_chars_per_book=None)
X = vectorizer.transform(chunks)

In [18]:
X = X.toarray()
X = X[X.sum(axis=1) > 0, :] # rm empty documents
X = np.matrix(X)

In [19]:
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.sklearn

In [20]:
nmf.verbose = 0
pyLDAvis.sklearn.prepare(nmf, X, vectorizer)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)


## Calculate topical differences

In [65]:
def preprocess_text(text):
    tokens = []
    for t in nlp(text):
        if t.pos_ in allowed and t.is_alpha and not t.is_stop:
            w = t.text.lower()
            if len(w) > 1:
                tokens.append(w)
    return tokens

In [66]:
import json
from sklearn.preprocessing import normalize
from scipy.spatial.distance import cosine
import spacy

allowed = set('ADJ NOUN VERB'.split())
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
nlp.max_length = 10000000



In [67]:
nmf.verbose = 0

In [68]:
from tqdm import tqdm

diffs = {}

with open('datasets/pan20-authorship-verification-test/pairs.jsonl') as inf:
    for line_idx, line in tqdm(enumerate(inf)):
        pair = json.loads(line)
        pair_id = pair['id']
        texts = []
        
        for idx, text in enumerate(pair['pair']):
            texts.append(preprocess_text(text))
        
        if len(texts) == 2:
            X_ = vectorizer.transform(texts).toarray()
            X_ = transformer.transform(X_)
            X_ = nmf.transform(X_)
            X_ = normalize(X_)

            diffs[pair_id] = cosine(X_[0], X_[1])
        else:
            diffs[pair_id] = np.nan

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
14311it [51:45,  4.61it/s]


In [69]:
df_pred = pd.read_excel('predictions.xlsx')
df_pred['id'] = df_pred['id'].astype(str)
df_pred.head()

Unnamed: 0.1,Unnamed: 0,id,same,araujo20-large,araujo20-small,boenninghoff20-large,boenninghoff20-small,faber20-small,gagala20-small,halvani20b-small,ikae20-small,kipnis20-small,niven20-small,ordonez20-large,weerasinghe20-large,weerasinghe20-small
0,0,c04fdf1e-ddf5-5542-96e7-13ce18cae176,1,0.959482,0.999483,0.998031,0.9935224,0.2333,1,0.61,0.77837,0.952,0.743432,0.995946,1.0,1.0
1,1,49dc4cae-3d32-5b4d-b240-a080a1dbb659,0,0.207092,0.75147,0.164877,0.5,0.2833,0,0.473,0.689979,0.194,0.256587,0.988152,0.004039,0.566177
2,2,f326fe7c-fc10-566f-a70f-0f36e3f92399,0,0.208751,0.819039,1e-06,1.582263e-08,0.4333,0,0.443,0.68872,0.382,0.256587,1.0,0.003807,0.003335
3,3,16daa0d1-61b8-5650-b7ee-5e265bd40910,1,0.995293,0.979941,0.716923,0.03127071,0.2167,1,0.526,0.728918,0.68,0.743432,1.0,1.0,0.995425
4,4,08b536a8-4fed-5f62-97bb-e57f79e841d2,0,0.970113,0.679099,0.5,0.8246948,0.4667,0,0.413,0.649772,0.164,0.256587,0.99999,0.081437,5e-05


In [70]:
ranked_diffs = []
for i in df_pred['id']:
    try:
        ranked_diffs.append(diffs[i])
    except KeyError:
        ranked_diffs.append(np.nan)
df_pred['topic-diff'] = ranked_diffs

In [71]:
df_pred.head()

Unnamed: 0.1,Unnamed: 0,id,same,araujo20-large,araujo20-small,boenninghoff20-large,boenninghoff20-small,faber20-small,gagala20-small,halvani20b-small,ikae20-small,kipnis20-small,niven20-small,ordonez20-large,weerasinghe20-large,weerasinghe20-small,topic-diff
0,0,c04fdf1e-ddf5-5542-96e7-13ce18cae176,1,0.959482,0.999483,0.998031,0.9935224,0.2333,1,0.61,0.77837,0.952,0.743432,0.995946,1.0,1.0,0.542654
1,1,49dc4cae-3d32-5b4d-b240-a080a1dbb659,0,0.207092,0.75147,0.164877,0.5,0.2833,0,0.473,0.689979,0.194,0.256587,0.988152,0.004039,0.566177,0.587474
2,2,f326fe7c-fc10-566f-a70f-0f36e3f92399,0,0.208751,0.819039,1e-06,1.582263e-08,0.4333,0,0.443,0.68872,0.382,0.256587,1.0,0.003807,0.003335,0.581967
3,3,16daa0d1-61b8-5650-b7ee-5e265bd40910,1,0.995293,0.979941,0.716923,0.03127071,0.2167,1,0.526,0.728918,0.68,0.743432,1.0,1.0,0.995425,0.758196
4,4,08b536a8-4fed-5f62-97bb-e57f79e841d2,0,0.970113,0.679099,0.5,0.8246948,0.4667,0,0.413,0.649772,0.164,0.256587,0.99999,0.081437,5e-05,0.644368


In [72]:
df_pred.to_excel('predictions_topic.xlsx')