In [None]:
# Import packages
import copy

import pandas as pd
import numpy as np

# Load the revisions as a single dataframe.
# The article numbering has gaps (4, 5 and 7 are absent), so the indices are listed explicitly.
file_index = [1, 2, 3, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies all previously loaded rows on each iteration.
revision_frames = [pd.read_csv('article' + str(idx) + '_read4article.csv') for idx in file_index]

# ignore_index=True yields a fresh 0..n-1 index (same result as reset_index(drop=True)).
revisions_df = pd.concat(revision_frames, axis=0, ignore_index=True)

# Work on an independent copy so revisions_df keeps the data exactly as loaded.
new_df_combined = copy.deepcopy(revisions_df)

In [None]:
# Convert new_text from a string to a list of tokens
from tqdm import tqdm

new_text_string_list = list(new_df_combined.new_text)

# Each cell is split on single quotes; the first and last pieces (outside the outermost
# quotes) are discarded, and the ', ' separators left between quoted tokens are dropped.
# NOTE(review): presumably each string is the repr of a list of single-quoted tokens - confirm.
new_text_list_list = [
    [piece for piece in item.split("'")[1:-1] if piece != ', ']
    for item in tqdm(new_text_string_list)
]

# Remove the string columns, then re-attach new_text in its parsed (list) form.
new_df_combined.drop(columns=['new_text', 'diff_text'], inplace=True)
new_df_combined['new_text'] = new_text_list_list

In [None]:
# Drop documents with fewer than a specified number of words

threshold_new = 150

# Number of tokens in each document, in row order.
lengths_new = [len(tokens) for tokens in new_df_combined.new_text]

# Keep the surviving rows with a single boolean mask. Dropping rows one by one builds a
# new DataFrame per removed row and only works if the index labels equal row positions.
keep_mask = np.array(lengths_new) >= threshold_new
new_df_combined = new_df_combined.loc[keep_mask].reset_index(drop=True)

print("Number of documents remaining: %d" % new_df_combined.shape[0])

In [None]:
# Build the vocabulary: token -> number of occurrences across all documents
from collections import Counter

docs_complete_new = list(new_df_combined.new_text)

# Count every token of every document in one pass (progress is reported per document).
vocab_new = Counter(token for doc in tqdm(docs_complete_new) for token in doc)

print("Number of unique tokens: %d" % len(vocab_new))

In [None]:
# Create stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Standard English stopwords shipped with NLTK.
stopwords_import = stopwords.words('english')

# Additional stopwords: markup remnants, template keywords and generic filler words.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated (a missing comma after 'order' used to produce 'ordercenter').
stopwords_additional = ['ref', 'url', 'n', 'br', 'user', 'talk', 'color', 'style',
                        'wikipedia', 'web', 'date', 'utc', 'align', 'name', 'cite',
                        'date', 'c', 'title', 'archive', 'flagicon', 'links', 'order',
                        'center', 'sort', 'label', 'cvt', 'abbr', 'symbol', 'publisher',
                        'category', 'convert', 'style', 'width', 'accessdate', 'nbsp',
                        'language', 'km', 'row', 'nthe', 'access', 'website', 'x', 'infobox',
                        'wikiproject', 'image', 'nimage', 'short', 'description', 'class',
                        'character', 't', 'ts', 'u', 'ns', 'g', 'lat', 'fb', 'bul', 'gk',
                        'update', 'j', 'p', 'fs', 'q', 'link', 'file', 'svg', 'list',
                        'fig', 'pog', 'df', 'altname', 'piccap', 'use', 'mdy', 'expand',
                        'date', 'first', 'last', 'work', 'fact', 'check', 'background',
                        'language', 'aus', 'rus', 'chn', 'cze', 'fra', 'ger', 'ita',
                        'de', 'also', 'one', 'bgcolor', 'year', 'two', 'time', 'would',
                        'new', 'many', 'text', 'sup', 'pos', 'nat', 'req', 'sent', 'go',
                        'f', 'rowspan', 'jpg', 'w', 'r', 'ndash', 'cfcfff', 'dfffdf',
                        'hex', 'efcfff', 'none', 'und', 'ii', 'including', 'since', 'non',
                        'valign', 'id', 'colspan', 'font', 'mf', 'au', 'used', 'wpships',
                        'wpmilhist', 'infobox', 'dcecfc', 'like', 'we', 'your', 'ii', 'did',
                        'should', 'very', 'td', 'those', 'another', 'does']

In [None]:
# Trim rare tokens and stopwords from the vocabulary
# (only a lower frequency bound is applied; frequent words are removed via the stopword lists)

min_token_count = 150  # a token is kept only if it occurs more than this many times

# A set gives constant-time membership tests; the lists would be scanned for every token.
stopwords_all = set(stopwords_import) | set(stopwords_additional)

# Filter over the distinct tokens with items(). elements() yields every single
# occurrence, so each of the three original passes scaled with the corpus size.
vocab_new = Counter({
    token: count
    for token, count in vocab_new.items()
    if count > min_token_count and token not in stopwords_all
})

# Update the documents: keep only tokens that survived the vocabulary filter.
docs_filtered_new = [[token for token in doc if token in vocab_new] for doc in docs_complete_new]

print("Number of unique tokens: %d" % len(vocab_new))

In [None]:
# Build a gensim Dictionary from the already-filtered documents.
# (No further frequency filtering happens in this cell.)
from gensim.corpora import Dictionary

dictionary_new = Dictionary(docs_filtered_new)

# Looking up one id "initializes" dictionary_new.id2token, which later cells read directly.
_ = dictionary_new[0]

# Bag-of-words representation: one doc2bow vector per filtered document.
corpus_new = list(map(dictionary_new.doc2bow, docs_filtered_new))

In [None]:
# Create author2doc dictionaries: author label -> list of document (row) indices
from tqdm import tqdm

author2doc_country = {}
author2doc_org = {}

for index, row in tqdm(new_df_combined.iterrows()):
    # setdefault creates the empty list the first time an author is seen;
    # the current row index is then appended as that author's document ID.
    author2doc_country.setdefault(row['country'], []).append(index)
    author2doc_org.setdefault(row['org'], []).append(index)

In [None]:
# Save dictionary, the country author2doc dictionary, and updated documents
# (author2doc_org is not persisted by this cell)
dictionary_new.save('dictionary_article')

# np.save stores a dict as a pickled object array; reloading it requires
# np.load('author2doc_article.npy', allow_pickle=True).item()
np.save('author2doc_article.npy', author2doc_country)

import pickle

# The context manager guarantees the file is flushed and closed, even if dump raises.
# Note: despite the .txt extension this file is a binary pickle.
with open("docs_filtered_article.txt", 'wb') as docs_file:
    pickle.dump(docs_filtered_new, docs_file)

### Set number of passes and iterations to ensure convergence (performed on small sample only)

In [None]:
# Train the model
import logging
import os
from gensim.models import AuthorTopicModel

author2doc = author2doc_country

topic_num = 10
passes = 50
iterations = [5, 10, 15, 20, 25, 30]

for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(filename='gensim_new.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO)

for iteration in iterations:
    %time model = AuthorTopicModel(corpus=corpus_new, id2word=dictionary_new.id2token, author2doc=author2doc, \
                                   num_topics=topic_num, \
                                   chunksize=5000, passes=passes, eval_every=1, iterations=iteration, \
                                   random_state=0)

In [None]:
# Plot the likelihood to check for convergence
# Imported here because no earlier cell imports them, so a fresh kernel would fail otherwise.
import re
import matplotlib.pyplot as plt

# Raw string so the regex escapes (\d, \.) are not treated as string escapes.
bound_pattern = re.compile(r"(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")

# Collect (likelihood, perplexity) pairs from every matching log line.
with open('gensim_new.log') as log_file:
    matches = [bound_pattern.findall(line) for line in log_file]
matches = [m for m in matches if len(m) > 0]
tuples = [t[0] for t in matches]
perplexity = [float(t[1]) for t in tuples]
likelihood = [float(t[0]) for t in tuples]

if not likelihood:
    raise ValueError("No per-word bound / perplexity lines found in gensim_new.log")

# x axis: one point per pass (avoids shadowing the builtin `iter`).
pass_index = list(range(passes))

# One column per trained model: the log lines are split evenly across the models in the
# order they were trained. reshape raises if the count is not a multiple of len(iterations).
likelihood_by_run = np.array(likelihood).reshape(len(iterations), -1).T

plt.plot(pass_index, likelihood_by_run)
plt.ylabel("Log Likelihood")
plt.xlabel("Pass")
plt.title("Topic Model Convergence")
plt.legend(iterations);

### Train the model with optimal topic number

In [None]:
# Train the model
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import AuthorTopicModel

author2doc = author2doc_country

num_topics = [10, 15, 20, 25, 30]
passes = 10
iterations = 5

model_list_new = []

for topic_num in tqdm(num_topics):
    %time model = AuthorTopicModel(corpus=corpus_new, \
                                   id2word=dictionary_new.id2token, \
                                   author2doc=author2doc, \
                                   num_topics=topic_num, \
                                   chunksize=5000, \
                                   passes=passes, \
                                   eval_every=1, \
                                   iterations=iterations, \
                                   random_state=0)
    
    model_coherence = CoherenceModel(model = model, texts = docs_filtered_new, dictionary = dictionary_new, coherence='c_v')
    model_list_new.append((model, model_coherence.get_coherence()))

model_selected_new = max(model_list_new, key=lambda x: x[1])[0]
model_selected_new.save('new_model')
topic_num_selected_new = num_topics[np.argmax([x[1] for x in model_list_new])]

In [None]:
# Plot the topic coherence
import matplotlib.pyplot as plt
coherence = [float(pair[1]) for pair in model_list_new]

# Newer matplotlib releases renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use the old name when it is available and fall back to the new one otherwise.
if 'seaborn-whitegrid' in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
else:
    whitegrid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(whitegrid_style)

plt.plot(num_topics, coherence)
plt.ylabel("Topic Coherence")
plt.xlabel("Number of Topics")
plt.title("Optimal Topic Selection");
plt.savefig('topic_coherence_article.pdf')

### Visualize the Results

In [None]:
# Visualize the top words in each topic
num_words = 50

top_words_new = pd.DataFrame({'word rank': np.arange(1, num_words + 1)})

# Iterate over the topics of the *selected* model. The leftover loop variable `topic_num`
# holds the last candidate tried during training, which need not match the chosen model.
for k in range(model_selected_new.num_topics):
    # get_topic_terms returns (word id, probability) pairs; only the word is displayed.
    topic = model_selected_new.get_topic_terms(k, num_words)
    top_words_new['topic %d' % k] = [dictionary_new.id2token[word_id] for word_id, word_prob in topic]

top_words_new.set_index('word rank', inplace = True)

# Display the results
top_words_new

In [None]:
# Visualize the distribution of authors over topics
import seaborn as sns
import matplotlib.pyplot as plt

# One column per topic of the selected model (same 'topic k' labels as the top-words table).
topic_labels = ['topic %d' % k for k in range(model_selected_new.num_topics)]

# Start from 0.0 everywhere: topics the model does not report for an author stay at zero.
author_topic_df_new = pd.DataFrame(0.0, index=list(author2doc.keys()), columns=topic_labels)

for author in author2doc:
    # model[author] yields (topic id, probability) pairs and may omit low-probability topics,
    # so each probability is placed by its topic id rather than by its position in the list.
    for topic_id, prob in model_selected_new[author]:
        author_topic_df_new.loc[author, topic_labels[topic_id]] = prob

fig, ax = plt.subplots(figsize= (10, 7.5))
sns.heatmap(author_topic_df_new, cmap = 'Blues', ax = ax);
plt.savefig('heatmap_article.pdf')

In [None]:
# Plot interactive tsne
%time
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=0)
smallest_author = 0  # Ignore authors with documents less than this.
authors = [model_selected_new.author2id[a] for a in model_selected_new.author2id.keys() if len(model_selected_new.author2doc[a]) >= smallest_author]
_ = tsne.fit_transform(model_selected_new.state.gamma[authors, :])  # Result stored in tsne.embedding_

# Tell Bokeh to display plots inside the notebook.
from bokeh.io import output_notebook
output_notebook()

from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

x = tsne.embedding_[:, 0]
y = tsne.embedding_[:, 1]
author_names = [model_selected_new.id2author[a] for a in authors]

# Radius of each point corresponds to the number of documents attributed to that author.
scale = 0.1
author_sizes = [len(model_selected_new.author2doc[a]) for a in author_names]
radii = [size * scale for size in author_sizes]

source = ColumnDataSource(
        data=dict(
            x=x,
            y=y,
            author_names=author_names,
            author_sizes=author_sizes,
            radii=radii,
        )
    )

# Add author names and sizes to mouse-over info.
hover = HoverTool(
        tooltips=[
        ("author", "@author_names"),
        ("size", "@author_sizes"),
        ]
    )

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [None]:
# Plot static tsne
# Newer matplotlib releases renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# use the old name when it is available and fall back to the new one otherwise.
if 'seaborn-whitegrid' in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
else:
    whitegrid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(whitegrid_style)

plt.figure(figsize=(5,5))

# Marker area scales with the per-author radius computed for the interactive plot.
marker_areas = [radius * 5.5 for radius in radii]
plt.scatter(x, y, s=marker_areas, alpha = 0.6)

# Label every point with its author name, centred on the marker.
for name, x_pos, y_pos in zip(author_names, x, y):
    plt.annotate(name, (x_pos, y_pos), ha='center')
plt.savefig('tsne_static_article.pdf')