In [3]:
import gensim
import pandas as pd
import spacy

# Load spaCy's English pipeline through the 'en' shortcut name. `nlp` is the
# object later used to stream the thread texts and read lemmas and entities.
# NOTE(review): assumes the 'en' shortcut resolves to an installed model in
# this environment -- confirm before re-running on a fresh machine.
nlp = spacy.load('en')

In [4]:
import os

# Database URL handed to pandas for every SQL query in this notebook.
# The original hard-coded an absolute path inside one user's home directory;
# it is kept as the default so existing runs behave the same, but it can now be
# overridden by exporting STORMFRONT_DB before starting the kernel.
stormfront_db = os.environ.get(
    'STORMFRONT_DB',
    'sqlite:////home/sam/data/stormfront/stormfront.sqlite',
)

In [5]:
# Build one text document per thread.
#   inner query : glue the subposts of each post together (one row per post_id)
#   join        : attach the thread_url of the post each row belongs to
#   outer query : glue all posts of a thread together (one row per thread_url)
# The separator is written as the single-quoted string literal ' '. The
# original used " ", which SQL parses as an identifier and which SQLite only
# accepts as a string through a legacy fallback that can be disabled.
df_text = pd.read_sql_query('''
    SELECT p2.thread_url AS thread_url, GROUP_CONCAT(p2.text, ' ') AS text
    FROM (
        SELECT s.*, p.thread_url
        FROM (
            SELECT subposts.post_id, GROUP_CONCAT(subposts.content, ' ') AS text
            FROM subposts
            GROUP BY post_id
        ) AS s
        INNER JOIN (SELECT thread_url, post_id FROM posts) AS p
            ON s.post_id = p.post_id
    ) AS p2
    GROUP BY p2.thread_url;
''', stormfront_db)

In [6]:
# Preview the first rows to check the result has one row per thread_url with
# the concatenated text alongside it.
df_text.head()

Unnamed: 0,thread_url,text
0,https://www.stormfront.org/forum/t1000001,\n\nAttacks 83 yr old man - YouTube\n\n\n\n\n\...
1,https://www.stormfront.org/forum/t1000015,\nSource: 'It's making children cry': Police t...
2,https://www.stormfront.org/forum/t1000028,MSNBC?s Matthews: Tea Party Racist by Using W...
3,https://www.stormfront.org/forum/t1000034,\n \r\n\t\t\t\r\n\t\t\tMay we still refer to t...
4,https://www.stormfront.org/forum/t1000039,"\n \nWe have become the animal farm, and some ..."


In [7]:
# Fetch every distinct (author, thread_url) pair from the posts table, i.e. one
# row for each thread an author has posted in, regardless of how many posts.
df_authors = pd.read_sql_query('''
    SELECT DISTINCT author, thread_url FROM posts
''', stormfront_db)

In [8]:
from collections import defaultdict
# Map each author to the list of thread URLs they posted in.
authors_to_threads = defaultdict(list)

# Walk the two columns in lockstep instead of using iterrows(): iterrows()
# materialises a Series for every row and its index value was never used.
for author, thread_url in zip(df_authors.author, df_authors.thread_url):
    authors_to_threads[author].append(thread_url)

In [9]:
# Sanity check: number of distinct authors, plus the thread list of one sample
# author. authors_to_threads is a defaultdict, so indexing a name that is not
# present would insert an empty list for it as a side effect.
len(authors_to_threads), authors_to_threads['Dave Yorkshire'] 

(15448,
 ['https://www.stormfront.org/forum/t988671',
  'https://www.stormfront.org/forum/t1037525',
  'https://www.stormfront.org/forum/t1000366',
  'https://www.stormfront.org/forum/t988762',
  'https://www.stormfront.org/forum/t988089',
  'https://www.stormfront.org/forum/t899416',
  'https://www.stormfront.org/forum/t894550'])

In [10]:
# Translate each author's thread URLs into document indices, i.e. the row
# positions of those threads in df_text. Documents are later processed in
# df_text row order, so these positions identify them.

# Build the URL -> row position lookup once. The original filtered the whole
# frame with a boolean mask for every (author, thread) pair, which is quadratic
# in practice. df_text carries the default 0..n-1 index, so the position equals
# the index label the original looked up.
thread_to_ix = {url: pos for pos, url in enumerate(df_text.thread_url)}

authors_to_ix = defaultdict(list)

for author, threads in authors_to_threads.items():
    for thread in threads:
        ix = thread_to_ix.get(thread)
        if ix is None:
            # The thread has no row in df_text (none of its posts survived the
            # INNER JOIN with subposts), so there is no document to link to.
            # The original `.index.item()` lookup raised ValueError here.
            continue
        authors_to_ix[author].append(ix)

In [11]:
import sys

# Turn every thread text into a flat token list: lemmas of alphabetic,
# non-stopword tokens followed by the text of each multi-token named entity.
processed_text = []

# `i` counts the documents consumed so far and stays a module-level name: it
# ends up equal to the number of documents and later cells may read it.
i = 0
for doc in nlp.pipe(df_text.text, n_threads=8, batch_size=100):
    # Emit one progress dot per 1000 documents, starting with the first.
    if not i % 1000:
        print('.', end='', flush=True)

    lemmas = [tok.lemma_ for tok in doc if tok.is_alpha and not tok.is_stop]
    # len() of an entity span is its token count, so this keeps only the
    # entities made of two or more tokens.
    multi_token_entities = [str(span) for span in doc.ents if len(span) > 1]

    processed_text.append(lemmas + multi_token_entities)
    i += 1

...................................................................

In [12]:
from gensim.corpora import Dictionary
# Vocabulary pruning thresholds: drop tokens that appear in fewer than
# `min_wordcount` documents or in more than a `max_freq` share of all documents.
min_wordcount = 30
max_freq = 0.5

# Build the token <-> id mapping from the processed documents, then prune it.
dictionary = Dictionary(processed_text)
dictionary.filter_extremes(no_above=max_freq, no_below=min_wordcount)

# Looking up any id forces the dictionary to populate `id2token`, which is
# otherwise left empty until first use.
_ = dictionary[0]

In [13]:
# Convert every token list into its bag-of-words form, keeping the documents in
# the same order as processed_text.
corpus = list(map(dictionary.doc2bow, processed_text))

In [14]:
# Report the vocabulary size after pruning and the number of documents.
print(
    'Number of unique tokens: %d' % len(dictionary),
    'Number of documents: %d' % len(corpus),
    sep='\n',
)

Number of unique tokens: 23383
Number of documents: 66147


In [15]:
from gensim.models import AuthorTopicModel

In [16]:
# Fit a 20-topic author-topic model over the bag-of-words corpus, linking each
# author to the document indices collected in authors_to_ix.
#
# random_state is pinned to an explicit seed. The original passed `i`, the
# counter left over from the preprocessing loop, so the seed silently depended
# on whichever cell last assigned `i`. 66147 is the value it held in the
# recorded run (one increment per document), so the seed is unchanged.
#
# NOTE(review): the recorded run of this cell ended in MemoryError; the cause
# is not visible in the notebook and is not addressed here.
model = AuthorTopicModel(
    corpus=corpus,
    num_topics=20,
    id2word=dictionary.id2token,
    author2doc=authors_to_ix,
    chunksize=100,
    passes=100,
    gamma_threshold=1e-10,
    eval_every=0,
    iterations=1,
    random_state=66147,
)


MemoryError: 