## Get Data

In [1]:
import glob

list_of_files = glob.glob('/home/rakvat/mydev/nlp_experiments/scrape_anarchist_library/en/*.txt') 
documents = {}
for filename in list_of_files:
    content = []
    f = open(filename, 'r')
    for line in f:
        content.append(line)
    f.close()
    documents[filename.split('/')[-1]] = content

## Preprocess

In [2]:
import pandas as pd
import re # regex
import nltk
from nltk.corpus import stopwords

data = []
for key, value in documents.items():
    document = {}
    first_line_without_meta = 0
    for i, line in enumerate(value[:15]):
        if line.startswith('#author'):
            document['author'] = line[8:-1]
        if line.startswith('#title'):
            document['title'] = line[7:-1]
        if line.startswith('#date'):
            match = re.search('(\d\d\d\d)', line[6:-1])
            if match:
                document['year'] = match.group(0)
        if first_line_without_meta == 0 and line[0] != '#':
            first_line_without_meta = i
    document['text'] = ''.join(value[first_line_without_meta:])
    document['text_len'] = len(document['text'])
    document['file'] = key
    
    data.append(document)
    
df = pd.DataFrame(data) 
df.year = df.year.fillna(-1).astype(int)
df.author = df.author.fillna('unknown')

default_stopwords = set(stopwords.words('english'))
default_stopwords = default_stopwords.union(
    ['em', '/em', '/quote', '*****', '****', '***', '**', "''", '``', 'br', '/strong', "\'s", "n\'t", '--', 
     'p.'])

texts = [t for t in map(lambda x: x['text'], data)]

document_words_lists = []
wnl = nltk.WordNetLemmatizer()
for text in texts:
    words = nltk.word_tokenize(text)
    words = [word for word in words if len(word) > 1]
    words = [word.lower() for word in words]
    words = [wnl.lemmatize(word) for word in words]
    words = [word for word in words if word not in default_stopwords]
    document_words_lists.append(words)

document_words_lists[0][0:20]

['paradox',
 'guerrillaist',
 'practice',
 'developed',
 'outside',
 'civil',
 'war',
 'lie',
 'following',
 'restorative',
 'justice',
 'sustains',
 'socialist',
 'idea',
 'substituted',
 'contrary',
 'violent',
 'idea',
 'punitive',
 'justice']

## Create Vector Space

In [3]:
from gensim import corpora
dictionary = corpora.Dictionary(document_words_lists)
dictionary.save('../data/alib_dict.dict')

In [4]:
print(dictionary) 

Dictionary(239758 unique tokens: ["'les", '-1,860', '-based', '//socio-logos.revues.org/document352.html', '//www.mondialisme.org/article.php3']...)


In [5]:
print(dictionary.token2id['anarchy'])

3256


In [6]:
new_doc = "economy anarchy computer anarchy"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(3000, 1), (3256, 2), (5190, 1)]


## Create Corpus

In [8]:
corpus = [dictionary.doc2bow(words_list) for words_list in document_words_lists]
corpora.MmCorpus.serialize('../data/alib_corpus.mm', corpus)  # store to disk, for later use

## TfIdf

In [9]:
from gensim import models, similarities
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print(tfidf[new_vec])

[(3000, 0.29810589529211245), (3256, 0.713676513851372), (5190, 0.6338759411501911)]


## LSI/LSA

In [10]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
lsi.print_topics(200)

[(0,
  '0.119*"anarchist" + 0.096*"worker" + 0.084*"anarchism" + 0.076*"class" + 0.071*"revolution" + 0.068*"revolutionary" + 0.066*"struggle" + 0.065*"organisation" + 0.062*"movement" + 0.061*"government"'),
 (1,
  '-0.228*"worker" + -0.186*"organisation" + -0.149*"bolshevik" + -0.147*"labour" + -0.140*"union" + 0.139*"civilization" + 0.128*"stirner" + -0.118*"revolution" + -0.112*"class" + -0.109*"marx"'),
 (2,
  '0.296*"stirner" + 0.151*"man" + 0.109*"liberty" + 0.104*"men" + 0.102*"god" + -0.090*"technology" + -0.082*"anarchist" + 0.082*"shall" + -0.082*"black" + -0.081*"white"'),
 (3,
  '-0.461*"stirner" + -0.186*"anarchism" + -0.118*"marx" + -0.107*"anarchist" + -0.097*"bookchin" + -0.093*"egoist" + 0.086*"prison" + 0.084*"police" + -0.076*"ego" + -0.074*"technology"'),
 (4,
  '-0.429*"stirner" + -0.195*"anarchist" + 0.149*"technology" + 0.147*"civilization" + 0.142*"labour" + 0.127*"production" + -0.122*"anarchism" + 0.106*"capital" + -0.102*"prison" + -0.100*"black"'),
 (5,
  '

In [11]:
lsi.save('../data/model.lsi')

In [12]:
print(lsi[new_vec])

[(0, 0.12258860572596827), (1, 0.006418555235643018), (2, -0.05088103002610632), (3, -0.10531060445178629), (4, 0.05213420347270397), (5, 0.01397995935283037), (6, -0.0655598078156181), (7, -0.08869351258284379), (8, 0.14393838039257592), (9, 0.03619414565842027), (10, 0.0401445814644127), (11, 0.015950678741725732), (12, -0.18723871028486117), (13, -0.009165715644708089), (14, -0.0792471089233851), (15, -0.013980568764023357), (16, -0.05705631144608747), (17, 0.05392243140360174), (18, 0.005725411378605903), (19, -0.011552336842428741), (20, 0.049453036601297455), (21, -0.03372482197832291), (22, 0.0182030595328843), (23, 0.0924095698298659), (24, -0.059381989420964704), (25, 0.07228946446263088), (26, 0.03435399519574828), (27, 0.01076339255754406), (28, 0.001085260071856011), (29, -0.10038262935175554), (30, 0.08869828468651333), (31, -0.08881232347535332), (32, -0.01547901510150334), (33, -0.06584892329624648), (34, 0.10759576932312587), (35, -0.051488279012210814), (36, 0.03580847

In [13]:
print(lsi[dictionary.doc2bow(["war"])])
print(lsi[dictionary.doc2bow(["peace"])])

[(0, 0.05040875868912965), (1, -0.014669940991923633), (2, -0.0057682863806299035), (3, 0.05160009307917291), (4, -0.026348976177985328), (5, -0.03859175603173524), (6, -0.02017965962594499), (7, 0.0013769456971408254), (8, -0.017495832047868677), (9, -0.0941933236781452), (10, 0.02919500319189675), (11, -0.10983020477395354), (12, 0.033823938525103965), (13, 0.015604006229404473), (14, -0.04717359122440675), (15, 0.020695236735444465), (16, -0.0032060766030042654), (17, 0.026272876031066008), (18, -0.07266857889008417), (19, 0.03370502879931733), (20, 0.10403703868680715), (21, -0.025366001969970176), (22, 0.00979721067119518), (23, -0.0020193573751470613), (24, 0.03286857844495032), (25, -0.03722655768417492), (26, -0.007551339551337631), (27, 0.07737235464758238), (28, 0.07491509738059961), (29, -0.012926436738222833), (30, 0.03709277163810119), (31, 0.045078251021347146), (32, -0.02627427234107535), (33, 0.027626877254516485), (34, 0.0013662622590916453), (35, 0.048664068376852485)

## Similarities

In [14]:
index = similarities.MatrixSimilarity(lsi[corpus])
index.save('../data/a_lib_similarity.index')

In [15]:
new_doc = 'economic planning'
new_vec = dictionary.doc2bow(new_doc.lower().split())
new_lsi_vec = lsi[new_vec]
sims = index[new_lsi_vec]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for sim in sims:
    print(sim[1], sim[0], data[sim[0]]['file'])

0.5213411 2694 bobby-whittenberg-james-economic-nihilism.muse.txt
0.46798813 1903 bob-maier-imperialism-political-economic-social-consequences-by-bob-maier-black-red-number-1-se.muse.txt
0.44537455 2202 kostas-gournas-nikos-maziotis-pola-roupa-political-letter-to-society.muse.txt
0.43067807 827 diego-abad-de-santillan-after-the-revolution.muse.txt
0.42067364 1765 alexis-passadakis-and-matthias-schmelzer-12-lines-of-flight-for-just-degrowth.muse.txt
0.41868895 1220 kevin-carson-austrian-and-marxist-theories-of-monopoly-capital.muse.txt
0.41420552 621 economics-of-freedom.muse.txt
0.4135535 199 anarcho-anarchist-economics.muse.txt
0.4111468 8 wayne-price-workers-self-directed-enterprises-a-revolutionary-program.muse.txt
0.390404 2678 murray-bookchin-state-capitalism-in-russia.muse.txt
0.38792247 348 grigori-petrovitch-maximov-my-social-credo.muse.txt
0.38554245 2451 grigori-petrovitch-maximov-programme-of-anarcho-syndicalism.muse.txt
0.37226832 805 rudolf-rocker-the-reproduction-of-daily

0.14787866 3025 mitchell-cowen-verter-undoing-patriarchy-subverting-politics-anarchism-as-a-practice-of-care.muse.txt
0.14786601 2584 george-barrett-the-anarchist-revolution.muse.txt
0.14784566 1915 larry-law-revolutionary-self-theory.muse.txt
0.1478144 1173 various-authors-three-essays-on-the-new-mandarins.muse.txt
0.14775729 1421 paul-avrich-what-is-makhaevism.muse.txt
0.14765331 1564 various-authors-illuminating-discord-an-interview-with-robert-anton-wilson.muse.txt
0.147632 822 raoul-vaneigem-basic-banalities.muse.txt
0.14760408 984 anarchist-federation-of-rio-de-janeiro-social-anarchism-and-organisation.muse.txt
0.14752007 2082 lorenzo-kom-boa-ervin-authoritarian-leftists.muse.txt
0.14740719 2664 el-libertario-editorial-collective-chevron-and-the-bolivarian-government.muse.txt
0.14740123 1271 lupus-dragonowl-the-future-of-insurrection.muse.txt
0.14737263 858 various-authors-willful-disobedience-volume-3-number-2.muse.txt
0.14729616 1799 walker-lane-the-empire-exits-iraq.muse.txt
0

0.07160898 2446 william-morris-signs-of-change.muse.txt
0.07158722 2874 alan-gilbert-anarchism-in-indonesia.muse.txt
0.07153475 825 mare-almani-the-ferocious-jaws-of-habit.muse.txt
0.071458414 2180 robert-anton-wilson-don-t-be-afraid-of-black-magick.muse.txt
0.07135646 1941 wendy-mcelroy-the-schism-between-individualist-and-communist-anarchism.muse.txt
0.07133885 699 paul-z-simons-john-brown-s-body.muse.txt
0.071306154 381 humanaesfera-against-the-strategy.muse.txt
0.071285784 1437 kuwasi-balagoon-letters-from-prison.muse.txt
0.071232185 947 aragorn-locating-an-indigenous-anarchism.muse.txt
0.071195036 1683 oscar-wilde-the-soul-of-man-under-socialism.muse.txt
0.07114654 1137 manolo-gonzalez-life-in-revolutionary-barcelona.muse.txt
0.07108256 505 max-cafard-the-surre-gion-alist-manifesto-and-other-writings.muse.txt
0.07097237 1061 chuck-munson-the-criminalization-of-women.muse.txt
0.07090442 1429 georges-bataille-the-sacred-conspiracy.muse.txt
0.070897155 2930 john-zerzan-second-best-li

In [27]:
favorite_docs = ['diego-abad-de-santillan-after-the-revolution.muse.txt', 'ilan-shalif-glimpses-into-the-year-2100-50-years-after-the-revoution.muse.txt', 'paul-buckermann-on-socialist-cybernetics.muse.txt']
for file in favorite_docs:
    doc = df[df['file'] == file]
    vec = dictionary.doc2bow(document_words_lists[doc.index[0]])
    lsi_vec = lsi[vec]
    print('--- similar to {}:'.format(file))
    sims = index[lsi_vec]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])[1:6]
    for sim in sims:
        print(sim[1], sim[0], data[sim[0]]['file'])

--- similar to diego-abad-de-santillan-after-the-revolution.muse.txt:
0.90812916 655 gaston-leval-libertarian-socialism-a-practical-outline.muse.txt
0.830497 1758 isaac-puente-libertarian-communism.muse.txt
0.79732645 2451 grigori-petrovitch-maximov-programme-of-anarcho-syndicalism.muse.txt
0.79112744 2478 james-guillaume-ideas-on-social-organization.muse.txt
0.78977716 1581 petr-kropotkin-the-conquest-of-bread.muse.txt
--- similar to ilan-shalif-glimpses-into-the-year-2100-50-years-after-the-revoution.muse.txt:
0.70028174 2841 augustin-souchy-with-the-peasants-of-aragon.muse.txt
0.6888959 2776 john-severino-the-other-gods-were-crying.muse.txt
0.6797942 1479 meir-turniansky-kibbutz-samar.muse.txt
0.6785073 383 various-authors-que-se-vayan-todos-out-with-them-all-argentina-s-popular-rebellion.muse.txt
0.66656536 2399 peter-gelderloos-anarchy-works.muse.txt
--- similar to paul-buckermann-on-socialist-cybernetics.muse.txt:
0.7834537 2999 david-graeber-of-flying-cars-and-the-declining-rate