# Creating a term-term co-occurence matrix and word vectors
## Using the german wikipedia as source

Extract the useful data from the wiki dumb using wikiextractor:

https://github.com/attardi/wikiextractor

```bash
python WikiExtractor.py path/to/dump --json 
```

WikiExtractor splits the dump into various folder with 100 files each. This allows easy sequential reading regarding the amount of data.

Using --json is just a personal preference because parsing json is – imo – far easier than XML.

In [12]:
import os
import ijson
import json
from collections import defaultdict
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.corpus import stopwords
import pickle 

In [13]:
window_size = 2
path_to_corpus = "/home/mn/MEGA/Master Digital Humanities/WS1819/Word Embedding/wiki_corpus"

#### Read previously generated file cointaining the 10k most frequent words (excluding stop words) in the corpus

Created by tokenizing and using Collections.counter

In [14]:
with open("/home/mn/MEGA/Master Digital Humanities/WS1819/Word Embedding/most_frequent_words.pkl", "rb") as p:
    mfw = pickle.load(p)

#### Create "pseudo" sparse matrix by creating nested defaultdict (initialized with 0)
defaultdicts generate "empty" values on the fly so theses indices don't need memory/space unless requested

In [28]:
sparse_matrix = defaultdict(lambda: defaultdict(lambda: 0))

In [29]:
tokenizer = RegexpTokenizer(r'\w+')
stopWords = set(stopwords.words('german'))

In [38]:
for root, dirs, files in os.walk(path_to_corpus):
    for file_ in files:
        with open(os.path.join(root, file_), 'r') as f:
            data = "[" + f.read()[:-2] + "]"
            json_data = json.loads(data)
            for row in json_data:
                for sentence in sent_tokenize(row["text"]):
                    tokens = tokenizer.tokenize(sentence)
                    for token in tokens:
                        if token in mfw:
                            #word window
                            for i in [x for x in range(-window_size, window_size + 1) if x != 0]:
                                if tokens.index(token) + i >= 0:
                                    try:
                                        if tokens[tokens.index(token) + i] not in stopWords:
                                            sparse_matrix[token][tokens[tokens.index(token) + i]] += 1
                                    except IndexError:
                                        pass
                                    continue

In [42]:
print(sparse_matrix["Amerika"])

defaultdict(<function <lambda>.<locals>.<lambda> at 0x7f68afabc488>, {'1939': 4, 'nach': 88, 'zurück': 10, 'Staaten': 212, 'von': 126, 'verbundene': 2, 'St': 2, 'Auch': 8, 'warf': 2, 'er': 10, 'Heidelberg': 2, 'das': 24, 'Haus': 12, 'Deutsch': 2, 'Europa': 58, 'und': 137, 'etwa': 4, 'für': 10, 'Neuwelt': 2, 'Species': 2, 'ursprünglich': 2, 'in': 160, 'sonst': 1, 'Nach': 6, 'Greenbergs': 2, 'Theorie': 2, '1987': 2, 'Afrika': 20, 'kamen': 6, 'hin': 1, 'vielfach': 2, 'nachgespielt': 2, 'Dreivierteltakt': 2, 'nicht': 9, 'Kirche': 16, 'erinnern': 2, 'am': 4, 'bis': 2, 'verschifft': 2, 'wurden': 8, 'Kontinenten': 2, 'Asien': 14, 'so': 3, 'wo': 5, 'die': 31, 'Kolonien': 10, 'waren': 3, 'dem': 11, 'Vereinigten': 2, 'millionenfach': 2, 'verbreitet': 6, 'Koch': 2, 'ins': 1, 'der': 34, 'damaligen': 2, 'sind': 6, 'Jahre': 4, 'aus': 27, 'Allianz': 4, 'ALBA': 4, 'im': 13, 'Jahr': 2, 'Siedlung': 4, 'zu': 16, 'konzentrieren': 2, 'Demokratie': 2, 'De': 2, 'la': 2, 'fuhr': 2, 'impfte': 2, 'Sammlern': 2,