# Dictionary generator
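This notebook builds a large dictionary that pairs English words with their word vectors and with images of the corresponding Chinese characters.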

In [1]:
import re
import datetime

import numpy as np
import matplotlib.pyplot as plt

from PIL import Image, ImageDraw, ImageFont

%matplotlib inline

# Data

## English word-Chinese character dictionary
The English-Chinese dictionary is taken from https://www.mdbg.net/chinese/dictionary?page=cedict

Format of a dictionary line:

```
Traditional Simplified [pin1 yin1] /English equivalent 1/equivalent 2/

```

Example line:
```
雀鳥 雀鸟 [que4 niao3] /bird/
```

Build a dictionary that maps one English word to one Chinese character.

In [None]:
# Build a mapping from a single English word to a single simplified Chinese
# character, read from the CEDICT file. Each data line has the form
#   Traditional Simplified [pin1 yin1] /English equivalent 1/equivalent 2/
# When the same English word appears for several characters, the entry that
# comes last in the file wins.

# Exactly one character from the U+4E00..U+9FFF range
single_char_re = re.compile(r'[\u4e00-\u9fff]\Z')
# Exactly one word (no spaces or punctuation)
single_word_re = re.compile(r'\w+\Z')

word2hier = dict()
with open('data/cedict_ts.u8', 'r', encoding='utf-8') as cedict_file:
    for line in cedict_file:
        if line.startswith('#'):
            # Skip comments
            continue

        fields = line.split(' ', 2)
        if len(fields) < 3:
            # Skip blank or malformed lines instead of failing on unpacking
            continue
        ch_trad, ch_simp, rest = fields

        # We use simplified characters
        ch = ch_simp
        if not single_char_re.match(ch):
            # Skip if ch is not exactly one character
            continue

        pinyin, separator, eng_part = rest.partition('] /')
        if not separator:
            # No "[pinyin] /definitions/" section on this line
            continue

        # The last piece after the final '/' is just the line ending
        eng_eqs = eng_part.split('/')[:-1]

        # Select only equivalents consisting of a single word
        for eng_eq in eng_eqs:
            if single_word_re.match(eng_eq):
                word2hier[eng_eq] = ch

In [None]:
# Number of English words that were mapped to a character
len(word2hier)

## Generate images from Unicode strings

In [None]:
def draw_char(ch, size=64, bw=False, font_path='data/CODE2000.TTF'):
    """Render text as a square white-on-black bitmap.

    Parameters
    ----------
    ch : str
        Text to draw (a single Chinese character in this notebook).
    size : int
        Side length of the image in pixels; also used as the font size.
    bw : bool
        If True, return a binary mask (1.0 where the pixel is non-black,
        0.0 elsewhere). If False, return the pixel intensities as floats.
    font_path : str
        Path of the TrueType font used for rendering.

    Returns
    -------
    numpy.ndarray
        A float64 array of shape (size, size).
    """
    img = Image.new("RGB", (size, size), "black")

    font = ImageFont.truetype(font_path, size, encoding="unic")
    # The text is shifted up by 1/8 of the image size; presumably this
    # compensates for the font's top padding -- TODO confirm.
    ImageDraw.Draw(img).text((0, -size * 0.125), ch, fill="white", font=font)

    # Only the first (red) channel is used to build the result.
    red = np.asarray(img)[:, :, 0]

    if bw:
        # Vectorized threshold: any non-zero intensity becomes 1.0
        return (red > 0).astype(np.float64)
    return red.astype(np.float64)

In [None]:
# Render the character mapped to "moon" and display it in grayscale
moon_bitmap = draw_char(word2hier["moon"])
gray_cmap = plt.get_cmap('gray')
plt.imshow(moon_bitmap, cmap=gray_cmap, interpolation="nearest")
plt.show()

## Reading word2vec
The word-vector set is taken from http://nlp.stanford.edu/data/glove.42B.300d.zip

It contains pre-trained 300-dimensional vectors trained on the Common Crawl (42B) dataset.

In [None]:
# Read the GloVe text file (one "word v1 v2 ... vN" entry per line) and keep
# the vectors only for the words that are present in word2hier.
word2vec = {}

lcount = 0  # lines read so far
wcount = 0  # words found in word2hier so far

with open('./data/glove.42B.300d.txt', 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        # partition() never raises, even for a line without a space
        word, _sep, vect = line.partition(" ")

        if word in word2hier:
            wcount += 1
            # split() without arguments drops the line ending and tolerates
            # repeated spaces, so no number is truncated when the last line
            # has no trailing newline
            word2vec[word] = [float(n) for n in vect.split()]

        lcount += 1
        if lcount % 10000 == 0:
            print("{} lines read, {} words found".format(lcount, wcount))

print("{} lines read, {} words found".format(lcount, wcount))

## Compose word to vector and character dictionary

In [None]:
# Side length (in pixels) of the rendered character images
char_size = 64

# For every word with a known vector, store the vector together with the
# flattened bitmap of the word's Chinese character.
word2vec_char = {}
wcount = 0

for word, embedding in word2vec.items():
    glyph = draw_char(word2hier[word], size=char_size)
    word2vec_char[word] = {"vect": embedding, "char": glyph.ravel()}

    wcount += 1
    if wcount % 1000 == 0:
        print("{} words processed".format(wcount))

print("{} words processed".format(wcount))

# Write everything

In [None]:
# Write one line per word: "<word>,<space-separated vector>,<space-separated pixels>"

# %M is minutes; using %m in that slot would repeat the month
timestamp = datetime.datetime.now().strftime("%d_%m_%y_%H_%M_%S")

# Take the vector length from any stored vector instead of relying on one
# particular word being present in the dictionary
vector_dim = len(next(iter(word2vec.values()), ()))

filename = "./data/word2_{}_vector2_{}x{}hier_{}.txt".format(vector_dim,
                                                            char_size, char_size,
                                                            timestamp)
print("writing to file: {}".format(filename))

lcount = 0
# An explicit UTF-8 encoding keeps non-ASCII words writable on every platform;
# the context manager closes the file even if the loop fails
with open(filename, 'w', encoding='utf-8') as fld:
    for key, entry in word2vec_char.items():
        vector = " ".join(str(n) for n in entry["vect"])
        image = " ".join(str(n) for n in entry["char"])
        try:
            fld.write("{},{},{}\n".format(key, vector, image))
        except UnicodeEncodeError:
            # Best effort: report the word that cannot be encoded and go on
            print(key)

        lcount += 1
        if lcount % 1000 == 0:
            print("{} lines wrote".format(lcount))

print("{} lines wrote".format(lcount))

# An attempt to generate more data

## English collocation-Chinese character dictionary

In [None]:
# Build a mapping from an English word or collocation (words separated by
# single whitespace characters) to a single simplified Chinese character,
# read from the CEDICT file. Each data line has the form
#   Traditional Simplified [pin1 yin1] /English equivalent 1/equivalent 2/
# When the same English text appears for several characters, the entry that
# comes last in the file wins.

# Exactly one character from the U+4E00..U+9FFF range
one_char_re = re.compile(r'[\u4e00-\u9fff]\Z')
# One or more words, each pair separated by one whitespace character
collocation_re = re.compile(r'(\w+\s)*\w+\Z')

coll2hier = dict()
with open('data/cedict_ts.u8', 'r', encoding='utf-8') as cedict_file:
    for line in cedict_file:
        if line.startswith('#'):
            # Skip comments
            continue

        fields = line.split(' ', 2)
        if len(fields) < 3:
            # Skip blank or malformed lines instead of failing on unpacking
            continue
        ch_trad, ch_simp, rest = fields

        # We use simplified characters
        ch = ch_simp
        if not one_char_re.match(ch):
            # Skip if ch is not exactly one character
            continue

        pinyin, separator, eng_part = rest.partition('] /')
        if not separator:
            # No "[pinyin] /definitions/" section on this line
            continue

        # The last piece after the final '/' is just the line ending
        eng_eqs = eng_part.split('/')[:-1]

        # Select equivalents made only of words (single words or collocations)
        for eng_eq in eng_eqs:
            if collocation_re.match(eng_eq):
                coll2hier[eng_eq] = ch

In [None]:
# Number of English words/collocations that were mapped to a character
len(coll2hier)

In [None]:
# Render the character mapped to "light a fire" and display it in grayscale
fire_bitmap = draw_char(coll2hier["light a fire"])
gray_cmap = plt.get_cmap('gray')
plt.imshow(fire_bitmap, cmap=gray_cmap, interpolation="nearest")
plt.show()