In [1]:
from glob import glob
import logging, re, pdb, sqlite3, xml.sax
from time import sleep
from bz2 import BZ2File
from progressbar import ProgressBar, UnknownLength
# Configure the root logger with the default settings.
logging.basicConfig()

# Progress bar without a known total; WikiHandler advances it once per <page> element.
bar = ProgressBar(max_value=UnknownLength)

Store synonym relations in SQLite:
+ fast
+ simple
+ easy to keep synonym pairs unique and undirected: we assume that if A is a synonym of B, then B is a synonym of A, which gives better wikisynnet connectivity.

In [2]:
# Open (or create) the on-disk SQLite database that holds the synonym pairs.
conn = sqlite3.connect('fr_wiktionary_synonyms.db')
c = conn.cursor()

# One row per synonym pair. The UNIQUE constraint with ON CONFLICT IGNORE turns
# the insertion of an already-stored (source_, target) pair into a silent no-op,
# and IF NOT EXISTS makes this cell safe to re-run.
q_create_table = '''
CREATE TABLE IF NOT EXISTS synonyms (
    source_ TEXT NOT NULL,
    target TEXT NOT NULL,
    UNIQUE(source_, target) ON CONFLICT IGNORE
    );
'''
c.execute(q_create_table)

<sqlite3.Cursor at 0x7fe1a46dd810>

### Parse the XML dump and extract synonym pairs

In [8]:
class WikiHandler(xml.sax.ContentHandler):
    """SAX handler that stores French synonym pairs found in a wiki XML dump.

    For every ``<page>`` element the handler accumulates the character data of
    its ``<title>`` and ``<text>`` children. When the page ends, pages whose
    title contains ``:`` are skipped; for the others the French language
    section is isolated, the links listed under ``{{S|synonymes}}`` headings
    are extracted, and each one is stored together with the page title as an
    alphabetically sorted pair in the ``synonyms`` table.

    Args:
        cursor: DB-API cursor used for the inserts. Defaults to the
            notebook-level ``c``.
        connection: connection that is committed after each page with
            synonyms. Defaults to the notebook-level ``conn``.
        progress: object with an ``update(n)`` method, called with the number
            of pages processed so far. Defaults to the notebook-level ``bar``.

    Attributes:
        i: number of ``<page>`` elements processed so far.
    """

    # Heading template that marks a synonyms sub-section.
    SYNONYM_MARKER = '{{S|synonymes}}'

    # Raw strings: the non-raw originals relied on invalid escapes such as
    # '\|', which newer Python versions report as a SyntaxWarning.
    # Compiled once here instead of once per page.
    _FRENCH_START = re.compile(r'{{langu[ag]*e\|fr}}')
    # Any language header. Codes are not limited to two letters, so a
    # following section with a longer code also ends the French section.
    _LANGUAGE_START = re.compile(r'{{langu[ag]*e\|[^|{}]+}}')
    _SECTION_SPLIT = re.compile(r'\n+=+')
    _WIKILINK = re.compile(r'\[\[([^\[\]]+)\]\]')
    # Display text ("|label") and the "#fr" anchor are not part of the word.
    _LINK_SUFFIX = re.compile(r'\|.+|#fr')

    def __init__(self, cursor=None, connection=None, progress=None):
        super().__init__()
        # Fall back to the notebook globals so that WikiHandler() keeps working.
        self.cursor = c if cursor is None else cursor
        self.connection = conn if connection is None else connection
        self.progress = bar if progress is None else progress
        self.current_tag = ''
        self.title = ''
        self.text = ''
        self.i = 0

    def startElement(self, tag, attrs):
        """Remember which element subsequent character data belongs to."""
        self.current_tag = tag

    def characters(self, content):
        """Accumulate character data of the current <title> or <text> element."""
        if self.current_tag == 'title':
            self.title += content
        elif self.current_tag == 'text':
            self.text += content

    def endElement(self, tag):
        """Store the synonym pairs of a finished page and reset the page state."""
        # Character data between elements (indentation, newlines) must not be
        # appended to the element that has just been closed.
        self.current_tag = ''
        if tag != 'page':
            return

        title = self.title.strip()
        if ':' not in title and self.SYNONYM_MARKER in self.text:
            pairs = self._synonym_pairs(title, self.text)
            if pairs:
                self.cursor.executemany('INSERT INTO synonyms VALUES (?,?)', pairs)
                self.connection.commit()

        self.title = ''
        self.text = ''
        self.i += 1
        self.progress.update(self.i)

    @classmethod
    def _synonym_pairs(cls, title, text):
        """Return sorted (word, word) tuples for the French synonyms of ``title``."""
        french_start = cls._FRENCH_START.search(text)
        if french_start is None:
            return []
        # Keep only the French section: from its header up to the next
        # language header, if there is one.
        french = text[french_start.end():]
        next_language = cls._LANGUAGE_START.search(french)
        if next_language is not None:
            french = french[:next_language.start()]

        pairs = []
        for section in cls._SECTION_SPLIT.split(french):
            if cls.SYNONYM_MARKER not in section:
                continue
            for link in cls._WIKILINK.findall(section):
                synonym = cls._LINK_SUFFIX.sub('', link).strip()
                # Skip empty targets and links back to the page itself.
                if synonym and synonym != title:
                    # Sorting makes the pair undirected: (A, B) == (B, A).
                    pairs.append(tuple(sorted((title, synonym))))
        return pairs

In [9]:
# Locate the compressed dump in the working directory. Sorting makes the choice
# deterministic when several files match, and the explicit check gives a
# readable error instead of a bare IndexError when none does.
dump_paths = sorted(glob('*frwiktionary*'))
if not dump_paths:
    raise FileNotFoundError("no file matching '*frwiktionary*' in the working directory")
wiktionary_f = dump_paths[0]
f = BZ2File(wiktionary_f)

# SAX parser with namespace processing switched off, feeding a WikiHandler.
parser = xml.sax.make_parser()
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
handler = WikiHandler()
parser.setContentHandler(handler)

In [10]:
# Stream the dump through the SAX parser; the handler inserts synonym pairs as
# each page ends. The recorded run below took about an hour and a half.
parser.parse(f)

- 3535061 Elapsed Time: 1:30:57                                                

### Check the stored synonyms

In [12]:
def get_synonyms(w, cursor):
    """Return ``w`` followed by the words stored as its synonyms.

    Pairs are stored undirected, so ``w`` is looked up in both columns of the
    ``synonyms`` table and the other member of each matching pair is reported.

    Args:
        w: the word to look up.
        cursor: DB-API cursor on a database containing the ``synonyms`` table.

    Returns:
        A string of the form ``'<w>: <syn1>, <syn2>, ...'``; nothing follows
        the colon and space when no pair contains ``w``.
    """
    # The word is bound as a parameter rather than formatted into the SQL.
    # Besides breaking on words that contain a double quote, the interpolated
    # form let SQLite read "target" or "source_" as column names, so looking
    # up those words matched every row.
    q = '''
    SELECT source_, target FROM synonyms
    WHERE source_ = ?
          OR target = ?;
    '''
    cursor.execute(q, (w, w))
    partners = [w1 if w1 != w else w2 for w1, w2 in cursor.fetchall()]
    return f'{w}: ' + ', '.join(partners)

In [19]:
# Spot-check two common words against the freshly built table.
print(get_synonyms('chat', c), '\n\n', get_synonyms('chien', c))

chat: chat domestique, greffier, Grippeminaud, minet, mistigri, Raminagrobis, matou, causette, clavardage, tchatche, chat chastel, palatine, jeu du loup 

 chien: cabot, cagouince, chienchien, clébard, clebs, corniaud, Fido, jaspineur, meilleur ami de l’homme, molosse, pitou, roquet, toutou, viausse, sergent, serre-joint, gardien, le meilleur ami de l’homme, policier, sex-appeal


In [20]:
# Release the database connection; the cursor `c` is unusable afterwards.
conn.close()