## Creating synonym and not synonym dataset using wordnet

First let's create a dataset of pairs of words that are synonyms or not synonyms using wordnet's synsets.

*Note*: "not synonyms" is not equivalent to antonyms; it simply means that, in a pair `(w1, w2)`, the word `w2` is any word that is not in the set of synonyms of `w1`, i.e. we are simply performing **negative sampling**.

In [None]:
from itertools import combinations
import random

import networkx as nx 
from nltk.corpus import wordnet as wn
import matplotlib.pyplot as plt


In [None]:
def get_synsets(part_of_speeches=None, verbose=True):
    """
    Collect WordNet synsets grouped by part of speech.

    `part_of_speeches` maps a readable name (used as the key of the
    result) to the POS tag handed to `wn.all_synsets`. When it is `None`
    the verb, noun and adjective tags are used.

    Returns a dictionary from each name to the list of all synsets found
    for its tag. With `verbose` set, the number of synsets per name is
    printed as each one is loaded.
    """
    tag_by_name = (
        {'verb': 'v', 'noun': 'n', 'adjective': 'a'}
        if part_of_speeches is None
        else part_of_speeches
    )

    synsets_by_name = {}
    for label, tag in tag_by_name.items():
        found = list(wn.all_synsets(tag))
        synsets_by_name[label] = found
        if verbose:
            print(f"found {len(found)} synsets for {label}")
    return synsets_by_name

In [None]:
# Load every synset for the default parts of speech (verb, noun, adjective).
pos_synsets = get_synsets()

In [None]:
def is_single_word(word):
    """
    Tell whether a lemma name is a single word, i.e. it contains
    neither of the multi-word separators `_` and `-`.
    """
    return not any(separator in word for separator in ('_', '-'))

# One undirected synonym graph per part of speech: nodes are single-word
# lemma names and an edge joins two lemmas that occur in the same synset.
syn_graphs = dict()
# Per part of speech, the set of words that ended up as nodes of its graph.
all_words = dict()
for pos, val in pos_synsets.items():
    syn_graphs[pos] = nx.Graph()
    for synset in val:
        # Drop multi-word lemmas (those containing `_` or `-`).
        lemma_names = [x for x in synset.lemma_names() if is_single_word(x)]
        # A synset only yields synonym pairs if at least two lemmas remain.
        if len(lemma_names) > 1:
            # Every unordered pair of lemmas in the synset becomes an edge.
            syn_graphs[pos].add_edges_from(combinations(lemma_names, 2))
    # Nodes are only ever created through edges above, so a word without
    # any single-word synonym does not appear in `all_words`.
    all_words[pos] = set(syn_graphs[pos].nodes)
    print(f"Found {len(syn_graphs[pos].edges)} synonym pairs and {len(all_words[pos])} unique words in {pos}")
    

In [None]:
# Sanity check: words directly linked to the verb 'change' in the verb graph.
print(list(syn_graphs['verb'].neighbors('change')))

In [None]:
# Sanity check: words directly linked to the noun 'ocean' in the noun graph.
print(list(syn_graphs['noun'].neighbors('ocean')))

In [None]:
# Sanity check: words directly linked to the adjective 'large' in the adjective graph.
print(list(syn_graphs['adjective'].neighbors('large')))

In [None]:
def get_verb_subgraph(subset=('change', 'buy')):
    """
    Return the part of the verb synonym graph around the given words.

    The result is the subgraph of `syn_graphs['verb']` induced by every
    word in `subset` together with that word's direct neighbours, so it
    also contains any edges that exist between those neighbours.

    `subset` is any iterable of verbs. Each one is looked up with
    `neighbors`, so it must already be a node of the verb graph
    (networkx raises for an unknown node). The default is an immutable
    tuple so that it cannot be altered between calls.
    """
    verb_graph = syn_graphs['verb']
    nodes = []
    for word in subset:
        nodes.append(word)
        # `neighbors` yields an iterator; `extend` consumes it directly.
        nodes.extend(verb_graph.neighbors(word))
    return verb_graph.subgraph(nodes)

In [None]:
# Subgraph made of three verbs plus each verb's direct synonyms.
subgraph = get_verb_subgraph(subset=['sell', 'buy', 'change'])
# Node positions from networkx's spring layout. Note that `pos` rebinds the
# name used earlier as the part-of-speech loop variable.
pos=nx.spring_layout(subgraph, iterations=150, k=1.5)
# Draw nodes and edges first, then the word labels at the same positions.
nx.draw(subgraph, pos=pos)
nx.draw_networkx_labels(subgraph, pos=pos, font_size=10)
plt.show()
# To also write the figure to disk, enable the line below; it typically
# needs to run before plt.show(), otherwise the saved image may be blank.
# plt.savefig('graph.png')