<a href="https://colab.research.google.com/github/rumen-cholakov/SemanticWeb/blob/master/word_net_random_walk/WNRW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Shell escape: recursively list the working directory in long format,
# including hidden entries, to check which files are available to the notebook.
!ls -laR

In [0]:
import pandas as pd
import numpy as np

import random
import copy

random.seed(a=42)

from functools import reduce
from itertools import groupby
from collections import defaultdict as dd

In [0]:
def load_graph_data(path:str):
  """Read a space-separated edge list and return it as a DataFrame.

  The file is parsed into four positional columns named ``u``, ``v``, ``e``
  and ``q``.  The first two characters of every value in ``u``, ``v`` and
  ``e`` are dropped (presumably a two-character field tag -- TODO confirm
  against the data file) and column ``q`` is discarded.

  Args:
    path: Location of the edge-list file.

  Returns:
    A DataFrame with the columns ``u``, ``v`` and ``e``.
  """
  edges = pd.read_csv(path, sep=' ', names=['u', 'v', 'e', 'q'])

  # Strip the two leading characters from each of the kept columns.
  for column in ('u', 'v', 'e'):
    edges[column] = edges[column].map(lambda field: field[2:])

  return edges.drop(columns=['q'])

In [0]:
def load_dict_data(path:str):
  """Build a lookup table from the codes listed in a lexicon file.

  Each non-blank line is expected to hold a word followed by one or more
  whitespace-separated ``code:tag`` tokens.  Every token adds the entry
  ``code -> (tag, word)``; when a code occurs on several lines the entry
  from the last such line wins.

  Lines are split on any run of whitespace, so trailing or repeated spaces
  do not produce an empty token (an empty token has no ``:`` and would
  raise ``IndexError``), and blank lines are skipped.

  Args:
    path: Location of the lexicon file.

  Returns:
    A dict mapping each code to a ``(tag, word)`` tuple, where ``tag`` is
    the text between the first and second ``:`` of the token (or up to the
    end of the token when it has a single ``:``).

  Raises:
    IndexError: If a token contains no ``:`` separator.
  """
  code_to_word = {}

  with open(path, 'r') as f:
    for line in f:
      # str.split() without arguments drops the newline and collapses
      # consecutive whitespace, so no empty tokens reach the loop below.
      line_parts = line.split()
      if not line_parts:
        continue

      word, codes = line_parts[0], line_parts[1:]

      for code in codes:
        code_parts = code.split(sep=':')
        code_to_word[code_parts[0]] = (code_parts[1], word)

  return code_to_word

In [0]:
def build_graph(graph_data: pd.DataFrame):
  """Turn an edge table into an adjacency mapping.

  Args:
    graph_data: DataFrame with at least the columns ``u``, ``e`` and ``v``;
      each row contributes one ``(e, v)`` entry under the key ``u``.

  Returns:
    A ``defaultdict(list)`` mapping every ``u`` value to the list of
    ``(e, v)`` tuples found for it, in row order.  Because it is a
    defaultdict, looking up an absent key inserts and returns an empty list.
  """
  graph_dict = dd(list)

  # Walk the three columns in lockstep instead of using iterrows(), which
  # constructs a fresh Series object for every single row.
  columns = zip(graph_data['u'], graph_data['e'], graph_data['v'])
  for source, relation, target in columns:
    graph_dict[source].append((relation, target))

  return graph_dict

In [0]:
# Input files: the relation edge list and the word/code lexicon.
GRAPH_PATH = './WNRel/wn30d.txt'
LEXICON_PATH = './wnet30_v202.lex'

graph_data = load_graph_data(GRAPH_PATH)
code_dict = load_dict_data(LEXICON_PATH)

# Adjacency mapping used by the random-walk helpers defined below.
graph = build_graph(graph_data)

In [0]:
def get_random_start_node(graph: dd) -> str:
  """Pick one key of ``graph`` at random using the ``random`` module.

  Args:
    graph: Mapping whose keys are the candidate start nodes.

  Returns:
    One of the keys of ``graph``.

  Raises:
    ValueError: If ``graph`` has no keys.
  """
  candidates = list(graph.keys())

  return candidates[random.randrange(len(candidates))]

def get_random_next_node(current: tuple, graph: dd):
  """Choose a random outgoing edge of the node in ``current``.

  Edges whose relation is one of ``30hyp``, ``30ant``, ``30cls``, ``30der``
  or ``30sim`` are kept as candidates only when ``random.randint(0, 100)``
  comes out below 5; all other edges are always candidates.  One candidate
  is then drawn at random.

  Args:
    current: ``(relation, node)`` tuple; only the node (index 1) is used.
    graph: Mapping from node to a list of ``(relation, node)`` edges.  It is
      indexed with ``graph[node]``, so a defaultdict gains an empty entry
      for a node it did not contain.

  Returns:
    The chosen ``(relation, node)`` edge, or ``None`` when no candidate
    remains.
  """
  rarely_followed = ('30hyp', '30ant', '30cls', '30der', '30sim')

  # The random draw only happens for edges with a rarely-followed relation
  # (short-circuit ``or``), exactly one draw per such edge.
  candidates = [
      edge for edge in graph[current[1]]
      if edge[0] not in rarely_followed or random.randint(0, 100) < 5
  ]

  if not candidates:
    return None

  return candidates[random.randint(0, len(candidates) - 1)]

def build_random_sequence(graph: dd):
  """Generate one random walk through ``graph``.

  A target length is drawn with ``random.randint(2, 12)``, a start node is
  picked with ``get_random_start_node`` and the walk is extended with
  ``get_random_next_node`` until the target length is reached or the walk
  hits a dead end.  A node whose successors have been queried once is never
  expanded again, so arriving at it a second time ends the walk.

  This produces the same sequences and consumes the same random numbers as
  the earlier implementation, which shallow-copied the whole graph on every
  call and popped visited nodes from the copy.  In that version a failed
  step removed the current node before the retry, so every retry saw an
  empty successor list and the retry budget could never succeed; the walk
  now simply stops at the first dead end.
  NOTE(review): if real retries at the same node were intended, the
  selection logic needs to change -- confirm the desired behaviour.

  Args:
    graph: Mapping from node to a list of ``(relation, node)`` edges.  It is
      not modified.

  Returns:
    A list of ``(relation, node)`` tuples.  The first element is
    ``('', start_node)``; the list may be shorter than the drawn target
    length when the walk ends early.
  """
  # Draw the length before the start node to keep the random stream in the
  # same order as before.
  target_length = random.randint(2, 12)

  start = get_random_start_node(graph)
  current = ('', start)
  sequence = [current]
  visited = set()

  while len(sequence) < target_length:
    node = current[1]

    # An already expanded node, or one with no entry in the graph, has no
    # usable successors.  Checking membership first also avoids inserting
    # empty entries into a defaultdict through the lookup in the helper.
    if node in visited or node not in graph:
      break

    next_node = get_random_next_node(current, graph)
    visited.add(node)

    if next_node is None:
      break

    sequence.append(next_node)
    current = next_node

  return sequence

def filter_sequence(sequence: list):
  """Accept or reject a generated sequence.

  A sequence is rejected (``None`` is returned) when any of the following
  holds:

  * its length is outside the range 2..12;
  * the same relation appears more than three times in a row;
  * the same node code appears twice in a row;
  * a single node code makes up more than half of the sequence;
  * ``len(sequence) * random.random()`` is below 0.8 (a random rejection
    that becomes less likely for longer sequences).

  The relation and code columns are materialised as lists.  Previously the
  codes were a one-shot ``map`` iterator that the counting loop exhausted,
  so the "twice in a row" check always saw an empty input and never fired.

  Args:
    sequence: List of ``(relation, code)`` tuples.

  Returns:
    ``sequence`` itself when it passes every check, otherwise ``None``.
  """
  seq_len = len(sequence)
  rels = [step[0] for step in sequence]
  words = [step[1] for step in sequence]

  cnt = dd(int)
  for code in words:
    cnt[code] += 1

  # Drawn unconditionally so that every call consumes exactly one random
  # number, regardless of the outcome of the deterministic checks.
  randomly_rejected = seq_len * random.random() < 0.8

  bad_length = not (2 <= seq_len <= 12)
  long_relation_run = any(len(list(run)) > 3 for _, run in groupby(rels))
  repeated_neighbour = any(len(list(run)) > 1 for _, run in groupby(words))
  dominated = any(count / seq_len > 0.5 for count in cnt.values())

  if (bad_length or long_relation_run or repeated_neighbour or
      dominated or randomly_rejected):
    return None

  return sequence

def decode_sequence(sequence: list):
  """Render a sequence of ``(relation, code)`` tuples as one line of text.

  Each code is looked up in the module-level ``code_dict`` and replaced by
  the second element of its entry.  The pieces are joined with single
  spaces, underscores become spaces, and ``str.capitalize`` is applied
  (first character upper-cased, the rest lower-cased) before a newline is
  appended.

  Args:
    sequence: List of tuples whose element at index 1 is a key of
      ``code_dict``.

  Returns:
    The decoded line, ending in ``'\\n'``.

  Raises:
    KeyError: If a code is not present in ``code_dict``.
  """
  lemmas = map(lambda step: code_dict[step[1]][1], sequence)
  text = ' '.join(lemmas)
  text = text.replace('_', ' ')

  return text.capitalize() + '\n'

def create_corpus(graph: dd, lenght: int = 1_000):
  """Collect decoded random sequences until the corpus is large enough.

  Sequences are produced by ``build_random_sequence``, screened by
  ``filter_sequence`` and turned into text by ``decode_sequence``; rejected
  sequences are discarded and generation continues.

  Args:
    graph: Graph handed to ``build_random_sequence``.
    lenght: Number of lines the corpus must contain before returning.

  Returns:
    A list of ``lenght`` decoded lines (empty when ``lenght`` is not
    positive).
  """
  corpus = []

  while len(corpus) < lenght:
    filtered = filter_sequence(build_random_sequence(graph))
    if filtered is None:
      continue

    decoded = decode_sequence(filtered)
    if decoded:
      corpus.append(decoded)

  return corpus

def store_corpus(path: str, corpus: list):
  """Write the corpus to ``path``, replacing any existing file.

  The entries are written back to back with no separator added, so each
  entry has to carry its own line ending.

  Args:
    path: Destination file.
    corpus: Strings to write, in order.
  """
  with open(path, 'w') as out_file:
    for entry in corpus:
      out_file.write(entry)


In [9]:
# Generate 2000 decoded sequences and write them to ./corpus.txt; every
# entry already ends with a newline, so the file holds one sequence per line.
corpus = create_corpus(graph, 2000)
store_corpus('./corpus.txt', corpus)

# NOTE(review): this prints the entire file into the cell output.
!cat ./corpus.txt

Salient line of battle war machine militaristic
Tuscan tuscany
Pierrot fictitious character
Largeness large size small dinky
Pilot fly
Save purchase buy purchase
Rockfish sebastodes
Pineapple pineapple plant genus ananas
Hebephrenic schizophrenia hebephrenic
Signaling signalize
Powderiness small-grained
Levy en masse recruit
Marquis de condorcet philosopher philosophical philosophy philosopher philosophical
Ratiocination syllogism syllogistic syllogism
Levant geographical region
Schopenhauer philosopher eclecticist
Huig de groot legal expert
Bellingham washington usa north american nation north america continent continental
Unit unitize split up divisible
Stretch widen extensive largeness large
Thistle family compositae order campanulales
Dialectician dialectic dialectical dialectic dialectical
Numbly numb
Suggestion suggest suggestible
Pass judgment evaluative
Assiduously sedulous
Tumbling tumble
Major premiss syllogism syllogize
William lloyd garrison eman