In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pickle
import wasp
from tqdm.autonotebook import tqdm
from itertools import chain
from IPython.display import display, Markdown

  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
# Load the pickled ConceptNet data and remember every token it is keyed by.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
concept_net_path = wasp.get_data_path('sem_graph', 'concept_net.pkl')
with open(concept_net_path, "rb") as fin:
    cn = pickle.load(fin)
cn_tokens = list(cn)

In [4]:
# Keep every token except those whose first stored entry carries an "error"
# with status 404; entries without an error (or without a status) are kept.
cn_valid_tokens = [
    tok
    for tok in cn_tokens
    if cn[tok][0].get("error", {}).get("status", -1) != 404
]

In [5]:
# Total number of stored entries across all valid tokens.
sum(len(cn[tok]) for tok in cn_valid_tokens)

5936

In [6]:
# Report how many tokens remain after the 404 filter.
display(Markdown(f"### There are {len(cn_valid_tokens)} nodes found in ConceptNet"))

### There are 4055 nodes found in ConceptNet

In [7]:
# Peek at the first valid token as a quick sanity check.
cn_valid_tokens[0]

'鞋子'

In [8]:
def retrieve_edge(edge_x):
    """Reduce one edge record to a ``(relation label, end label, weight)`` triple.

    Only edges whose ``rel`` entry has ``@type == "Relation"`` and whose ``end``
    entry has ``language == "zh"`` are accepted.

    Args:
        edge_x: mapping with a ``"rel"`` mapping (keys ``"@type"``, ``"label"``),
            an ``"end"`` mapping (key ``"label"``, optional ``"language"``) and
            an optional ``"weight"``.

    Returns:
        ``(rel_label, end_label, weight)`` for an accepted edge, with ``weight``
        defaulting to 0 when absent; ``None`` for a rejected edge.

    Raises:
        KeyError: if ``"rel"``, ``"@type"`` or ``"end"`` is missing, or if an
            accepted edge lacks one of the ``"label"`` keys.
    """
    rel_node = edge_x["rel"]
    rel_kind = rel_node["@type"]
    end_node = edge_x["end"]
    end_lang = end_node.get("language")

    if rel_kind == "Relation" and end_lang == "zh":
        return (rel_node["label"], end_node["label"], edge_x.get("weight", 0))
    return None

In [9]:
def _iter_strong_relations(entries):
    """Yield the accepted relation triples of one token whose weight exceeds 1."""
    for entry in entries:
        for edge in entry["edges"]:
            triple = retrieve_edge(edge)
            # retrieve_edge returns None for rejected edges; the weight is
            # the third element of an accepted triple.
            if triple and triple[2] > 1:
                yield triple


# Map each valid token to its de-duplicated list of strong relation triples.
rel_data = {
    tok: list(set(_iter_strong_relations(cn[tok])))
    for tok in cn_valid_tokens
}

In [10]:
# Count the retained relation triples over all tokens and report the total.
edge_count = sum(map(len, rel_data.values()))
display(Markdown(f"### There are {edge_count} relations retrieved from ConceptNet"))

### There are 38713 relations retrieved from ConceptNet

In [11]:
import networkx  as nx

In [12]:
# Undirected simple graph: edge direction is not kept and each pair of nodes
# holds at most one edge.
G = nx.Graph()

In [13]:
# Populate the graph: one node per token, one edge per retained relation.
for start_x, edges in rel_data.items():
    # Added explicitly so tokens without any retained relation still show up
    # as isolated nodes.
    G.add_node(start_x)
    for rel_label, end_x, _weight in edges:
        # add_edge creates the end node when it is missing. On an undirected
        # Graph a repeated node pair updates the existing edge, so only the
        # last relation label seen for a pair is kept in "rel".
        G.add_edge(start_x, end_x, rel=rel_label)

In [14]:
# Size (node count) of the largest connected component of the graph.
n_largest_compo = len(max(nx.connected_components(G), key=len))

In [15]:
# Summarise the graph: size, number of connected components ("islands"),
# how many nodes are isolated, and the size of the largest component.
display(Markdown("\n".join([
    "The Graph from conceptnet has ",
    f"* {len(G.nodes)} nodes, {len(G.edges)} edges ",
    f"* {nx.number_connected_components(G)} islands ",
    f"* {len(list(nx.isolates(G)))} of which has single nodes, ",
    f"* The largest island has {n_largest_compo} nodes",
])))

The Graph from conceptnet has 
* 11492 nodes, 24207 edges 
* 2647 islands 
* 2439 of which has single nodes, 
* The largest island has 8793 nodes

In [16]:
# Persist the ConceptNet-only graph (stage 1) so it can be reloaded from disk
# before the FastText edges are added.
graph_1_path = wasp.get_data_path("sem_graph", "graph_stage_1.pkl")
with open(graph_1_path, "wb") as fout:
    pickle.dump(G, fout)

# Query FastText

In [17]:
import pickle

In [18]:
# Load the pickled FastText embedding object used for the similarity queries.
# presumably a gensim KeyedVectors instance, judging by the file name — TODO confirm
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(wasp.get_resource_path("", "gensim_kv_fasttext_tc.pkl"), "rb") as fin:
    fasttext = pickle.load(fin)

In [19]:
# Spot-check the similarity API on one word pair (醫生 "doctor" vs. 梨子 "pear").
fasttext.similarity("醫生", "梨子")

0.8844398365954318

In [20]:
from itertools import combinations

# Restrict the ConceptNet tokens to those the embedding model knows, so every
# pair below can be scored without a per-pair vocabulary check.
vocab = fasttext.vocab
cn_vocab = [tok for tok in cn.keys() if tok in vocab]
print(f"cn_vocab: {len(cn_vocab)}")

# Number of unordered pairs, n * (n - 1) / 2. Floor division keeps it an int
# (the product of two consecutive integers is always even), so the progress
# bar gets a whole-number total.
n_comb = len(cn_vocab) * (len(cn_vocab) - 1) // 2

# Pairwise FastText similarity keyed by the (tok_x, tok_y) pair.
assoc = {}
for tok_x, tok_y in tqdm(combinations(cn_vocab, 2), total=n_comb):
    assoc[(tok_x, tok_y)] = fasttext.similarity(tok_x, tok_y)

cn_vocab: 4408


HBox(children=(FloatProgress(value=0.0, max=9713028.0), HTML(value='')))




In [21]:
# Persist the pairwise similarity table so it can be reloaded without recomputing.
assoc_path = wasp.get_data_path("sem_graph", "cn_fasttext_assoc.pkl")
with open(assoc_path, "wb") as fout:
    pickle.dump(assoc, fout)

In [22]:
# Collect the similarity scores into a list for the distribution analysis.
assoc_values = list(assoc.values())

In [None]:
import numpy as np

# Upper quantiles (80th, 90th, 95th percentile) of the pairwise similarity
# scores, computed once and reused for the threshold.
assoc_quantiles = np.quantile(assoc_values, [.80, .90, .95])

# Similarity cut-off: the 80th percentile of all pairwise scores.
assoc_crit = assoc_quantiles[0]

# Last expression, so the cell displays the three quantiles.
assoc_quantiles

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Kernel density estimate of the pairwise similarity scores, evaluated on
# 100 evenly spaced points between 0 and 1.
density = gaussian_kde(assoc_values)
xs = np.linspace(0, 1, 100)

# Explicit figure/axes with labels so the plot stands on its own; plt.show()
# keeps the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(xs, density(xs))
ax.set(
    title="Distribution of pairwise FastText similarity",
    xlabel="similarity",
    ylabel="density",
)
plt.show()

## Add FastText associations to the graph

In [None]:
# Reload the stage-1 graph from disk, so the FastText edges are added to the
# saved ConceptNet-only graph rather than to whatever G currently holds.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
graph_1_path = wasp.get_data_path("sem_graph", "graph_stage_1.pkl")
with open(graph_1_path, "rb") as graph_file:
    G = pickle.load(graph_file)

# Reload the saved pairwise similarity table.
assoc_path = wasp.get_data_path("sem_graph", "cn_fasttext_assoc.pkl")
with open(assoc_path, "rb") as assoc_file:
    assoc = pickle.load(assoc_file)

In [None]:
# Link every token pair whose similarity is strictly above the cut-off.
# NOTE(review): these edges are tagged with the attribute "label", while the
# ConceptNet edges use "rel" — confirm that downstream code expects both keys.
for (tok_a, tok_b), score in tqdm(assoc.items()):
    if not score > assoc_crit:
        continue
    G.add_edge(tok_a, tok_b, label="fasttext")

In [None]:
# Measure the largest connected component on the current graph. The value in
# `n_largest_compo` was computed before the FastText edges were added, so
# reusing it here would report a stale size.
n_largest_compo_2 = max(len(compo) for compo in nx.connected_components(G))

# Summarise the graph: size, number of connected components ("islands"),
# how many nodes are isolated, and the size of the largest component.
display(Markdown(f"The Graph from conceptnet has \n"
        f"* {len(G.nodes)} nodes, {len(G.edges)} edges \n"
        f"* {nx.number_connected_components(G)} islands \n"
        f"* {len(list(nx.isolates(G)))} of which has single nodes, \n"
        f"* The largest island has {n_largest_compo_2} nodes"))

In [None]:
# Persist the graph with the FastText edges added (stage 2).
graph_2_path = wasp.get_data_path("sem_graph", "graph_stage_2.pkl")
with open(graph_2_path, "wb") as fout:
    pickle.dump(G, fout)