In [1]:
import time

### Load Data

In [2]:
# Parse the source
with open('dataset/THUOCL_chengyu.txt', 'r', encoding='utf-8') as source:
    idioms = [i for i in map(lambda x: x.split()[0], source.readlines()) if len(i) <= 4]

nodes = set([i[0] for i in idioms] + [i[-1] for i in idioms])
edges = [{"src": i[0], "dst": i[-1], "idiom": i} for i in idioms]

#### Create Graph

In [3]:
from py2neo import Graph
graph = Graph("bolt://localhost:7687", password="12345678")

cql = "CREATE \n" + ",".join("(%s : Node {char: \"%s\"})\n"%(n, n) for n in nodes)
for e in edges:
    cql += ",(%s)-[:idiom {idiom: '%s'}]->(%s)\n"%(e['src'], e["idiom"], e['dst'])

start = time.time()
graph.run("match (n) detach delete n")
graph.run(cql)
print("created graph, time elapsed: %.2fs"%(time.time() - start))

created graph, time elapsed: 9.58s


___
### Query Graph
##### Find next word

In [4]:
next_word = lambda word: graph.run("match ()-[:idiom {idiom: '%s'}]->()-[e:idiom]->() return e.idiom"%word)

df_next = next_word("坚定不移").to_data_frame()

print("found %i idioms"%len(df_next))
df_next.head()

found 3 idioms


Unnamed: 0,e.idiom
0,移山填海
1,移风易俗
2,移花接木


##### Find path

In [5]:
def find_path(idiom, length, limit):
    cql = "match ()-[e:idiom {idiom: '%s'}]->()"%idiom
    path = "".join(["-[e%i:idiom]->()"%(i + 1) for i in range(length)])
    ret = ", ".join(["e%i.idiom as i%i"%(i + 1, i + 1) for i in range(length)])
    return graph.run(cql + path + " return e.idiom as i0, " + ret + " limit %i"%limit).to_data_frame()

In [6]:
start = time.time()
df_paths = find_path(idiom="坚定不移", length=5, limit=100000)

print("found %i paths, time elapsed: %.2fs"%(len(df_paths), time.time() - start))
df_paths.head(10)

found 6833 paths, time elapsed: 0.30s


Unnamed: 0,i0,i1,i2,i3,i4,i5
0,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别具一格
1,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别出新意
2,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别有洞天
3,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别开生面
4,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别无二致
5,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别有滋味
6,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别有天地
7,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别无长物
8,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别来无恙
9,坚定不移,移山填海,海阔天高,高耸入云,云泥之别,别有用心


In [7]:
start = time.time()
df_paths = find_path(idiom="坚定不移", length=10, limit=100000)

print("found %i paths, time elapsed: %.2fs"%(len(df_paths), time.time() - start))
df_paths.to_csv("output/坚定不移.csv", index=False)
df_paths.head(10)

found 100000 paths, time elapsed: 9.78s


Unnamed: 0,i0,i1,i10,i2,i3,i4,i5,i6,i7,i8,i9
0,坚定不移,移山填海,接连不断,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏心悦目,目不暇接
1,坚定不移,移山填海,接踵而来,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏心悦目,目不暇接
2,坚定不移,移山填海,接踵而至,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏心悦目,目不暇接
3,坚定不移,移山填海,接二连三,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏心悦目,目不暇接
4,坚定不移,移山填海,接风洗尘,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏心悦目,目不暇接
5,坚定不移,移山填海,问鼎中原,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏罚分明,明知故问
6,坚定不移,移山填海,问长问短,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏罚分明,明知故问
7,坚定不移,移山填海,问寒问暖,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏罚分明,明知故问
8,坚定不移,移山填海,问心无愧,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏罚分明,明知故问
9,坚定不移,移山填海,话里有话,海阔天高,高耸入云,云泥之别,别具一格,格杀勿论,论功行赏,赏罚分明,明白如话
