# Prepare data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/original_data.csv', delimiter='?', header=None)
df.columns = ['sign', 'reading', 'rank']

In [3]:
df.shape

(56008, 3)

In [4]:
df = df[df['rank'] <= 10000]

In [5]:
df.shape

(10000, 3)

## Split row with two writings

In [6]:
df[['sign', 'sign_2']] = df['sign'].str.partition(';',)[[0, 2]]

In [7]:
df[df['sign_2'] != ''].head(3)

Unnamed: 0,sign,reading,rank,sign_2
696,年轻,nian2'qing1,697,年青
3252,当作,dang1'zuo4,3253,当做
5053,看作,kan4'zuo4,5054,看做


In [8]:
df[['sign', 'reading', 'rank']].head(3)

Unnamed: 0,sign,reading,rank
0,的,de,1
1,是,shi4,2
2,在,zai4,3


In [9]:
df2 = df[(df['sign_2'] != '')][['sign_2', 'reading', 'rank']]
df2['sign'] = df2['sign_2']
df2.shape

(9, 4)

In [10]:
df = pd.concat([df[['sign', 'reading', 'rank']], df2[['sign', 'reading', 'rank']]], ignore_index=True)

In [11]:
df['rank'] = 1. / df['rank']

In [12]:
df.head()

Unnamed: 0,sign,reading,rank
0,的,de,1.0
1,是,shi4,0.5
2,在,zai4,0.333333
3,一,yi1,0.25
4,不,bu4,0.2


In [13]:
df.shape

(10009, 3)

In [14]:
weights = pd.Series(df['rank'].values, index=df['sign']).to_dict()

In [15]:
edges_dict = dict()
for word in weights.keys():
    for sign in word:
        if sign not in edges_dict:
            edges_dict[sign] = list()
        edges_dict[sign].append(word)

In [16]:
import itertools

edges = list()
for sign, word_list in edges_dict.items():
    for source, target in itertools.combinations_with_replacement(word_list, 2):
        edges.append((source, target, (weights[source] + weights[target]) / 2))

In [17]:
edges[0:3]

[('的', '的', 1.0),
 ('的', '有的', 0.5016556291390728),
 ('的', '目的', 0.500814332247557)]

In [18]:
edges_df = pd.DataFrame(edges)
edges_df.columns = ['Source', 'Target', 'WEIGHT']

In [19]:
edges_df_gb = edges_df.groupby(['Source', 'Target'], as_index=False).sum()

In [20]:
edges_df_gb.head()

Unnamed: 0,Source,Target,WEIGHT
0,一,一,0.25
1,一,一一,0.250264
2,一,一下,0.12587
3,一,一世,0.125085
4,一,一举,0.125086


In [21]:
edges_df_gb.to_csv("data/edge_list.csv", sep=';',index=False)