# 3_1_Calc_Edit_Distance.ipynb
Calculates the edit distance between Paths.
### input
- 2_Paths_from_KEGG_Pathway_datafile/Paths_from_KEGG_Pathway.csv : A file associating each Path with its Target

### output
- 3_Calc_Edit_Distance/output/Calc_Edit_Distance.csv.gz : A file containing the pairwise similarity between Paths (1 − normalized edit distance)
- 3_Calc_Edit_Distance/output/KEGG_ID_index.csv : A file mapping each Path to its Path ID and its hsa_map ID(s)

In [1]:
import ast
import itertools

import numpy as np
import pandas as pd

In [2]:
# Load the path table produced by the previous step; the first CSV column
# holds the row labels, so it is used as the index.
df_t = pd.read_csv(
    '../2_Paths_from_KEGG_Pathway_datafile/Paths_from_KEGG_Pathway.csv',
    header=0,
    index_col=0,
)

In [3]:
# Preview the loaded table (columns: HSA, hsa_map, order1, KEGG_id).
df_t

Unnamed: 0,HSA,hsa_map,order1,KEGG_id
0,3065,hsa05034,"[231, 340]",['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...
1,3066,hsa05034,"[231, 340]",['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...
2,55869,hsa05034,"[231, 340]",['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...
3,10013,hsa05034,"[231, 340]",['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...
4,9759,hsa05034,"[231, 340]",['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...
...,...,...,...,...
109253,9453,hsa00900,"[130, 125, 150, 134, 126, 173]","['hsa:9453', 'path:map00909', 'hsa:2339 hsa:23..."
109254,9453,hsa00900,"[130, 125, 150, 173]","['hsa:9453', 'path:map00909', 'hsa:2339 hsa:23..."
109255,9453,hsa00900,"[130, 125, 150, 126, 173]","['hsa:9453', 'path:map00909', 'hsa:2339 hsa:23..."
109256,9453,hsa00900,"[130, 125, 173]","['hsa:9453', 'path:map00909', 'path:hsa00100']"


In [4]:
# Build one row per distinct path (KEGG_id string) together with the sorted
# list of hsa_map IDs it occurs in. The final reset_index() materialises the
# row position as an 'index' column, which later cells use as the path ID.
df_order = (
    df_t[['hsa_map', 'KEGG_id']]
    .drop_duplicates()
    .sort_values('hsa_map')
    .groupby(['KEGG_id'])['hsa_map']
    .apply(list)
    .reset_index()
    .sort_values('KEGG_id')
    .reset_index()
)

In [5]:
# Preview the per-path table (columns: index, KEGG_id, hsa_map).
df_order

Unnamed: 0,index,KEGG_id,hsa_map
0,0,['hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:51...,[hsa05034]
1,1,"['hsa:10013 hsa:5071', 'hsa:23435', 'hsa:10213...",[hsa05014]
2,2,['hsa:100137049 hsa:11145 hsa:123745 hsa:15105...,[hsa00565]
3,3,['hsa:100137049 hsa:11145 hsa:123745 hsa:15105...,[hsa00565]
4,4,['hsa:100137049 hsa:11145 hsa:123745 hsa:15105...,[hsa00565]
...,...,...,...
67476,67476,"['hsa:9453', 'path:map00909', 'hsa:2339 hsa:23...",[hsa00900]
67477,67477,"['hsa:9453', 'path:map00909', 'hsa:2339 hsa:23...",[hsa00900]
67478,67478,"['hsa:9453', 'path:map00909', 'hsa:2339 hsa:23...",[hsa00900]
67479,67479,"['hsa:9453', 'path:map00909', 'path:hsa00100']",[hsa00900]


In [6]:
# Store each hsa_map list as its string form (e.g. "['hsa05034']").
df_order = df_order.assign(hsa_map=df_order['hsa_map'].astype(str))

In [7]:
# Persist the path <-> path ID <-> hsa_map lookup table.
df_order.to_csv('output/KEGG_ID_index.csv', encoding='utf-8')

# Keep a (KEGG_id string -> path ID) lookup. It is taken while KEGG_id is
# still a string, because the pairwise loop merges on the string form.
df_order1 = df_order[['KEGG_id', 'index']].copy().drop_duplicates()

In [8]:
# Parse each stringified path back into a Python list of node strings.
# ast.literal_eval accepts only Python literals, so unlike eval() it cannot
# execute code that might be embedded in the CSV contents.
df_order['KEGG_id'] = df_order['KEGG_id'].apply(ast.literal_eval)

In [9]:
# Preview df_order again: KEGG_id now holds lists, hsa_map holds strings.
df_order

Unnamed: 0,index,KEGG_id,hsa_map
0,0,[hsa:10013 hsa:10014 hsa:3065 hsa:3066 hsa:515...,['hsa05034']
1,1,"[hsa:10013 hsa:5071, hsa:23435, hsa:10213 hsa:...",['hsa05014']
2,2,[hsa:100137049 hsa:11145 hsa:123745 hsa:151056...,['hsa00565']
3,3,[hsa:100137049 hsa:11145 hsa:123745 hsa:151056...,['hsa00565']
4,4,[hsa:100137049 hsa:11145 hsa:123745 hsa:151056...,['hsa00565']
...,...,...,...
67476,67476,"[hsa:9453, path:map00909, hsa:2339 hsa:2342, h...",['hsa00900']
67477,67477,"[hsa:9453, path:map00909, hsa:2339 hsa:2342, p...",['hsa00900']
67478,67478,"[hsa:9453, path:map00909, hsa:2339 hsa:2342, p...",['hsa00900']
67479,67479,"[hsa:9453, path:map00909, path:hsa00100]",['hsa00900']


In [10]:
def levenshtein_distance(x):
    """Return the normalised Levenshtein distance between two sequences.

    Parameters
    ----------
    x : sequence of length 2
        ``x[0]`` and ``x[1]`` are the two sequences to compare (for example
        two lists of path nodes, or two strings). Elements are compared
        with ``==``.

    Returns
    -------
    float
        Edit distance (insertions, deletions and substitutions each cost 1)
        divided by the length of the longer sequence, so the result lies in
        ``[0, 1]``. Two empty sequences are identical and yield ``0.0``.
    """
    a, b = x[0], x[1]
    longest = max(len(a), len(b))
    if longest == 0:
        # Avoid 0 / 0: two empty sequences need no edits.
        return 0.0

    # Only the previous DP row is needed to compute the next one, so keep two
    # plain Python lists instead of a full (len(a)+1) x (len(b)+1) matrix.
    previous = list(range(len(b) + 1))
    for i in range(1, len(a) + 1):
        current = [i]
        item_a = a[i - 1]
        for j in range(1, len(b) + 1):
            cost = 0 if item_a == b[j - 1] else 1
            current.append(
                min(
                    previous[j] + 1,         # deletion
                    current[j - 1] + 1,      # insertion
                    previous[j - 1] + cost,  # match / substitution
                )
            )
        previous = current
    return previous[-1] / longest

In [11]:
def leven_without_1(series):
    """Pass a pair of paths through only if they share at least one element.

    Parameters
    ----------
    series : sequence of length 2
        ``series[0]`` and ``series[1]`` are the two paths; their elements
        must be hashable.

    Returns
    -------
    The unchanged ``series`` when the two paths have an element in common,
    otherwise the string ``'NaN'``, which callers use as a drop marker.
    """
    first = series[0]
    second = series[1]
    common = set(first) & set(second)
    if common:
        return series
    return 'NaN'

In [11]:
# Compare every path with all paths that precede it in df_order and keep the
# pairs whose similarity (1 - normalised Levenshtein distance) is non-zero.
# Results are written to disk in chunks each time down_index reaches a
# multiple of 1000; rows computed after the last such write stay in df_all
# and are saved by a later cell.
df_all = pd.DataFrame()
all_paths = list(df_order['KEGG_id'])  # built once instead of per iteration
chunk_frames = []  # per-path result frames of the current chunk
for down_index, path in zip(df_order['index'][1:], df_order['KEGG_id'][1:]):
    # Pair the current path with every earlier path.
    pairs = list(itertools.product([path], all_paths[:down_index]))
    df_i = pd.DataFrame(pairs)
    df_i['li'] = pairs

    # Drop pairs without any shared element before the expensive distance
    # computation; leven_without_1 marks them with the string 'NaN'.
    df_i['li'] = df_i['li'].apply(leven_without_1)
    # .copy() so the column assignment below acts on an independent frame
    # rather than on a filtered slice (avoids SettingWithCopyWarning).
    df_i = df_i[df_i['li'] != 'NaN'].copy()
    df_i['Levenshtein_ratio'] = df_i['li'].apply(levenshtein_distance)

    # A ratio of exactly 1 means similarity 0; those pairs are not stored.
    scored = df_i[df_i['Levenshtein_ratio'] != 1][[0, 1, 'Levenshtein_ratio']].copy()
    scored['down_index_new_0'] = down_index

    # Recover the path ID of the earlier path by joining on its string form.
    scored[1] = scored[1].apply(str)
    scored = (
        pd.merge(scored, df_order1, left_on=1, right_on='KEGG_id')
        .rename(columns={'index': 'down_index_new_1'})
        .drop(columns=[0, 1, 'KEGG_id'])
    )
    scored['value'] = 1 - scored['Levenshtein_ratio']
    scored = scored.drop(columns='Levenshtein_ratio')
    chunk_frames.append(scored)

    if down_index % 1000 == 0:
        # Concatenate once per chunk instead of growing a DataFrame on every
        # iteration, then start a fresh chunk.
        pd.concat(chunk_frames).to_csv('ex_pathway_file/pathway_'+str(down_index)+'.csv.gz',encoding = 'utf-8', compression='gzip')
        chunk_frames = []

# Rows computed since the last chunk write (empty if the loop ended exactly
# on a chunk boundary).
df_all = pd.concat(chunk_frames) if chunk_frames else pd.DataFrame()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\nakamura\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-04a0327f1e3e>", line 11, in <module>
    df_i['Levenshtein_ratio'] = df_i['li'].apply(levenshtein_distance)
  File "C:\Users\nakamura\anaconda3\lib\site-packages\pandas\core\series.py", line 4138, in apply
    mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "pandas\_libs\lib.pyx", line 2467, in pandas._libs.lib.map_infer
  File "<ipython-input-10-30bd13f80b1b>", line 15, in levenshtein_distance
    m[i,j] = min(m[i - 1,j] + 1, m[i, j - 1] + 1, m[i - 1,j - 1] + x)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\nakamura\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2061, in showtraceback
    stb = value._render_traceback_()
AttributeErr

TypeError: object of type 'NoneType' has no len()

In [None]:
# Inspect the rows accumulated since the last chunk file was written.
df_all

In [None]:
# Save the remaining rows; the file is named after the last down_index
# reached by the loop.
df_all.to_csv(
    'ex_pathway_file/pathway_'+str(down_index)+'.csv.gz',
    encoding='utf-8',
    compression='gzip',
)

In [2]:
# Reassemble the chunk files written by the pairwise loop into one table.
# All chunks are read first and concatenated once, which avoids re-copying
# the growing frame on every iteration.
# NOTE(review): the final chunk index is hard-coded as 67480, but the loop's
# last down_index is the last value of df_order['index'] (67479 in the
# displayed df_order) - confirm the file name matches what was written.
chunk_ids = [i * 1000 for i in range(1, 68)] + [67480]
df_a = pd.concat(
    [
        pd.read_csv(
            'ex_pathway_file/pathway_' + str(chunk_id) + '.csv.gz',
            header=0,
            index_col=0,
        )
        for chunk_id in chunk_ids
    ]
).reset_index(drop=True)

  mask |= (ar1 == a)


In [3]:
# Write the combined pairwise table as the final gzip-compressed output.
df_a.to_csv(
    'output/Calc_Edit_Distance.csv.gz',
    encoding='utf-8',
    compression='gzip',
)