In [1]:
import pandas as pd
# Load the Excel file
excel_file = 'datasets/culture/table.xlsx'
sheets = ['도자36', '회화47', '기와37', '가구39', '의상41']

# Initialize an empty DataFrame
df_final = pd.DataFrame(columns=['org_id', 'remap_id', 'freebase_id'])
df_list = [pd.read_excel(excel_file, sheet) for sheet in sheets]

for i, df in enumerate(df_list):
#     df = df.copy()
    df['org_id'] = sheets[i][:-2] + df['연번'].astype(str)
#     df['remap_id'] = df.index
    df['freebase_id'] = df['소장품번호']
    df_final = pd.concat([df_final, df[['org_id', 'freebase_id']]], ignore_index=True)

# Remove duplicates based on 'org_id'
df_final.drop_duplicates(subset='freebase_id', keep='first', inplace=True)
df_final['remap_id']=range(len(df_final))

# Write to a text file
df_final.to_csv('datasets/culture/item_list.txt', sep=' ', index=False)


In [2]:
df_final

Unnamed: 0,org_id,remap_id,freebase_id
0,도자1,0,덕수 6294
3,도자2,1,본관 2029
5,도자3,2,덕수 4487
7,도자4,3,덕수 6238
9,도자5,4,개성 2
...,...,...,...
224,의상37,201,증 6493
225,의상38,202,증 6492
226,의상39,203,동원 4937
227,의상40,204,증 7685


In [3]:
df_whole = pd.concat(df_list,\
                     axis=0, ignore_index=True\
                    ).drop(['유물명','연번','다른명칭', '한자명', '원천유물', '소장품설명','freebase_id'], axis=1)

In [4]:
df_whole

Unnamed: 0,대분류,소분류,국적,시대,분류1,분류2,분류3,분류4,재질1,재질2,작가,소장품번호,org_id
0,식물,꽃,한국,조선,식,음식기,저장운반,항아리,도자기,백자,,덕수 6294,도자1
1,식물,매화,한국,조선,식,음식기,저장운반,항아리,도자기,백자,,덕수 6294,도자1
2,식물,대나무,한국,조선,식,음식기,저장운반,항아리,도자기,백자,,덕수 6294,도자1
3,식물,과일,한국,조선,식,음식기,저장운반,항아리,도자기,백자,,본관 2029,도자2
4,식물,포도,한국,조선,식,음식기,저장운반,항아리,도자기,백자,,본관 2029,도자2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,상상의 동물,용,한국,광복이후,의,의류,궁중복,여자대례복,사직,견,,증 6493,의상37
225,식물,모란,한국,광복이후,의,의류,궁중복,여자대례복,사직,견,,증 6492,의상38
226,,,한국,광복이후,의,의류,의례복,여자혼례복,사직,견,,동원 4937,의상39
227,,,한국,광복이후,의,의류,평상복,여자하의류,사직,견,,증 7685,의상40


In [5]:
relation_list = ['대분류', '소분류', '국적', '시대', '분류1', '분류2', '분류3', '분류4', '재질1', '재질2', '작가']
relation_dict = {}
for i, rel in enumerate(relation_list):
    relation_dict[rel] = i

relation_dict

{'대분류': 0,
 '소분류': 1,
 '국적': 2,
 '시대': 3,
 '분류1': 4,
 '분류2': 5,
 '분류3': 6,
 '분류4': 7,
 '재질1': 8,
 '재질2': 9,
 '작가': 10}

In [6]:
# First, get the unique values for '소장품번호'
unique_소장품번호 = pd.Series(df_whole['소장품번호'].dropna().unique())

# Then, get the unique values for the other columns
other_columns = pd.Series(df_whole[relation_list].values.reshape(-1)).dropna().unique()

# Convert other_columns to a pandas Series
other_columns_series = pd.Series(other_columns)

# Concatenate the two series, ensuring '소장품번호' comes first
entity_list = pd.concat([unique_소장품번호, other_columns_series])

entity_dict = {}
for i, ent in enumerate(entity_list):
    entity_dict[ent] = i

entity_dict


{'덕수 6294': 0,
 '본관 2029': 1,
 '덕수 4487': 2,
 '덕수 6238': 3,
 '개성 2': 4,
 '덕수 2656': 5,
 '본관 12419': 6,
 '덕수 2990': 7,
 '증 7125': 8,
 '증 7126': 9,
 '증 7124': 10,
 '동원 883': 11,
 '동원 881': 12,
 '신수 8088': 13,
 '신수 28460': 14,
 '신수 4522': 15,
 '동원 330': 16,
 '부여 31933': 17,
 '신수 901': 18,
 '덕수 452': 19,
 '신안 6678': 20,
 '신안 11': 21,
 '동원 1220': 22,
 '덕수 2960': 23,
 '김천 7092': 24,
 '제주 1281': 25,
 '덕수 6231': 26,
 '본관 12226': 27,
 '덕수 2411': 28,
 '덕수 5636': 29,
 '개성 1': 30,
 '본관 10130': 31,
 '덕수 20': 32,
 '본관 10075': 33,
 '동원 1363': 34,
 '동원 351': 35,
 '동원 886': 36,
 '접수 866': 37,
 '본관 6504': 38,
 '덕수 4336': 39,
 '덕수 946': 40,
 '광주 55184': 41,
 '본관 8404': 42,
 '덕수 3305': 43,
 '본관 10970': 44,
 '덕수 1313': 45,
 '덕수 2312': 46,
 '덕수 719': 47,
 '덕수 3672': 48,
 '본관 6504 - 1': 49,
 '덕수 5539': 50,
 '덕수 3145': 51,
 '덕수 2333': 52,
 '덕수 5321': 53,
 '동원 2626': 54,
 '동원 2307': 55,
 '동원 2318': 56,
 '동원 2870': 57,
 '덕수 1155': 58,
 '광주 19795': 59,
 '광주 19796': 60,
 '광주 19791': 61,
 '광주 19793': 62,
 '광주 1979

In [7]:
relation_dict.keys()

dict_keys(['대분류', '소분류', '국적', '시대', '분류1', '분류2', '분류3', '분류4', '재질1', '재질2', '작가'])

In [8]:
list(relation_dict.keys())

['대분류', '소분류', '국적', '시대', '분류1', '분류2', '분류3', '분류4', '재질1', '재질2', '작가']

In [9]:
entity_dict[df_whole[df_whole['소장품번호'] == '민속 56977'][list(relation_dict.keys())[0]].item()]

238

In [10]:
relation_list

['대분류', '소분류', '국적', '시대', '분류1', '분류2', '분류3', '분류4', '재질1', '재질2', '작가']

In [11]:
knowledge_graph = []
for head in df_whole['소장품번호'] :
    for rel in relation_list :
        for ent in df_whole[df_whole['소장품번호'] == head][rel].dropna() :
            tail = entity_dict[ent]
            knowledge_graph.append([entity_dict[head], relation_dict[rel], tail])
        
knowledge_graph

[[0, 0, 206],
 [0, 0, 206],
 [0, 0, 206],
 [0, 1, 207],
 [0, 1, 216],
 [0, 1, 217],
 [0, 2, 208],
 [0, 2, 208],
 [0, 2, 208],
 [0, 3, 209],
 [0, 3, 209],
 [0, 3, 209],
 [0, 4, 210],
 [0, 4, 210],
 [0, 4, 210],
 [0, 5, 211],
 [0, 5, 211],
 [0, 5, 211],
 [0, 6, 212],
 [0, 6, 212],
 [0, 6, 212],
 [0, 7, 213],
 [0, 7, 213],
 [0, 7, 213],
 [0, 8, 214],
 [0, 8, 214],
 [0, 8, 214],
 [0, 9, 215],
 [0, 9, 215],
 [0, 9, 215],
 [0, 0, 206],
 [0, 0, 206],
 [0, 0, 206],
 [0, 1, 207],
 [0, 1, 216],
 [0, 1, 217],
 [0, 2, 208],
 [0, 2, 208],
 [0, 2, 208],
 [0, 3, 209],
 [0, 3, 209],
 [0, 3, 209],
 [0, 4, 210],
 [0, 4, 210],
 [0, 4, 210],
 [0, 5, 211],
 [0, 5, 211],
 [0, 5, 211],
 [0, 6, 212],
 [0, 6, 212],
 [0, 6, 212],
 [0, 7, 213],
 [0, 7, 213],
 [0, 7, 213],
 [0, 8, 214],
 [0, 8, 214],
 [0, 8, 214],
 [0, 9, 215],
 [0, 9, 215],
 [0, 9, 215],
 [0, 0, 206],
 [0, 0, 206],
 [0, 0, 206],
 [0, 1, 207],
 [0, 1, 216],
 [0, 1, 217],
 [0, 2, 208],
 [0, 2, 208],
 [0, 2, 208],
 [0, 3, 209],
 [0, 3, 209],
 [0, 3

In [12]:
index_to_entity_dict = {y: x for x, y in entity_dict.items()}
index_to_entity_dict

{0: '덕수 6294',
 1: '본관 2029',
 2: '덕수 4487',
 3: '덕수 6238',
 4: '개성 2',
 5: '덕수 2656',
 6: '본관 12419',
 7: '덕수 2990',
 8: '증 7125',
 9: '증 7126',
 10: '증 7124',
 11: '동원 883',
 12: '동원 881',
 13: '신수 8088',
 14: '신수 28460',
 15: '신수 4522',
 16: '동원 330',
 17: '부여 31933',
 18: '신수 901',
 19: '덕수 452',
 20: '신안 6678',
 21: '신안 11',
 22: '동원 1220',
 23: '덕수 2960',
 24: '김천 7092',
 25: '제주 1281',
 26: '덕수 6231',
 27: '본관 12226',
 28: '덕수 2411',
 29: '덕수 5636',
 30: '개성 1',
 31: '본관 10130',
 32: '덕수 20',
 33: '본관 10075',
 34: '동원 1363',
 35: '동원 351',
 36: '동원 886',
 37: '접수 866',
 38: '본관 6504',
 39: '덕수 4336',
 40: '덕수 946',
 41: '광주 55184',
 42: '본관 8404',
 43: '덕수 3305',
 44: '본관 10970',
 45: '덕수 1313',
 46: '덕수 2312',
 47: '덕수 719',
 48: '덕수 3672',
 49: '본관 6504 - 1',
 50: '덕수 5539',
 51: '덕수 3145',
 52: '덕수 2333',
 53: '덕수 5321',
 54: '동원 2626',
 55: '동원 2307',
 56: '동원 2318',
 57: '동원 2870',
 58: '덕수 1155',
 59: '광주 19795',
 60: '광주 19796',
 61: '광주 19791',
 62: '광주 19793',
 63: '광주 

In [13]:
df_5hop = pd.read_excel('datasets/culture/5hop.xlsx', skiprows=2)['5hop list']
df_5hop

0       기와11, 회화39, 의상38, 가구15, 도자27
1       의상35, 기와13, 가구18, 회화39, 기와21
2        도자5, 의상36, 기와17, 기와14, 가구15
3        기와20, 의상37, 회화39, 가구18, 도자5
4       기와15, 기와16, 의상36, 기와19, 회화39
                    ...             
1830     기와13, 가구21, 회화1, 기와15, 회화24
1831     회화27, 기와20, 가구21, 기와13, 회화1
1832     회화3, 가구21, 기와13, 가구18, 회화10
1833       회화2, 회화1, 회화3, 회화27, 기와15
1834     회화4, 기와21, 기와16, 회화18, 기와14
Name: 5hop list, Length: 1835, dtype: object

In [14]:
def func(item):
    matched_items = df_final[df_final['org_id']==item]['freebase_id']
    if not matched_items.empty:
        return entity_dict[matched_items.iloc[0]]
    else:
        return None

def space_comma_remove(item) :
    return item.replace(' ', '').replace(',','')

func('도자9')

8

In [15]:
interaction_data = []
for i,row in enumerate(df_5hop) :
    interaction_data.append(list(map(func, list(map(space_comma_remove, row.split(', '))) )))

interaction_data

[[99, 86, 202, 140, 28],
 [199, 101, 143, 86, 109],
 [4, 200, 105, 102, 140],
 [108, 201, 86, 143, 4],
 [103, 104, 200, 107, 86],
 [74, 102, 151, 133, 202],
 [105, 100, 106, 163, 101],
 [140, 108, 107, 202, 151],
 [103, 202, 140, 133, 101],
 [104, 163, 74, 99, 202],
 [43, 77, 80, 45, 78],
 [38, 76, 50, 58, 48],
 [42, 49, 56, 79, 70],
 [44, 43, 78, 80, 45],
 [57, 79, 73, 44, 76],
 [121, 122, 123, 117, 125],
 [117, 123, 121, 122, 91],
 [123, 121, 122, 125, 91],
 [125, 122, 91, 123, 121],
 [91, 123, 122, 117, 121],
 [177, 192, 188, 199, 170],
 [191, 185, 178, 190, 194],
 [186, 166, 184, 200, 165],
 [189, 176, 193, 197, 196],
 [192, 197, 194, 193, 178],
 [187, 184, 170, 185, 199],
 [189, 191, 176, 200, 195],
 [166, 190, 178, 188, 196],
 [177, 197, 165, 192, 186],
 [194, 193, 187, 184, 179],
 [143, 155, 147, 140, 163],
 [151, 144, 132, 137, 150],
 [136, 162, 148, 138, 153],
 [142, 149, 161, 154, 140],
 [138, 132, 143, 147, 161],
 [154, 140, 151, 137, 163],
 [136, 150, 155, 142, 144],
 [148,

In [16]:
import numpy as np
interaction_train = np.array(interaction_data)[:,:3]
interaction_test = np.array(interaction_data)[:,3:]

In [17]:
pd.DataFrame(interaction_train).to_csv('datasets/culture/train.txt', sep=' ', header=False)
pd.DataFrame(interaction_test).to_csv('datasets/culture/test.txt', sep=' ', header=False)
pd.DataFrame(knowledge_graph).to_csv('datasets/culture/kg_final.txt', sep=' ', index=False, header=False)

In [18]:
len(knowledge_graph)

2961

In [19]:
len(entity_list)

410

In [21]:
np.array(interaction_data).shape

(1835, 5)

In [22]:
for i in range(np.array(interaction_data).shape[0]) :
    for j in range(np.array(interaction_data).shape[1]) :
        if not index_to_entity_dict[np.array(interaction_data)[i,j]] in df_whole['소장품번호'].values :
            print('Data error')