In [1]:
import pandas as pd
import os
import datetime
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
from tqdm import tqdm
import math
from functools import reduce


In [2]:
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df3 = pd.read_pickle('uspg_cpc_weight_2001-2023.pkl') # load

In [3]:
df3

Unnamed: 0,pubnum,symbol,cpcsc,cpcmg,counts,weight,PUBLICATION_NUMBER,YEAR
0,6167569,A42B1/006,A42B,A42B1,7,0.142857,6167569,2001.0
1,6167569,A42B1/0182,A42B,A42B1,7,0.142857,6167569,2001.0
2,6167569,A42B1/22,A42B,A42B1,7,0.142857,6167569,2001.0
3,6167569,A45C9/00,A45C,A45C9,7,0.142857,6167569,2001.0
4,6167569,A45F3/00,A45F,A45F3,7,0.142857,6167569,2001.0
...,...,...,...,...,...,...,...,...
40466869,11596090,Y10T29/53174,Y10T,Y10T29,7,0.142857,11596090,2023.0
40466870,11596091,H05K13/0882,H05K,H05K13,4,0.250000,11596091,2023.0
40466871,11596091,H05K13/0061,H05K,H05K13,4,0.250000,11596091,2023.0
40466872,11596091,H05K13/0853,H05K,H05K13,4,0.250000,11596091,2023.0


In [4]:
# Number of distinct values in the CPC symbol, mg and sc columns
nsymbols, nmgs, mscs = (
    len(df3[column].unique()) for column in ('symbol', 'cpcmg', 'cpcsc')
)
print(f'no. of symbols / mg / sc : {nsymbols} / {nmgs} / {mscs}')

no. of symbols / mg / sc : 235406 / 10467 / 672


In [5]:
# Set the sample range used for the analysis
## ten years of data for the cpcsc-level calculation: rows with 2012 < YEAR < 2023
df_full = df3.loc[df3['YEAR'].gt(2012) & df3['YEAR'].lt(2023)]

In [6]:
df_full

Unnamed: 0,pubnum,symbol,cpcsc,cpcmg,counts,weight,PUBLICATION_NUMBER,YEAR
12623641,8341762,F41H1/02,F41H,F41H1,3,0.333333,8341762,2013.0
12623642,8341762,A41D1/005,A41D,A41D1,3,0.333333,8341762,2013.0
12623643,8341762,A41D31/245,A41D,A41D31,3,0.333333,8341762,2013.0
12623644,8341763,A41D19/01588,A41D,A41D19,3,0.333333,8341763,2013.0
12623645,8341763,A63B71/148,A63B,A63B71,3,0.333333,8341763,2013.0
...,...,...,...,...,...,...,...,...
39938890,11540432,H01L21/68757,H01L,H01L21,11,0.090909,11540432,2022.0
39938891,11540432,H05F1/00,H05F,H05F1,11,0.090909,11540432,2022.0
39938892,11540432,H05F1/02,H05F,H05F1,11,0.090909,11540432,2022.0
39938893,11540433,H05K9/0088,H05K,H05K9,2,0.500000,11540433,2022.0


In [7]:
## split dataset
### Analysis periods: one (start, end) tuple per year, with start == end
# The years are sorted so that consecutive entries are consecutive years no
# matter how the rows of df_full happen to be ordered; the entropy-change cell
# further down compares the graph of period i with the graph of period i+1.
year_ranges = [(year, year) for year in sorted(df_full['YEAR'].unique())]

In [8]:
# Keep only the data needed for the analysis
def filter_and_group_data(df, year_tuple):
    """Sum `weight` per (pubnum, cpcsc) pair for the rows of one period.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns YEAR, pubnum, cpcsc and weight.
    year_tuple : sequence
        ``year_tuple[0]`` is the first year and ``year_tuple[1]`` the last
        year of the period; both ends are inclusive.

    Returns
    -------
    DataFrame
        Columns pubnum, cpcsc, weight with a fresh default index; one row per
        (pubnum, cpcsc) pair found in the period.
    """
    first_year, last_year = year_tuple[0], year_tuple[1]
    in_period = (df['YEAR'] >= first_year) & (df['YEAR'] <= last_year)
    summed = (
        df.loc[in_period, ['pubnum', 'cpcsc', 'weight']]
        .groupby(['pubnum', 'cpcsc'])
        .sum()
    )
    # Turn the (pubnum, cpcsc) group keys back into ordinary columns.
    return summed.reset_index()

In [9]:
# Aggregate df_full separately for every period listed in year_ranges.
df_in_periods = [
    filter_and_group_data(df_full, year_tuple) for year_tuple in tqdm(year_ranges)
]

100%|██████████| 10/10 [00:15<00:00,  1.59s/it]


In [10]:
# Report the number of distinct cpcsc values found in each period.
for period, period_df in zip(year_ranges, df_in_periods):
    print(f"{period} no of nodes : {len(period_df['cpcsc'].unique())}")

(2013.0, 2013.0) no of nodes : 658
(2014.0, 2014.0) no of nodes : 660
(2015.0, 2015.0) no of nodes : 659
(2016.0, 2016.0) no of nodes : 660
(2017.0, 2017.0) no of nodes : 660
(2018.0, 2018.0) no of nodes : 662
(2019.0, 2019.0) no of nodes : 659
(2020.0, 2020.0) no of nodes : 664
(2021.0, 2021.0) no of nodes : 661
(2022.0, 2022.0) no of nodes : 658


In [14]:
# from CPC - publication matrix to CPC co-allocation network
## Convert 2-mode network to 1-mode network accounting for weights
def bipartite_to_unipartite_weighted(B, nodes_set):
    """Project a weighted 2-mode graph onto one of its node sets.

    Two distinct nodes u and v of ``nodes_set`` are linked when they have at
    least one common neighbor in ``B``. The edge weight is the sum, over the
    common neighbors n, of ``B[u][n]['weight'] * B[v][n]['weight']``.

    Each potential common neighbor is visited once and the products of the
    pairs attached to it are accumulated. This avoids intersecting two
    neighbor sets for every ordered pair of ``nodes_set``. Edges and weights
    are unchanged apart from the order in which the floating-point products
    are added up.

    Parameters
    ----------
    B : networkx graph
        Graph whose edges carry a ``'weight'`` attribute.
    nodes_set : iterable of nodes
        Nodes to keep. Every member becomes a node of the result; members
        without any partner remain isolated.

    Returns
    -------
    networkx.Graph
        Undirected graph on ``nodes_set`` with a ``'weight'`` on every edge.

    Raises
    ------
    KeyError
        If an edge that contributes to a product has no ``'weight'``.
    """
    from itertools import combinations

    kept = set(nodes_set)

    G = nx.Graph()
    G.add_nodes_from(nodes_set)

    # In a directed B, neighbors() means successors, so the nodes that share
    # a given neighbor are that neighbor's predecessors.
    attached_to = B.predecessors if B.is_directed() else B.neighbors

    # frozenset keys make (u, v) and (v, u) the same pair.
    pair_weights = {}
    for shared in B.nodes:
        members = [node for node in attached_to(shared) if node in kept]
        for u, v in combinations(members, 2):
            pair = frozenset((u, v))
            product = B[u][shared]['weight'] * B[v][shared]['weight']
            pair_weights[pair] = pair_weights.get(pair, 0) + product

    for pair, weight in pair_weights.items():
        u, v = pair
        G.add_edge(u, v, weight=weight)
    return G

In [15]:
len(df_in_periods)

10

In [16]:
# Build graph_list: one weighted 1-mode graph per period DataFrame in df_in_periods

graph_list = []

for df_period in tqdm(df_in_periods):

    # 2-mode graph: one edge per (cpcsc, pubnum) row, carrying that row's weight.
    # The edges are added in bulk from plain tuples instead of row by row.
    B1 = nx.Graph()
    B1.add_weighted_edges_from(
        df_period[['cpcsc', 'pubnum', 'weight']].itertuples(index=False, name=None)
    )

    # The two partitions of the bipartite graph
    partition_A = set(df_period['cpcsc'])
    # partition_B is not used by the projection below; it is still assigned in
    # case another cell reads it.
    partition_B = set(df_period['pubnum'])

    # Create 1-mode networks
    G11 = bipartite_to_unipartite_weighted(B1, partition_A)

    print(f' G11 the number_of_nodes : {G11.number_of_nodes()}')
    print(f' G11 the number_of_edges : {G11.number_of_edges()}')
    
    graph_list.append(G11)

# File export below is disabled for now (230829)
# # Export the graph to a GEXF file
# nx.write_gexf(G11, f"cpcmg_co-alloc_{year_start}-{year_end}.gexf")
# print(f"Exported a Gephi input file for the period: {year_start}-{year_end} ")

# # Export the graph to VOSviewer-compatible files
# export_to_vosviewer(G11, f"cpcmg_co-alloc_{year_start}-{year_end}_network.txt", f"cpcmg_co-alloc_{year_start}-{year_end}_map.txt")
# print(f"Exported a Vosviewer input file for the period: {year_start}-{year_end} ")

 10%|█         | 1/10 [01:35<14:19, 95.53s/it]

 G11 the number_of_nodes : 658
 G11 the number_of_edges : 30059


 20%|██        | 2/10 [03:21<13:33, 101.72s/it]

 G11 the number_of_nodes : 660
 G11 the number_of_edges : 34199


 30%|███       | 3/10 [05:13<12:24, 106.31s/it]

 G11 the number_of_nodes : 659
 G11 the number_of_edges : 38338


 40%|████      | 4/10 [07:11<11:04, 110.79s/it]

 G11 the number_of_nodes : 660
 G11 the number_of_edges : 41264


 50%|█████     | 5/10 [09:23<09:52, 118.47s/it]

 G11 the number_of_nodes : 660
 G11 the number_of_edges : 43607


 60%|██████    | 6/10 [11:32<08:08, 122.03s/it]

 G11 the number_of_nodes : 662
 G11 the number_of_edges : 44680


 70%|███████   | 7/10 [14:05<06:36, 132.27s/it]

 G11 the number_of_nodes : 659
 G11 the number_of_edges : 49075


 80%|████████  | 8/10 [16:39<04:38, 139.09s/it]

 G11 the number_of_nodes : 664
 G11 the number_of_edges : 49378


 90%|█████████ | 9/10 [18:58<02:19, 139.04s/it]

 G11 the number_of_nodes : 661
 G11 the number_of_edges : 47281


100%|██████████| 10/10 [21:10<00:00, 127.03s/it]

 G11 the number_of_nodes : 658
 G11 the number_of_edges : 45957





In [17]:
graph_list[0]

<networkx.classes.graph.Graph at 0x2e770bc16d0>

In [None]:
# Shannon entropy 계산 함수 (weight 미고려)

#def shannon_entropy(graph):
#    nodes = graph.nodes()
#    num_nodes = len(nodes)
#    max_edges = num_nodes * (num_nodes - 1) / 2
#    actual_edges = len(graph.edges())
#    p = actual_edges / max_edges
#    if p == 0 or p == 1:
#        return 0
#    entropy = - (p * math.log2(p) + (1 - p) * math.log2(1 - p))
#    return entropy

In [18]:
# Shannon entropy calculation (revised, takes edge weights into account)

def shannon_entropy(G):
    """Shannon entropy (base 2) of the weighted-degree distribution of G.

    Every node's weighted degree is divided by the sum of all weighted
    degrees, and the entropy of the resulting distribution is returned.
    Nodes with a zero share contribute nothing. They are left out before the
    logarithm is taken, so no ``0 * log2(0)`` NaN (and no RuntimeWarning) is
    produced.

    Parameters
    ----------
    G : graph-like object
        Must provide ``G.degree(weight='weight')`` yielding
        ``(node, degree)`` pairs.

    Returns
    -------
    float
        The entropy in bits; 0.0 when G has no nodes or the degrees sum to 0.
    """
    degrees = np.array([d for _, d in G.degree(weight='weight')], dtype=float)
    total = degrees.sum()
    if total == 0:
        # No nodes, or no weighted edges at all: there is no distribution.
        return 0.0
    degrees_prob = degrees / total
    positive = degrees_prob[degrees_prob > 0]
    return -np.sum(positive * np.log2(positive))

In [19]:
# Find, between two graphs, how much each node contributes to the entropy change

def entropy_change_cal(G1, G2):
    """Score each node by how its entropy contribution differs between G1 and G2.

    Copies of both graphs are first padded with the nodes they lack, so the
    two copies share one node set. A node's contribution in a graph is that
    graph's ``shannon_entropy`` minus the entropy obtained after removing the
    node from a copy of the graph.

    Parameters
    ----------
    G1, G2 : networkx graphs
        The earlier and the later graph. Neither is modified.

    Returns
    -------
    list of (node, float)
        One tuple per node, in the node order of the padded copy of G1; the
        value is the node's contribution in G2 minus its contribution in G1.
    """
    # Align the node sets: each copy receives the nodes only the other graph has.
    padded_first = G1.copy()
    padded_first.add_nodes_from(G2.nodes - G1.nodes)
    padded_second = G2.copy()
    padded_second.add_nodes_from(G1.nodes - G2.nodes)

    def entropy_without(graph, node):
        # Entropy of a copy of `graph` from which `node` has been removed.
        pruned = graph.copy()
        pruned.remove_node(node)
        return shannon_entropy(pruned)

    # Entropy of each padded graph before any node is taken out.
    baseline_first = shannon_entropy(padded_first)
    baseline_second = shannon_entropy(padded_second)

    entropy_changes = []
    for node in tqdm(padded_first.nodes):
        if node not in padded_second.nodes:
            continue
        contribution_first = baseline_first - entropy_without(padded_first, node)
        contribution_second = baseline_second - entropy_without(padded_second, node)
        entropy_changes.append((node, contribution_second - contribution_first))

    return entropy_changes

In [20]:
# New calculation 230907
# Entropy-change scores for every pair of neighboring graphs in graph_list.
entropy_changes_list = [
    entropy_change_cal(earlier, later)
    for earlier, later in zip(graph_list, graph_list[1:])
]

  entropy = degrees_prob * np.log2(degrees_prob)
  entropy = degrees_prob * np.log2(degrees_prob)
100%|██████████| 663/663 [03:16<00:00,  3.38it/s]
100%|██████████| 665/665 [03:49<00:00,  2.90it/s]
100%|██████████| 666/666 [04:08<00:00,  2.68it/s]
100%|██████████| 665/665 [04:26<00:00,  2.50it/s]
100%|██████████| 667/667 [04:33<00:00,  2.44it/s]
100%|██████████| 667/667 [04:54<00:00,  2.27it/s]
100%|██████████| 667/667 [05:08<00:00,  2.16it/s]
100%|██████████| 667/667 [05:00<00:00,  2.22it/s]
100%|██████████| 664/664 [04:46<00:00,  2.31it/s]


In [18]:
#entropy_changes_list = []
#for i in range(len(graph_list)-1):
#    entropy_changes = entropy_change_cal(graph_list[i], graph_list[i+1])
#    entropy_changes_list.append(entropy_changes)

100%|██████████| 663/663 [03:12<00:00,  3.45it/s]
100%|██████████| 665/665 [03:42<00:00,  2.99it/s]
100%|██████████| 666/666 [03:58<00:00,  2.79it/s]
100%|██████████| 665/665 [04:13<00:00,  2.62it/s]
100%|██████████| 667/667 [04:22<00:00,  2.54it/s]
100%|██████████| 667/667 [04:40<00:00,  2.38it/s]
100%|██████████| 667/667 [04:49<00:00,  2.30it/s]
100%|██████████| 667/667 [04:46<00:00,  2.32it/s]
100%|██████████| 664/664 [04:36<00:00,  2.40it/s]


In [21]:
# One two-column DataFrame (cpcsc, ep_change_<position>) per entry of entropy_changes_list.
entropy_changes_df_list = [
    pd.DataFrame(changes, columns=['cpcsc', f'ep_change_{position}'])
    for position, changes in enumerate(entropy_changes_list)
]

In [22]:
## Join all per-comparison frames on cpcsc, left to right (merge's default join type)
entropy_changes_df = reduce(
    lambda merged, frame: merged.merge(frame, on='cpcsc'),
    entropy_changes_df_list,
)

In [23]:
# Use cpcsc as the row index. The guard keeps the cell safe to run more than
# once: after the first run 'cpcsc' is no longer a column, and an unguarded
# set_index would raise KeyError.
if entropy_changes_df.index.name != 'cpcsc':
    entropy_changes_df = entropy_changes_df.set_index('cpcsc')

In [24]:
# For each column, find the 10 largest values and take the index labels of those rows
top_index = entropy_changes_df.apply(lambda x: x.nlargest(10).index)

# Build a DataFrame from them again, with the same column names
top_index_df = pd.DataFrame(top_index)


In [25]:
# 새 계산 230907
top_index_df

Unnamed: 0,ep_change_0,ep_change_1,ep_change_2,ep_change_3,ep_change_4,ep_change_5,ep_change_6,ep_change_7,ep_change_8
0,Y10T,Y10T,Y10T,Y10T,A61P,A61K,H04B,H04N,A61K
1,A61P,A61P,A61P,H01L,Y10T,A61P,H01L,H01L,A61P
2,C07D,A61K,A61K,B64U,H04M,H04W,G02F,H04W,H04N
3,A61K,C07D,C07D,F25B,A61K,H04M,A61K,G02B,H04L
4,Y10S,B32B,H01M,A61P,A24F,H01L,G02B,H04M,H01L
5,G01N,F16M,B65D,G06Q,F05D,C07D,H04M,G02F,H01M
6,C07K,F16K,G01N,Y02D,C07D,H05K,H04W,H04B,G02B
7,E04B,A47C,B32B,B60Y,Y02D,C12N,H01M,G01N,C07K
8,B41J,H04M,F16K,A61K,H01L,C07K,G01N,G09G,C07D
9,H05K,E05B,Y02E,F05D,B33Y,B32B,A61P,B32B,F05D


In [42]:
#top_index_df

Unnamed: 0,ep_change_0,ep_change_1,ep_change_2,ep_change_3,ep_change_4,ep_change_5,ep_change_6,ep_change_7,ep_change_8
0,C12M,H01B,A47K,H04W,G08B,B23D,G06T,G06N,G06N
1,A47B,B21C,A01M,G04C,F16P,B61B,B65G,A01D,A45D
2,B62B,B65G,A01C,B64U,B60P,A46B,B02C,B44D,D01B
3,F03B,B23Q,G05B,B68C,A41B,F04D,B60H,H01M,G10K
4,B23P,F26B,B60J,E01H,H04W,F03C,G06N,B21D,G04G
5,B26D,B66F,G01V,G01P,G06N,A63G,B25D,H03H,A43D
6,B60P,F16J,A47G,B60Y,F01M,C12Y,F23K,E03C,F16D
7,F16N,F15B,G10K,C09D,F17C,G01H,B60S,B04C,A01B
8,B04B,F16M,A23L,A63F,B23H,F16H,B60J,H10B,B23C
9,B64U,B65B,C09F,H03K,B33Y,B27B,B22F,H01T,A24B


In [None]:
#각 서브 플롯을 vosviewer 포맷으로 출력. gephi 포맷으로. 그리고.. km???도???