In [1]:
import os
import collections
import random
import tqdm
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import defaultdict, Counter
from sklearn.decomposition import PCA

In [2]:
def read_adj_file(adj_file_path):
    with open(adj_file_path, "r") as file:
        adj_list = [list(map(int, line.split())) for line in file.readlines()]
    return adj_list


def read_degree_file(degree_file_path):
    with open(degree_file_path, "r") as file:
        lines = file.readlines()
        fixed_value = int(lines[0].strip())
        num_query_type = int(lines[1].strip())
        m = int(lines[2].strip())
        degrees = [int(line.strip()) for line in lines[3:]]
    return fixed_value, num_query_type, m, degrees


def count_degree_distribution(degrees):
    degree_counter = collections.Counter(degrees)
    return degree_counter


base_path = ""
path_homo = ""
adj_file_path = os.path.join(path_homo, "adj.txt")
degree_file_path = os.path.join(path_homo, "degree.txt")

adj_list = read_adj_file(adj_file_path)
fixed_value, num_query_type, m, degrees = read_degree_file(degree_file_path)
non_zero_degree_count = sum(1 for degree in degrees if degree > 0)

print("Adjacency List:")
print(adj_list[:10])
print(degrees[:10])
print(len(degrees))

print("\nDegree Information:")
print(f"Fixed Value: {fixed_value}")
print(f"Number of Query Types: {num_query_type}")
print(f"m: {m}")
# print(f"Degrees: {degrees}")
degree_counter = count_degree_distribution(degrees)
print(degree_counter)
non_zero_degree_count

Adjacency List:
[[93, 194, 260, 298, 429, 522, 861, 1444, 1823, 1825, 2070, 2088, 2349, 5136, 5315, 5316, 5494, 5802, 11166, 11167, 11168], [1035, 1123, 2362, 2540, 4087, 4088, 4581, 4582, 6069, 7049, 7284, 7477, 7696, 7804, 7856, 8206, 8248, 8286, 9014, 9038, 9654, 9842, 10712, 11343, 12482], [2642], [4329], [41, 1979, 3752, 3753, 6709, 6710], [437, 659, 2017, 2845, 3877, 3878, 3879, 4728], [82, 391, 1979, 6112], [145, 680, 948, 2861, 3695, 4847, 5084, 5085, 6095, 6120, 11619], [38, 1145, 1542, 1689, 7111, 7852, 7922, 12919, 18272, 18719, 19157, 19158, 19159], [64, 707, 2605, 2689, 4823, 6851, 7220, 8536, 8537, 8643, 12614, 13101, 13482, 13561, 14395, 14457, 18253, 18681, 20560]]
[21, 25, 1, 1, 6, 8, 4, 11, 13, 19]
21164

Degree Information:
Fixed Value: 4
Number of Query Types: 21164
m: 109724
Counter({2: 3441, 3: 3273, 4: 2641, 1: 2570, 5: 2106, 6: 1330, 0: 995, 7: 972, 8: 681, 9: 571, 10: 451, 11: 346, 12: 252, 13: 208, 15: 166, 14: 161, 16: 126, 17: 86, 18: 85, 19: 78, 20: 72, 21:

20169

In [3]:
def load_graph(data_dir):
    print("Start load graph")

    graph_info_path = os.path.join(data_dir, "graph_info.txt")
    with open(graph_info_path, "r") as f:
        lines = f.readlines()

    m, n, n_types = map(int, lines[0].split())
    hin_schema_adjacencyMatrix = [[0] * n_types for _ in range(n_types)]
    hin_schema_edge_cnt = [[0] * n_types for _ in range(n_types)]
    vertex_start_map = list(map(int, lines[1].split()))

    line_index = 2
    for i in range(n_types):
        hin_schema_adjacencyMatrix[i] = list(map(int, lines[line_index].split()))
        line_index += 1

    for i in range(n_types):
        hin_schema_edge_cnt[i] = list(map(int, lines[line_index].split()))
        line_index += 1

    print(m, n, n_types)

    vertex_offset = [0] * n
    edges = []

    graph_type_path = os.path.join(data_dir, "graph_type.txt")
    with open(graph_type_path, "r") as f:
        for line in f:
            v_id, v_type, edge_type = map(int, line.split())
            edges.append({"v_id": v_id, "v_type": v_type, "edge_type": edge_type})

    vertex_offset_path = os.path.join(data_dir, "graph_offset.txt")
    with open(vertex_offset_path, "r") as f:
        for i, line in enumerate(f):
            vertex_offset[i] = int(line.strip())

    print("Finish read graph")
    return {
        "m": m,
        "n": n,
        "n_types": n_types,
        "hin_schema_adjacencyMatrix": hin_schema_adjacencyMatrix,
        "hin_schema_edge_cnt": hin_schema_edge_cnt,
        "vertex_start_map": vertex_start_map,
        "vertex_offset": vertex_offset,
        "edges": edges,
    }


# Example usage:
data_dir = ""
graph_data = load_graph(data_dir)


Start load graph
439924 54291 4
Finish read graph


In [4]:
graph_data.keys()

edges = graph_data["edges"]
vertex_offset = graph_data["vertex_offset"]
n = graph_data["n"]
m = graph_data["m"]
vertex_start_map = graph_data["vertex_start_map"]
query_type_offset = vertex_start_map[1]

vertex_start_map[1], vertex_start_map[2]

(19316, 40480)

In [5]:
def get_nei_list(vertex_i):
    nei_list = []
    nei_edge_start = vertex_offset[vertex_i]
    nei_end = m if (vertex_i + 1) == n else vertex_offset[vertex_i + 1]

    for j in range(nei_edge_start, nei_end):
        nei_type = edges[j]["v_type"]
        nei_id = edges[j]["v_id"]
        nei_list.append({"nei_type": nei_type, "nei_id": nei_id})

    return nei_list


def get_author_venue_topic(author_id):
    nei_list = get_nei_list(author_id)
    venue_list = []
    topic_list = []

    for nei in nei_list:
        nei_type = nei["nei_type"]
        nei_id = nei["nei_id"]
        if nei_type == 0:
            # paper
            paper_nei = get_nei_list(nei_id)
            for p_nei in paper_nei:
                pn_type = p_nei["nei_type"]
                pn_id = p_nei["nei_id"]
                if pn_type == 2:
                    # venue
                    venue_list.append(pn_id)
                if pn_type == 3:
                    # topic
                    topic_list.append(pn_id)
    venue_list = sorted(venue_list)
    topic_list = sorted(list(set(topic_list)))
    return venue_list, topic_list


vertex_start_map[1], vertex_start_map[2], vertex_start_map[3]

(19316, 40480, 40496)

In [6]:
author_l_v = [{} for _ in range(vertex_start_map[2] - vertex_start_map[1])]

for author_id in range(vertex_start_map[1], vertex_start_map[2]):
    ven_list, t_list = get_author_venue_topic(author_id)
    au_i = author_id - vertex_start_map[1]
    author_l_v[au_i]["venue_list"] = ven_list
    author_l_v[au_i]["topic_list"] = t_list

author_l_v[0]

{'venue_list': [40480, 40480, 40480, 40480, 40480, 40481, 40482, 40482, 40482],
 'topic_list': [40496,
  40499,
  40504,
  40507,
  40508,
  40510,
  40511,
  40515,
  40520,
  40536,
  40559,
  40565,
  40603,
  40654,
  40661,
  40712,
  40747,
  40750,
  40784,
  40853,
  40964,
  41003,
  41036,
  41048,
  41140,
  41266,
  41388,
  41534,
  41576,
  41612,
  42100,
  42651,
  42735,
  43104,
  43697,
  43883,
  43884,
  43929,
  44417,
  44525]}

In [7]:
venue_df_path = ""
venue_df = pd.read_csv(venue_df_path, delimiter="\t", header=None)
venue_df.columns = ["label", "venue_index", "venue_raw"]
venue_df["new_id"] = venue_df["venue_index"].apply(lambda x: x + vertex_start_map[2])
venue_df

Unnamed: 0,label,venue_index,venue_raw,new_id
0,CV,0,CVPR,40480
1,AI,1,NeurIPS,40481
2,CV,2,ICCV,40482
3,AI,3,ICML,40483
4,AI,5,AAAI,40485
5,NLP,6,ACL 2020,40486
6,NLP,7,EMNLP/IJCNLP (1),40487
7,CHI,9,Proceedings of the 2019 CHI Conference on Huma...,40489
8,DB,12,ACM Knowledge Discovery and Data Mining,40492
9,DB,20,ACM SIGMOD Record,40500


In [8]:
# Create a mapping from venue id to label
venue_id_to_label = dict(zip(venue_df["new_id"], venue_df["label"]))

for author_id in range(vertex_start_map[1], vertex_start_map[2]):
    au_i = author_id - vertex_start_map[1]
    ven_list = author_l_v[au_i]["venue_list"]
    venue_labels = [
        venue_id_to_label[ven_id] for ven_id in ven_list if ven_id in venue_id_to_label
    ]
    author_l_v[au_i]["venue_label"] = venue_labels

    # Find the most frequent label and its proportion
    if venue_labels:
        max_label = max(set(venue_labels), key=venue_labels.count)
        max_label_count = venue_labels.count(max_label)
        max_label_proportion = round(max_label_count / len(venue_labels), 2)
    else:
        max_label = None
        max_label_proportion = 0

    # Add the new columns to the dictionary
    author_l_v[au_i]["max_label"] = max_label
    author_l_v[au_i]["max_label_proportion"] = max_label_proportion

author_l_v[0]

{'venue_list': [40480, 40480, 40480, 40480, 40480, 40481, 40482, 40482, 40482],
 'topic_list': [40496,
  40499,
  40504,
  40507,
  40508,
  40510,
  40511,
  40515,
  40520,
  40536,
  40559,
  40565,
  40603,
  40654,
  40661,
  40712,
  40747,
  40750,
  40784,
  40853,
  40964,
  41003,
  41036,
  41048,
  41140,
  41266,
  41388,
  41534,
  41576,
  41612,
  42100,
  42651,
  42735,
  43104,
  43697,
  43883,
  43884,
  43929,
  44417,
  44525],
 'venue_label': ['CV', 'CV', 'CV', 'CV', 'CV', 'AI', 'CV', 'CV', 'CV'],
 'max_label': 'CV',
 'max_label_proportion': 0.89}

In [9]:
topic_path = ""
topic_df = pd.read_csv(topic_path, sep="\t", index_col=None)
print(topic_df.head(3))
au_l_df = pd.DataFrame(author_l_v)
au_l_df["new_id"] = au_l_df.index + vertex_start_map[1]
print(au_l_df[au_l_df["max_label"].isnull()].shape)
au_l_df.head(3)

   fos_index  id                       fos   ResNum  finished  new_id
0          0   0  Iterative reconstruction  1190000         1   40496
1          1   1           Computer vision  4040000         1   40497
2          2   2                 Statistic  4030000         1   40498
(3800, 6)


Unnamed: 0,venue_list,topic_list,venue_label,max_label,max_label_proportion,new_id
0,"[40480, 40480, 40480, 40480, 40480, 40481, 404...","[40496, 40499, 40504, 40507, 40508, 40510, 405...","[CV, CV, CV, CV, CV, AI, CV, CV, CV]",CV,0.89,19316
1,"[40480, 40481, 40481, 40481, 40481, 40481, 404...","[40499, 40504, 40505, 40508, 40509, 40510, 405...","[CV, AI, AI, AI, AI, AI, AI, AI, AI, AI, AI, NLP]",AI,0.83,19317
2,[40480],"[40504, 40515, 40571, 40650, 40728, 41047, 411...",[CV],CV,1.0,19318


In [11]:
au_path = ""
author_df = pd.read_csv(au_path, sep="\t")
print(author_df.shape)
# combine
merged_au_df = pd.merge(author_df, au_l_df, on="new_id", how="inner")

# trans degrees into DataFrame 
degrees_df = pd.DataFrame(degrees, columns=["degree"])
merged_au_df = pd.concat([merged_au_df, degrees_df], axis=1)

merged_au_df.head(3)

(21164, 5)


Unnamed: 0,author_index,name,hindex,org,new_id,venue_list,topic_list,venue_label,max_label,max_label_proportion,degree
0,0,Carsten Rother,71,"Microsoft Research Cambridge, UK",19316,"[40480, 40480, 40480, 40480, 40480, 40481, 404...","[40496, 40499, 40504, 40507, 40508, 40510, 405...","[CV, CV, CV, CV, CV, AI, CV, CV, CV]",CV,0.89,21
1,1,Sanjiv Kumar,33,Carnegie Mellon University,19317,"[40480, 40481, 40481, 40481, 40481, 40481, 404...","[40499, 40504, 40505, 40508, 40509, 40510, 405...","[CV, AI, AI, AI, AI, AI, AI, AI, AI, AI, AI, NLP]",AI,0.83,25
2,2,Vladimir Kolmogorov,43,Microsoft Research,19318,[40480],"[40504, 40515, 40571, 40650, 40728, 41047, 411...",[CV],CV,1.0,1


In [17]:
print(merged_au_df.shape)
print(merged_au_df[merged_au_df["degree"] <= 2].shape)
print(merged_au_df[merged_au_df["hindex"] <= 2].shape)
print(merged_au_df[(merged_au_df["hindex"] <= 2) & (merged_au_df["degree"] <= 2)].shape)
print(merged_au_df[(merged_au_df["hindex"] <= 2) | (merged_au_df["degree"] <= 2)].shape)
print(
    merged_au_df[
        (
            (merged_au_df["hindex"] <= 2) ^ (merged_au_df["hindex"] <= 2)
            & (merged_au_df["degree"] <= 2)
        )
    ].shape
)
print("degree<=1")
print(merged_au_df[merged_au_df["degree"] <= 1].shape)
print(merged_au_df[(merged_au_df["hindex"] <= 2) & (merged_au_df["degree"] <= 1)].shape)
print(merged_au_df[(merged_au_df["hindex"] <= 2) | (merged_au_df["degree"] <= 1)].shape)

(21164, 11)
(7006, 11)
(5850, 11)
(2007, 11)
(10849, 11)
(3843, 11)
degree<=1
(3565, 11)
(855, 11)
(8560, 11)


In [35]:

m_au_df_save = merged_au_df.copy()
m_au_df_save["label"] = np.nan

# condition 1: degree <= 2
condition1 = m_au_df_save["degree"] <= 2
m_au_df_save.loc[condition1, "label"] = "o"
print(m_au_df_save[m_au_df_save["label"] == "o"].shape)

# condition 2: hindex <= 2 ^ degree <= 1
condition2 = (m_au_df_save["hindex"] <= 2) ^ (m_au_df_save["hindex"] <= 2) & (m_au_df_save["degree"] <= 2)
indices = m_au_df_save[condition2].index
sampled_indices = np.random.choice(
    indices, size=int(len(indices) * 0.20), replace=False
)
m_au_df_save.loc[sampled_indices, "label"] = "o"

print(m_au_df_save.shape)
print(m_au_df_save[m_au_df_save["label"] == "o"].shape)

author_rem_df = m_au_df_save[m_au_df_save['label'].isnull()]
print(author_rem_df.shape)
m_au_df_save.head(3)

(7006, 12)
(21164, 12)
(7774, 12)
(13390, 12)


Unnamed: 0,author_index,name,hindex,org,new_id,venue_list,topic_list,venue_label,max_label,max_label_proportion,degree,label
0,0,Carsten Rother,71,"Microsoft Research Cambridge, UK",19316,"[40480, 40480, 40480, 40480, 40480, 40481, 404...","[40496, 40499, 40504, 40507, 40508, 40510, 405...","[CV, CV, CV, CV, CV, AI, CV, CV, CV]",CV,0.89,21,
1,1,Sanjiv Kumar,33,Carnegie Mellon University,19317,"[40480, 40481, 40481, 40481, 40481, 40481, 404...","[40499, 40504, 40505, 40508, 40509, 40510, 405...","[CV, AI, AI, AI, AI, AI, AI, AI, AI, AI, AI, NLP]",AI,0.83,25,
2,2,Vladimir Kolmogorov,43,Microsoft Research,19318,[40480],"[40504, 40515, 40571, 40650, 40728, 41047, 411...",[CV],CV,1.0,1,o


In [64]:
def merge_topic_lists(group):
    merged_topics = []
    for topics in group["topic_list"]:
        merged_topics.extend(topics)
    return sorted(list(set(merged_topics)))  


v_labels = author_rem_df["max_label"].unique()
print(author_rem_df[author_rem_df["max_label"].isnull()].shape)
print(v_labels)
v_clu_ts = defaultdict(list)

for v in v_labels:
    if v is None:
        tmp = author_rem_df[author_rem_df["max_label"].isnull()]
    else:
        tmp = author_rem_df[author_rem_df["max_label"] == v]
    print(f"{v} : {tmp.shape}   ", end="\t")
    v_tmp = merge_topic_lists(tmp)
    print(f"{v} : {len(v_tmp)}")
    if v is None:
        v_clu_ts["None"] = v_tmp
    else:    
        v_clu_ts[v] = v_tmp
a,b,c,d,e,f=10,10,10,10,10,10
v_clu_num = {"CV": a, "AI": b, "None": c, "NLP": d, "CHI": e, "DB": f}
print(v_clu_num)
v_clu_ts["CV"][:5]
v_clu_ts["None"][:5]

(2123, 14)
['CV' 'AI' None 'CHI' 'NLP' 'DB']
CV : (4618, 14)   	CV : 4632
AI : (4392, 14)   	AI : 5838
None : (2123, 14)   	None : 2929
CHI : (300, 14)   	CHI : 1570
NLP : (1785, 14)   	NLP : 5876
DB : (172, 14)   	DB : 1024
{'CV': 30, 'AI': 29, 'None': 14, 'NLP': 11, 'CHI': 4, 'DB': 4}


[40497, 40498, 40499, 40503, 40504]

In [65]:
import torch
from transformers import BertModel, BertTokenizer
from sklearn.cluster import KMeans
from collections import defaultdict
import numpy as np

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = ""
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)


# 定义一个函数来计算 BERT 嵌入
def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, padding=True, max_length=512
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # 获取 [CLS] token 的嵌入
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
        embeddings.append(cls_embedding)
    return np.array(embeddings)


final_clusters = {}

# clustering topics
for v_label, num_clusters in v_clu_num.items():
    print(f"Clustering topics for {v_label} into {num_clusters} clusters.")

    
    topics = v_clu_ts[v_label]
    fos_names = topic_df[topic_df["new_id"].isin(topics)]["fos"].tolist()
    print(len(fos_names))

    X = get_bert_embeddings(fos_names)

    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    kmeans.fit(X)
    labels = kmeans.labels_

    cluster_topics = defaultdict(list)
    for topic, label in zip(topics, labels):
        cluster_topics[label].append(topic)

    final_clusters[v_label] = cluster_topics



Clustering topics for CV into 30 clusters.
4632
Clustering topics for AI into 29 clusters.
5838
Clustering topics for None into 14 clusters.
2929
Clustering topics for NLP into 11 clusters.
5876
Clustering topics for CHI into 4 clusters.
1570
Clustering topics for DB into 4 clusters.
1024


In [66]:
cluster_id_counter = 0
final_clusters_renamed = {}

for v_label, clusters in final_clusters.items():
    final_clusters_renamed[v_label] = {}
    print(f"Category: {v_label}")
    cnt_print = 0
    for cluster_id, topics in clusters.items():
        final_clusters_renamed[v_label][cluster_id_counter] = topics
        cluster_id_counter += 1
        fos_names = topic_df[topic_df["new_id"].isin(topics)]["fos"].tolist()[:2]
        cnt_print += 1
        if cnt_print < 5:   
            print(f" Cluster {cluster_id_counter}: {fos_names}")

final_clusters = final_clusters_renamed


Category: CV
 Cluster 1: ['Iterative reconstruction', 'Feature extraction']
 Cluster 2: ['Computer vision', 'Image retrieval']
 Cluster 3: ['Statistic', 'Similarity measure']
 Cluster 4: ['Pattern recognition', 'Object detection']
Category: AI
 Cluster 31: ['Iterative reconstruction', 'Support vector machine']
 Cluster 32: ['Computer vision', 'Pattern recognition']
 Cluster 33: ['Statistic', 'Estimator']
 Cluster 34: ['Modulo', 'Detector']
Category: None
 Cluster 60: ['Computer vision', 'Computer science']
 Cluster 61: ['Statistic', 'Background subtraction']
 Cluster 62: ['Pattern recognition', 'Object detection']
 Cluster 63: ['Invariant (mathematics)', 'Robustness (computer science)']
Category: NLP
 Cluster 74: ['Iterative reconstruction', 'Support vector machine']
 Cluster 75: ['Computer vision', 'Computer science']
 Cluster 76: ['Pattern recognition', 'Artificial intelligence']
 Cluster 77: ['Modulo', 'Detector']
Category: CHI
 Cluster 85: ['Iterative reconstruction', 'Invariant (m

In [67]:
final_clusters_by_label = {label: {} for label in final_clusters}
for label, clusters in final_clusters.items():
    for cluster_id, topics in clusters.items():
        for topic in topics:
            final_clusters_by_label[label][topic] = cluster_id


def find_max_v_clu_and_prop(row):
    max_label = row["max_label"]
    if pd.isna(max_label):
        topic_to_cluster = final_clusters_by_label["None"]    
    else:
        topic_to_cluster = final_clusters_by_label[max_label]

    topic_list = row["topic_list"]
    cluster_counter = Counter()

    for topic in topic_list:
        if topic in topic_to_cluster:
            cluster_id = topic_to_cluster[topic]
            cluster_counter[cluster_id] += 1

    if not cluster_counter:
        return pd.Series([None, 0], index=["max_v_clu", "max_v_clu_prop"])

    max_v_clu, max_count = cluster_counter.most_common(1)[0]
    max_v_clu_prop = round(max_count / len(topic_list), 2)

    return pd.Series([max_v_clu, max_v_clu_prop], index=["max_v_clu", "max_v_clu_prop"])


author_rem_df[["max_v_clu", "max_v_clu_prop"]] = author_rem_df.apply(find_max_v_clu_and_prop, axis=1)
print(author_rem_df[author_rem_df['max_v_clu'].isnull()].shape)

author_rem_df[author_rem_df['max_v_clu'].isnull()].head(3)

(100, 14)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  author_rem_df[["max_v_clu", "max_v_clu_prop"]] = author_rem_df.apply(find_max_v_clu_and_prop, axis=1)


Unnamed: 0,author_index,name,hindex,org,new_id,venue_list,topic_list,venue_label,max_label,max_label_proportion,degree,label,max_v_clu,max_v_clu_prop
678,678,Mário A. T. Figueiredo,55,University of Wisconsin-Madison|Electrical and...,19994,[40481],[],[AI],AI,1.0,4,,,0.0
1740,1740,Venice Erin Liong,17,,21056,[40480],[],[CV],CV,1.0,6,,,0.0
1886,1886,Sergio Guadarrama,25,,21202,[40481],[],[AI],AI,1.0,5,,,0.0


In [69]:
max_v_clu_max_value = author_rem_df['max_v_clu'].max(skipna=True)  
next_label_value = max_v_clu_max_value + 1  

author_rem_df.loc[author_rem_df['max_v_clu'].isnull(), 'label'] = next_label_value

author_rem_df.loc[author_rem_df['max_v_clu'].notnull(), 'label'] = author_rem_df['max_v_clu']

print(author_rem_df[author_rem_df['max_v_clu'].isnull()].shape)
print(author_rem_df[author_rem_df['label'].isnull()].shape)
author_rem_df.head(3)

(100, 14)
(0, 14)


Unnamed: 0,author_index,name,hindex,org,new_id,venue_list,topic_list,venue_label,max_label,max_label_proportion,degree,label,max_v_clu,max_v_clu_prop
0,0,Carsten Rother,71,"Microsoft Research Cambridge, UK",19316,"[40480, 40480, 40480, 40480, 40480, 40481, 404...","[40496, 40499, 40504, 40507, 40508, 40510, 405...","[CV, CV, CV, CV, CV, AI, CV, CV, CV]",CV,0.89,21,4.0,4.0,0.12
1,1,Sanjiv Kumar,33,Carnegie Mellon University,19317,"[40480, 40481, 40481, 40481, 40481, 40481, 404...","[40499, 40504, 40505, 40508, 40509, 40510, 405...","[CV, AI, AI, AI, AI, AI, AI, AI, AI, AI, AI, NLP]",AI,0.83,25,43.0,43.0,0.11
4,4,Ming Yang,38,,19320,"[40480, 40480]","[40497, 40499, 40504, 40507, 40515, 40524, 405...","[CV, CV]",CV,1.0,6,3.0,3.0,0.21


In [71]:
for index, row in author_rem_df.iterrows():
    new_id = row['new_id']
    label = row['label']
    m_au_df_save.loc[m_au_df_save['new_id'] == new_id, 'label'] = label

print(m_au_df_save[m_au_df_save['label'].isnull()].shape)
m_au_df_save.head(3)

(0, 12)


Unnamed: 0,author_index,name,hindex,org,new_id,venue_list,topic_list,venue_label,max_label,max_label_proportion,degree,label
0,0,Carsten Rother,71,"Microsoft Research Cambridge, UK",19316,"[40480, 40480, 40480, 40480, 40480, 40481, 404...","[40496, 40499, 40504, 40507, 40508, 40510, 405...","[CV, CV, CV, CV, CV, AI, CV, CV, CV]",CV,0.89,21,4.0
1,1,Sanjiv Kumar,33,Carnegie Mellon University,19317,"[40480, 40481, 40481, 40481, 40481, 40481, 404...","[40499, 40504, 40505, 40508, 40509, 40510, 405...","[CV, AI, AI, AI, AI, AI, AI, AI, AI, AI, AI, NLP]",AI,0.83,25,43.0
2,2,Vladimir Kolmogorov,43,Microsoft Research,19318,[40480],"[40504, 40515, 40571, 40650, 40728, 41047, 411...",[CV],CV,1.0,1,o


In [72]:
m_au_df_save_s_path = ""
m_au_df_save_s = m_au_df_save[["new_id", "label"]]
m_au_df_save_s.to_csv(m_au_df_save_s_path, sep=' ', index=False, header=False)
m_au_df_save_s.head(3)

Unnamed: 0,new_id,label
0,19316,4.0
1,19317,43.0
2,19318,o
