In [None]:
import os
from typing import Literal
from histcite import ReadFile
from histcite import ProcessFile
from histcite import ComputeMetrics
from histcite import GraphViz

#### 输入文件夹路径及数据源类型

In [None]:
# Folder handed to ReadFile below; exported results are written to its "result" subfolder.
folder_path = "/Users/.../Downloads/dataset"
# Data source of the input files; must be one of the Literal values.
source_type: Literal["wos", "cssci", "scopus"] = "wos"

#### 读取并处理文件

In [None]:
# Load the input files found under folder_path for the chosen source_type,
# then display the result (presumably a pandas DataFrame — confirm against histcite).
docs_df = ReadFile(folder_path, source_type).read_all()  # read the files
docs_df

In [None]:
# Build the citation data used by every later cell: first extract the references
# from docs_df, then derive the citation relationships from them.
process = ProcessFile(docs_df, source_type)
refs_df = process.extract_reference()  # extract references
citation_relationship = process.process_citation(refs_df)  # identify citation relationships

#### 导出描述性统计数据

In [None]:
# Write the descriptive statistics to <folder_path>/result/descriptive_statistics.xlsx.
# NOTE(review): whether write2excel creates the "result" folder itself is not
# visible here — confirm, or create the folder before running this cell.
cm = ComputeMetrics(docs_df, citation_relationship, source_type)
cm.write2excel(os.path.join(folder_path, "result", "descriptive_statistics.xlsx"))

#### 导出引文网络图文件
图文件(后缀为.dot)可以使用 [Graphviz在线编辑器](http://magjac.com/graphviz-visual-editor/) 或下载到本地的 [Graphviz工具](https://graphviz.org/) 生成引文网络图。 

In [None]:
# Graph builder reused by all of the following graph cells.
graph = GraphViz(docs_df, citation_relationship, source_type)

# Select the 100 documents with the highest LCS, ignoring documents whose LCS is 0.
# The row index of citation_relationship is what gets passed on as the doc indices.
doc_indices = (
    citation_relationship.loc[citation_relationship["LCS"] > 0]
    .sort_values(by="LCS", ascending=False)
    .head(100)
    .index.tolist()
)
graph_dot_file = graph.generate_dot_file(doc_indices)
# print() rather than a bare expression so the DOT text keeps its real line breaks.
print(graph_dot_file)

In [None]:
# Select every document whose LCS is greater than or equal to 5.
doc_indices = citation_relationship.loc[
    citation_relationship["LCS"] >= 5
].index.tolist()
graph_dot_file = graph.generate_dot_file(doc_indices)
print(graph_dot_file)

In [None]:
# View the reference network graph of the document whose doc_index is 10
# (edge_type="cited" presumably keeps only the documents it cites — confirm against histcite).
graph_dot_file = graph.generate_dot_file(10, edge_type="cited")
print(graph_dot_file)

In [None]:
# View the citing-documents network graph of the document whose doc_index is 10,
# i.e. the documents that cite it. This needs edge_type="citing"; "cited" would
# just repeat the reference network produced by the previous cell.
graph_dot_file = graph.generate_dot_file(10, edge_type="citing")
print(graph_dot_file)

In [None]:
# View the citation network graph of the document whose doc_index is 10.
# No edge_type is passed, so the library default applies.
graph_dot_file = graph.generate_dot_file(10)
print(graph_dot_file)

In [None]:
# Export the DOT text to <folder_path>/result/graph.dot.
# Note: graph_dot_file holds whatever the most recently executed graph cell produced.
# Create the output folder first so this cell does not depend on an earlier export cell.
os.makedirs(os.path.join(folder_path, "result"), exist_ok=True)
# Write as UTF-8 explicitly: the platform default encoding may not be able to
# represent non-ASCII labels (e.g. Chinese titles).
with open(os.path.join(folder_path, "result", "graph.dot"), "w", encoding="utf-8") as f:
    f.write(graph_dot_file)

In [None]:
# View the graph node information. This reads from the `graph` object, so it
# presumably describes the nodes of the most recently generated graph — confirm.
graph_node_info = graph.generate_graph_node_info()
graph_node_info

In [None]:
# Export the graph node information to <folder_path>/result/graph_node_info.xlsx.
# Create the output folder first: to_excel fails when the target directory is missing.
os.makedirs(os.path.join(folder_path, "result"), exist_ok=True)
graph_node_info.to_excel(
    os.path.join(folder_path, "result", "graph_node_info.xlsx"), index=False
)