In [4]:
# Install spaCy into the kernel's own environment via the explicit %pip magic
# (works even when automagic is off). Pinned to the release this notebook was
# run against so a fresh "Restart & Run All" resolves the same version.
%pip install spacy==3.7.5

Collecting spacy
  Downloading spacy-3.7.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downl

In [13]:
import sys

# Run the download with the kernel's own interpreter: a bare `python` is looked
# up on the shell PATH and may install the model into a different environment
# than the one this notebook imports spaCy from.
!{sys.executable} -m spacy download zh_core_web_sm

Collecting zh-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl (48.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 MB[0m [31m592.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
Collecting spacy-pkuseg<0.1.0,>=0.0.27 (from zh-core-web-sm==3.7.0)
  Downloading spacy_pkuseg-0.0.33-cp312-cp312-macosx_11_0_arm64.whl.metadata (13 kB)
Downloading spacy_pkuseg-0.0.33-cp312-cp312-macosx_11_0_arm64.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m521.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: spacy-pkuseg, zh-core-web-sm
Successfully installed spacy-pkuseg-0.0.33 zh-core-web-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('zh_core_web_sm')


In [37]:
import mobi
import json
import spacy
 
# Load the zh_core_web_sm Chinese pipeline once at module level; both
# annotate_sentence() and process_text_file() call this shared `nlp` object.
nlp = spacy.load("zh_core_web_sm")

def annotate_sentence(sentence):
    """Run the shared spaCy pipeline on one sentence and bundle the results.

    Args:
        sentence: Text of a single sentence to analyse.

    Returns:
        A dict with these keys, in this order:
          - "sentence": the input text, unchanged.
          - "segmentation": list of token texts.
          - "pos_tagging": list of {"word", "pos"} dicts, one per token.
          - "ner": list of {"entity", "type"} dicts, one per entity span.
          - "dependency_parsing": {"type": "SD", "structure": [...]} where
            each entry is a {"relation", "head", "dependent"} dict.
          - "constituency_parsing": {"type": "Chinese Tree Bank",
            "structure": str}. NOTE: the string is only the tokens'
            `text_with_ws` joined by spaces, not a real constituency tree.
          - "semantic_dependency_analysis": a list built from the same token
            fields (dep_, head.text, text) as the dependency structure.
    """
    doc = nlp(sentence)

    def dependency_edges():
        # One edge per token: dependency label, head token text, token text.
        edges = []
        for tok in doc:
            edges.append(
                {"relation": tok.dep_, "head": tok.head.text, "dependent": tok.text}
            )
        return edges

    words = []
    tagged_words = []
    for tok in doc:
        words.append(tok.text)
        tagged_words.append({"word": tok.text, "pos": tok.pos_})

    entities = []
    for span in doc.ents:
        entities.append({"entity": span.text, "type": span.label_})

    # Placeholder for a treebank parse: just the whitespace-preserving token
    # texts joined with single spaces.
    pseudo_tree = " ".join(tok.text_with_ws for tok in doc)

    result = {"sentence": sentence}
    result["segmentation"] = words
    result["pos_tagging"] = tagged_words
    result["ner"] = entities
    result["dependency_parsing"] = {"type": "SD", "structure": dependency_edges()}
    result["constituency_parsing"] = {
        "type": "Chinese Tree Bank",
        "structure": pseudo_tree,
    }
    # Built separately so the two lists are independent objects.
    result["semantic_dependency_analysis"] = dependency_edges()
    return result

def process_text_file(file_path, encoding="utf-8"):
    """Annotate the first third of a text file, sentence by sentence.

    The file is read in full, truncated to its first ``len(content) // 3``
    characters, split into sentences with the shared spaCy pipeline, and each
    non-blank sentence is passed to ``annotate_sentence``.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding and
            matches the UTF-8 JSON this notebook writes.

    Returns:
        ``{"annotations": [...]}`` where the list holds one
        ``annotate_sentence`` result per non-blank sentence, in document order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    # Only the read needs the file handle; close it before the slow NLP pass.
    with open(file_path, 'r', encoding=encoding) as f:
        content = f.read()

    # Keep only the leading third of the text (cut is by character count, so
    # the last sentence of the excerpt may be truncated mid-sentence).
    third_length = len(content) // 3
    excerpt = content[:third_length]

    doc = nlp(excerpt)

    annotations = []
    for sent in doc.sents:
        sentence = sent.text.strip()
        if sentence:  # skip spans that are whitespace only
            annotations.append(annotate_sentence(sentence))

    return {"annotations": annotations}

# Input corpus and output location; edit these to annotate a different text.
file_path = "汪曾祺.txt"
output_file = "./汪曾祺_标注.json"

# Run the annotation pipeline over the input file.
annotations = process_text_file(file_path)

# Persist the result as UTF-8 JSON, leaving Chinese characters unescaped.
with open(output_file, mode='w', encoding='utf-8') as f:
    json.dump(annotations, fp=f, indent=4, ensure_ascii=False)

print(f"Annotations saved to {output_file}")


Annotations saved to ./汪曾祺_标注.json


In [None]:
import json
import glob

# Glob pattern matching the JSON files to merge.
path = "原创汉语/*.json"


def extract_records(data):
    """Return the list of records held by one parsed JSON document.

    Args:
        data: The value produced by ``json.load`` for a single file.

    Returns:
        - ``data`` itself when it is already a list of records;
        - ``data["annotations"]`` when ``data`` is a dict wrapping a list under
          that key (the layout written by this notebook's annotation step);
        - ``[data]`` for any other dict, treated as a single record.

    Raises:
        TypeError: If ``data`` is neither a list nor a dict.
    """
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        wrapped = data.get("annotations")
        if isinstance(wrapped, list):
            return wrapped
        return [data]
    raise TypeError(
        f"Expected a JSON array or object at top level, got {type(data).__name__}"
    )


def combine_json_files(pattern):
    """Load every JSON file matching ``pattern`` and concatenate their records.

    Files are visited in sorted path order so the combined output is
    deterministic (``glob.glob`` itself returns matches in arbitrary order).

    Args:
        pattern: Glob pattern selecting the JSON files to read.

    Returns:
        A single list with the records of all matched files, in file order.
    """
    records = []
    for json_file in sorted(glob.glob(pattern)):
        # Read as UTF-8 explicitly: the files contain Chinese text and the
        # platform default encoding is not UTF-8 everywhere.
        with open(json_file, 'r', encoding='utf-8') as file:
            records.extend(extract_records(json.load(file)))
    return records


combined_data = combine_json_files(path)

# Write the combined data into a new JSON file. ensure_ascii=False keeps the
# Chinese text readable instead of \uXXXX escapes, matching the annotation
# output written earlier in this notebook.
with open("combined.json", 'w', encoding='utf-8') as combined_file:
    json.dump(combined_data, combined_file, ensure_ascii=False, indent=4)

print("All JSON files have been combined successfully into combined.json")
