In [1]:
import pyarrow.parquet as pq
import json
from typing import List, Dict, Generator
from tqdm import tqdm
from datasets import Dataset
import gc

# HDFS上Parquet文件的路径
hdfs_path = "viewfs://hadoop-lt-cluster/home/mmu_llm/dw/mmu_llm.db/customjtmath_2013_20/type=normal/part-04999-626445f5-ee23-4a80-b0bd-e35648f16988.c000.snappy.parquet"

def parse_json_content(json_str: str) -> str:
    try:
        json_data = json.loads(json_str)
        return json_data.get('content', '')
    except json.JSONDecodeError:
        return ''

def read_and_parse_parquet(file_path: str, max_chunks: int = None, max_length: int = 50) -> Generator[Dict[str, str], None, None]:
    try:
        parquet_file = pq.ParquetFile(file_path)
        print(f"文件包含 {parquet_file.num_row_groups} 个行组")
        print(f"文件模式: {parquet_file.schema}")
        
        for i in tqdm(range(min(max_chunks or float('inf'), parquet_file.num_row_groups))):
            table = parquet_file.read_row_group(i)
            df = table.to_pandas()
            if 'text' in df.columns:
                for text in df['text']:
                    content = parse_json_content(text)[:max_length]
                    if content:
                        yield content
            print(f"处理完第 {i+1} 个行组")
            if max_chunks and i + 1 >= max_chunks:
                print(f"已达到指定的最大块数 {max_chunks}，停止读取")
                break
    except Exception as e:
        print(f"读取文件时出错: {e}")

# 读取并解析数据
parsed_data = list(read_and_parse_parquet(hdfs_path, max_chunks=1))[:10000]

if parsed_data:
    print(f"\n成功读取并解析数据")
    print(f"总共解析的数据条数: {len(parsed_data)}")
    print("前5条解析后的内容:")
    for item in parsed_data[:5]:
        print(item[:100] + '...')  # 只打印每条内容的前100个字符

    # 将解析后的数据转换为Hugging Face Dataset格式
    # dataset = Dataset.from_list(parsed_data)
    texts = parsed_data
    gc.collect()  # 强制进行垃圾回收

else:
    print("无法读取或解析数据")


  from .autonotebook import tqdm as notebook_tqdm
24/07/01 10:49:28 WARN util.NativeCodeLoader main: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/07/01 10:49:28 WARN shortcircuit.DomainSocketFactory main: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
24/07/01 10:49:28 INFO speed4j pool-simple-buffer-trigger-thread-[perf]: Statistics from 2024-07-01 10:48:28 to 2024-07-01 10:49:28
24/07/01 10:49:28 INFO speed4j pool-simple-buffer-trigger-thread-[perf]: Tag                                                           Avg(ms)      Min      Max  Std Dev     95th     99th   99.5th   Count
24/07/01 10:49:28 INFO speed4j pool-simple-buffer-trigger-thread-[perf]: dataarch.hdfs.suzhou03.mmu_llm.12664-dtmachine.2.6.0U60.3.4-cdh5.10.0-CLIENT-RELEASE.hadoop-lt-cluster     0.00     0.00     0.00     0.00     0.00     0.00     0.00       1
24/07/01 10:49:28 INFO speed4j pool-simple-buffer-trigger-thre

文件包含 7 个行组
文件模式: <pyarrow._parquet.ParquetSchema object at 0x7f2725c2d640>
required group field_id=-1 spark_schema {
  optional binary field_id=-1 text (String);
}



  0%|          | 0/1 [00:00<?, ?it/s]24/07/01 10:49:29 WARN hdfs.DFSClient main: hedgedFetchBlockByteRange waited 50ms to read from DatanodeInfoWithStorage[10.80.139.102:50010,DS-a87049f5-11b7-4f0e-8be0-d817ee9b1a19,DISK] LocatedBlock{BP-1561302996-10.46.134.41-1572878936413:blk_54524707756_53542890497; getBlockSize()=268435456; corrupt=false; offset=0; activeIndex=3; locs=[DatanodeInfoWithStorage[10.80.139.102:50010,DS-a87049f5-11b7-4f0e-8be0-d817ee9b1a19,DISK], DatanodeInfoWithStorage[10.80.122.216:50010,DS-122a497f-aa10-4676-8f3b-ccf3bc289174,DISK], DatanodeInfoWithStorage[10.80.139.99:50010,DS-99e35677-7266-4195-ace9-3f3c1c8ca3e7,DISK]]} 4 145656881; spawning hedged read
  0%|          | 0/1 [00:04<?, ?it/s]

处理完第 1 个行组
已达到指定的最大块数 1，停止读取

成功读取并解析数据
总共解析的数据条数: 10000
前5条解析后的内容:
! 07/24/2000 mhamer /tmp/l-1-81-nc.onoff.bathnav! ...
! 07/24/2000 mhamer /tmp/l-4-79-sc.onoff.bathgravn...
! 07/24/2000 mhamer /tmp/l-7-77-wg.onoff.multichan...
! Zum Reiherhorst 32, Stelle Jenny
Ahoi,
wollte no...
! thread theory ♥
« previous entry | next entry »
...





In [4]:
import pandas as pd
# 获取当前脚本的目录
current_dir = os.path.dirname(os.path.abspath(__file__))
# 将 src 目录的父目录添加到 Python 路径
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

from src.text_clustering import ClusterClassifier
from cycler import cycler
import matplotlib.pyplot as plt

# Ensure you have a pandas DataFrame named pandas_df
# Example:
# pandas_df = pd.DataFrame({'content': ["text1", "text2", "text3"]})

# Create an instance of ClusterClassifier
cc = ClusterClassifier(embed_device="cpu")  # Use "cuda" if you have a GPU

# Run the pipeline on the 'content' column

embs, labels, summaries = cc.fit(texts)

# Customize color scheme (optional)
default_cycler = (cycler(color=[
    "#0F0A0A", "#FF6600", "#FFBE00", "#496767", "#87A19E",
    "#FF9200", "#0F3538", "#F8E08E", "#0F2021", "#FAFAF0"
]))
plt.rc('axes', prop_cycle=default_cycler)

# Visualize the results
cc.show()

# Save the classifier (optional)
cc.save("./content_clusters")

# Print cluster summaries
for i, summary in enumerate(summaries):
    print(f"Cluster {i}: {summary}")

# If you want to classify new texts later:
# new_texts = ["Some new text", "Another new text"]
# cluster_labels, embeddings = cc.infer(new_texts, top_k=1)


ImportError: cannot import name 'ClusterClassifier' from 'src' (unknown location)