In [2]:
import json
from transformers import AutoTokenizer

# Load the tokenizer (shared, module-level; the functions below read it as a global)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Convert a JSON annotation export to BIO format
def _bio_labels(offsets, entities, shift=0):
    """Return one BIO tag per token.

    Args:
        offsets: ``(start, end)`` character span of every token, relative to
            the string that was tokenized.
        entities: annotation result items; items without a ``value`` holding
            ``start``, ``end`` and a non-empty ``labels`` list are ignored.
        shift: position of the tokenized string inside the annotated text;
            subtracted from the entity offsets before comparing.

    Returns:
        list[str]: ``"O"`` or ``"B-<label>"`` / ``"I-<label>"`` per token.
    """
    labels = ["O"] * len(offsets)
    for entity in entities:
        value = entity.get("value") or {}
        names = value.get("labels")
        if not names or "start" not in value or "end" not in value:
            continue
        start = value["start"] - shift
        end = value["end"] - shift
        # The first token touching the span opens the entity, even when the
        # annotation starts on whitespace or in the middle of a token.
        prefix = "B"
        for index, (token_start, token_end) in enumerate(offsets):
            if token_start < end and token_end > start:
                labels[index] = f"{prefix}-{names[0]}"
                prefix = "I"
    return labels


def convert_to_bio(json_path, output_path):
    """Convert a JSON annotation export into a token-per-line BIO file.

    Every record is expected to look like
    ``{"data": {"text": ...}, "annotations": [{"result": [...]}]}``
    (presumably a Label Studio export -- confirm). Only the first annotation
    of a record is used.

    Token positions come from the tokenizer's offset mapping instead of
    searching for the token string in the text: WordPiece continuation pieces
    (``##...``) and unknown tokens do not occur literally in the text, so a
    string search loses the alignment for the rest of the record.

    Args:
        json_path: path of the JSON export to read (UTF-8).
        output_path: path of the file to write; one ``token<TAB>label`` line
            per token and an empty line after every record.

    Note:
        Offset mapping requires a fast tokenizer; the module-level
        ``tokenizer`` is used.
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    bio_data = []
    for record in data:
        text = record["data"]["text"]
        entities = record["annotations"][0]["result"] if record["annotations"] else []

        # No special tokens: the output should contain only tokens of the text.
        encoding = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
        tokenized_text = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
        token_labels = _bio_labels(encoding["offset_mapping"], entities)

        for token, label in zip(tokenized_text, token_labels):
            bio_data.append(f"{token}\t{label}")
        bio_data.append("")  # Empty line separates records

    # Write the BIO file
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write("\n".join(bio_data))

# Run the conversion
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
json_path = "/Users/rii_beltz/Desktop/candidate_classifier/src/data/project-5-at-2025-01-10-14-23-dbeec074.json"  # Your JSON file
output_path = "bio_data.txt"  # Output file name
convert_to_bio(json_path, output_path)


Token indices sequence length is longer than the specified maximum sequence length for this model (602 > 512). Running this sequence through the model will result in indexing errors


In [3]:
import re
from transformers import AutoTokenizer

# Shared tokenizer; split_text_into_chunks uses it to count tokens per sentence.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
# Default token budget per chunk (default value of `max_length` below).
MAX_SEQ_LEN = 512

def split_text_into_chunks(text, max_length=MAX_SEQ_LEN):
    """Split ``text`` into chunks that fit a token budget.

    The text is cut into sentences at a whitespace character that follows
    ``.`` or ``?`` (the lookbehinds skip some abbreviation-like patterns), and
    sentences are packed greedily into chunks. A sentence that on its own
    exceeds ``max_length`` is divided further at individual whitespace
    characters, so it no longer yields an oversized chunk; only a single
    whitespace-free run longer than ``max_length`` tokens can still exceed the
    budget.

    Args:
        text: the text to split.
        max_length: maximum number of tokens per chunk, counted with the
            module-level ``tokenizer.tokenize`` (presumably without special
            tokens -- pass a smaller value if the consumer adds them).

    Returns:
        list[str]: the chunks in reading order. Pieces inside a chunk are
        re-joined with a single space, so every chunk has the same length as
        the stretch of ``text`` it came from.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)  # Split into sentences

    # Measure each sentence once; break overlong ones into whitespace-delimited pieces.
    pieces = []
    for sentence in sentences:
        n_tokens = len(tokenizer.tokenize(sentence))
        if n_tokens > max_length:
            # Splitting on single whitespace characters keeps chunk lengths
            # identical to the source text once re-joined with spaces.
            pieces.extend((word, len(tokenizer.tokenize(word))) for word in re.split(r'\s', sentence))
        else:
            pieces.append((sentence, n_tokens))

    chunks = []
    current_chunk = []
    current_length = 0
    for piece, n_tokens in pieces:
        # Only flush a non-empty chunk; otherwise an empty chunk would be emitted.
        if current_chunk and current_length + n_tokens > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(piece)
        current_length += n_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


In [4]:
def _flatten_whitespace(value):
    """Replace every whitespace character with a plain space (length unchanged)."""
    return "".join(" " if ch.isspace() else ch for ch in value)


def _bio_labels(offsets, entities, shift=0):
    """Return one BIO tag per token.

    Args:
        offsets: ``(start, end)`` character span of every token, relative to
            the string that was tokenized.
        entities: annotation result items; items without a ``value`` holding
            ``start``, ``end`` and a non-empty ``labels`` list are ignored.
        shift: position of the tokenized string inside the annotated text;
            subtracted from the entity offsets before comparing.

    Returns:
        list[str]: ``"O"`` or ``"B-<label>"`` / ``"I-<label>"`` per token.
    """
    labels = ["O"] * len(offsets)
    for entity in entities:
        value = entity.get("value") or {}
        names = value.get("labels")
        if not names or "start" not in value or "end" not in value:
            continue
        start = value["start"] - shift
        end = value["end"] - shift
        # The first token touching the span opens the entity, even when the
        # annotation starts on whitespace or in the middle of a token.
        prefix = "B"
        for index, (token_start, token_end) in enumerate(offsets):
            if token_start < end and token_end > start:
                labels[index] = f"{prefix}-{names[0]}"
                prefix = "I"
    return labels


def convert_to_bio_with_chunks(json_path, output_path):
    """Convert a JSON annotation export into a BIO file, one block per text chunk.

    Every record is expected to look like
    ``{"data": {"text": ...}, "annotations": [{"result": [...]}]}``
    (presumably a Label Studio export -- confirm). Only the first annotation
    of a record is used. The text is divided with ``split_text_into_chunks``
    and each chunk is tokenized and written separately.

    Entity offsets refer to the full text, so each chunk is first located in
    the text and the entity offsets are shifted by the chunk's start before
    they are compared with token positions. Token positions come from the
    tokenizer's offset mapping, because WordPiece continuation pieces
    (``##...``) cannot be found by searching the text.

    Args:
        json_path: path of the JSON export to read (UTF-8).
        output_path: path of the file to write; one ``token<TAB>label`` line
            per token and an empty line after every chunk.

    Note:
        Offset mapping requires a fast tokenizer; the module-level
        ``tokenizer`` is used.
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    bio_data = []
    for record in data:
        text = record["data"]["text"]
        entities = record["annotations"][0]["result"] if record["annotations"] else []

        # Chunks are re-joined with spaces, so search in a copy of the text in
        # which every whitespace character is a space (positions are unchanged).
        flat_text = _flatten_whitespace(text)
        cursor = 0

        # Split the text into chunks
        for chunk in split_text_into_chunks(text):
            chunk_start = flat_text.find(_flatten_whitespace(chunk), cursor)
            if chunk_start < 0:
                # Not expected; fall back to the position right after the previous chunk.
                chunk_start = cursor
            cursor = chunk_start + len(chunk)

            encoding = tokenizer(chunk, add_special_tokens=False, return_offsets_mapping=True)
            tokenized_text = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
            if not tokenized_text:
                continue  # nothing to write for an empty chunk
            token_labels = _bio_labels(encoding["offset_mapping"], entities, shift=chunk_start)

            for token, label in zip(tokenized_text, token_labels):
                bio_data.append(f"{token}\t{label}")
            bio_data.append("")  # Empty line separates chunks

    # Write the result to the file
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write("\n".join(bio_data))


In [None]:
import re

def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace."""
    non_word = re.compile(r'[^\w\s]')
    cleaned = non_word.sub('', text)
    return cleaned

def to_lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_extra_spaces(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    words = text.split()
    collapsed = ' '.join(words)
    return collapsed

def preprocess_text(text):
    """Normalize ``text``: lower-case it, strip punctuation, then collapse whitespace."""
    return remove_extra_spaces(remove_punctuation(to_lowercase(text)))


In [13]:
import json
import re
from transformers import AutoTokenizer

# Load the tokenizer (shared, module-level; the functions below read it as a global)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
# Default token budget per chunk (default value of `max_length` below).
MAX_SEQ_LEN = 512

# Split a text into chunks
def split_text_into_chunks(text, max_length=MAX_SEQ_LEN):
    """Split ``text`` into chunks that fit a token budget.

    The text is cut into sentences at a whitespace character that follows
    ``.`` or ``?`` (the lookbehinds skip some abbreviation-like patterns), and
    sentences are packed greedily into chunks. A sentence that on its own
    exceeds ``max_length`` is divided further at individual whitespace
    characters, so it no longer yields an oversized chunk; only a single
    whitespace-free run longer than ``max_length`` tokens can still exceed the
    budget.

    Args:
        text: the text to split.
        max_length: maximum number of tokens per chunk, counted with the
            module-level ``tokenizer.tokenize`` (presumably without special
            tokens -- pass a smaller value if the consumer adds them).

    Returns:
        list[str]: the chunks in reading order. Pieces inside a chunk are
        re-joined with a single space, so every chunk has the same length as
        the stretch of ``text`` it came from.
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)  # Split into sentences

    # Measure each sentence once; break overlong ones into whitespace-delimited pieces.
    pieces = []
    for sentence in sentences:
        n_tokens = len(tokenizer.tokenize(sentence))
        if n_tokens > max_length:
            # Splitting on single whitespace characters keeps chunk lengths
            # identical to the source text once re-joined with spaces.
            pieces.extend((word, len(tokenizer.tokenize(word))) for word in re.split(r'\s', sentence))
        else:
            pieces.append((sentence, n_tokens))

    chunks = []
    current_chunk = []
    current_length = 0
    for piece, n_tokens in pieces:
        # Only flush a non-empty chunk; otherwise an empty chunk would be emitted.
        if current_chunk and current_length + n_tokens > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(piece)
        current_length += n_tokens

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Convert a JSON annotation export to BIO format, handling long texts
def _flatten_whitespace(value):
    """Replace every whitespace character with a plain space (length unchanged)."""
    return "".join(" " if ch.isspace() else ch for ch in value)


def _bio_labels(offsets, entities, shift=0):
    """Return one BIO tag per token.

    Args:
        offsets: ``(start, end)`` character span of every token, relative to
            the string that was tokenized.
        entities: annotation result items; items without a ``value`` holding
            ``start``, ``end`` and a non-empty ``labels`` list are ignored.
        shift: position of the tokenized string inside the annotated text;
            subtracted from the entity offsets before comparing.

    Returns:
        list[str]: ``"O"`` or ``"B-<label>"`` / ``"I-<label>"`` per token.
    """
    labels = ["O"] * len(offsets)
    for entity in entities:
        value = entity.get("value") or {}
        names = value.get("labels")
        if not names or "start" not in value or "end" not in value:
            continue
        start = value["start"] - shift
        end = value["end"] - shift
        # The first token touching the span opens the entity, even when the
        # annotation starts on whitespace or in the middle of a token.
        prefix = "B"
        for index, (token_start, token_end) in enumerate(offsets):
            if token_start < end and token_end > start:
                labels[index] = f"{prefix}-{names[0]}"
                prefix = "I"
    return labels


def convert_to_bio_with_chunks(json_path, output_path):
    """Convert a JSON annotation export into a BIO file, one block per text chunk.

    Every record is expected to look like
    ``{"data": {"text": ...}, "annotations": [{"result": [...]}]}``
    (presumably a Label Studio export -- confirm). Only the first annotation
    of a record is used. The text is divided with ``split_text_into_chunks``
    and each chunk is tokenized and written separately.

    Entity offsets refer to the full text, so each chunk is first located in
    the text and the entity offsets are shifted by the chunk's start before
    they are compared with token positions. Token positions come from the
    tokenizer's offset mapping, because WordPiece continuation pieces
    (``##...``) cannot be found by searching the text.

    Args:
        json_path: path of the JSON export to read (UTF-8).
        output_path: path of the file to write; one ``token<TAB>label`` line
            per token and an empty line after every chunk.

    Note:
        Offset mapping requires a fast tokenizer; the module-level
        ``tokenizer`` is used. A confirmation message is printed at the end.
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    bio_data = []
    for record in data:
        text = record["data"]["text"]
        entities = record["annotations"][0]["result"] if record["annotations"] else []

        # Chunks are re-joined with spaces, so search in a copy of the text in
        # which every whitespace character is a space (positions are unchanged).
        flat_text = _flatten_whitespace(text)
        cursor = 0

        # Split the text into chunks
        for chunk in split_text_into_chunks(text):
            chunk_start = flat_text.find(_flatten_whitespace(chunk), cursor)
            if chunk_start < 0:
                # Not expected; fall back to the position right after the previous chunk.
                chunk_start = cursor
            cursor = chunk_start + len(chunk)

            encoding = tokenizer(chunk, add_special_tokens=False, return_offsets_mapping=True)
            tokenized_text = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
            if not tokenized_text:
                continue  # nothing to write for an empty chunk
            token_labels = _bio_labels(encoding["offset_mapping"], entities, shift=chunk_start)

            # Append the tokens and their labels to the result
            for token, label in zip(tokenized_text, token_labels):
                bio_data.append(f"{token}\t{label}")
            bio_data.append("")  # Empty line separates chunks

    # Write the result to the file in TSV format
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write("\n".join(bio_data))

    print(f"BIO данные успешно сохранены в файл: {output_path}")

# Call the function
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
json_path = "/Users/rii_beltz/Downloads/project-5-at-2025-01-10-14-23-dbeec074.json"  # Path to your JSON file
output_path = "bio_data.tsv"  # Output file name

convert_to_bio_with_chunks(json_path, output_path)


Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors


BIO данные успешно сохранены в файл: bio_data.tsv


In [11]:
import json
import pandas as pd

def extract_from_json(json_path, output_csv_path):
    """
    Extract each record's text and labelled entities from a JSON file and write them to CSV.

    Every record is expected to look like
    ``{"data": {"text": ...}, "annotations": [{"result": [...]}]}``; only the
    first annotation of a record is used. Each output row holds the record's
    ``text`` plus one column per entity label found in that record, with
    several values of the same label joined by ``"; "``.

    Result items that carry no ``value``, no labels or no entity text (for
    example relation entries) are skipped instead of raising ``KeyError``.

    Args:
        json_path: path of the JSON file to read (UTF-8).
        output_csv_path: path of the CSV file to write (UTF-8, no index column).
    """
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    records = []

    for entry in data:
        text = entry["data"]["text"]  # Source text of the record
        entities = entry["annotations"][0]["result"] if entry["annotations"] else []

        record = {"text": text}

        for entity in entities:
            value = entity.get("value") or {}
            labels = value.get("labels")
            if not labels or "text" not in value:
                continue

            # Group entity texts under their label (one column per label)
            label = labels[0]
            if label not in record:
                record[label] = []
            record[label].append(value["text"])

        # Turn the per-label lists into "; "-separated strings
        for key in record:
            if isinstance(record[key], list):
                record[key] = "; ".join(record[key])

        records.append(record)

    # Build the DataFrame
    df = pd.DataFrame(records)

    # Save as CSV
    df.to_csv(output_csv_path, index=False, encoding="utf-8")
    print(f"Данные успешно сохранены в файл: {output_csv_path}")

# Path to your JSON file
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
json_path = "/Users/rii_beltz/Desktop/project-5-at-2025-01-10-14-23-dbeec074.json"  # Replace with your path
output_csv_path = "extracted_data.csv"  # Output CSV file name

# Call the function
extract_from_json(json_path, output_csv_path)


Данные успешно сохранены в файл: extracted_data.csv
