In [None]:
import logging
import re
from collections import defaultdict
from datetime import timedelta

import numpy as np
import pandas as pd
import torch
# from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

In [None]:
# Configure the root logger: INFO level, lines formatted as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def preprocess_text(text):
    """Normalize raw text before embedding; stop words are kept.

    Lower-cases the text and removes every character that is not a Cyrillic
    or Latin letter, a digit, or whitespace.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents a missing
            cell) are treated as empty text; any other non-string value is
            converted with ``str()``.

    Returns:
        The normalized string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # "ё"/"Ё" sit outside the contiguous а-я / А-Я code-point ranges, so they
    # have to be listed explicitly or they are stripped from Russian words.
    text = re.sub(r'[^а-яА-ЯёЁa-zA-Z0-9\s]', '', text)
    return text


# Load the RuBERT model and its tokenizer
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Pick the compute device: CUDA when torch reports it as available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


def embed_text(texts, batch_size=32, max_length=512):
    """Embed a list of texts with the module-level RuBERT ``model``.

    Each text is represented by the hidden state of its first token. Texts
    are encoded in batches so that a large corpus does not have to fit into
    device memory in a single forward pass.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts per forward pass.
        max_length: Token limit passed to the tokenizer; longer texts are
            truncated. It is given explicitly so truncation does not depend
            on a limit being configured in the tokenizer.

    Returns:
        A NumPy array with one row per input text, in input order. For an
        empty input the array has shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    if not texts:
        # Nothing to encode; keep the 2-D shape callers index into.
        return torch.empty((0, model.config.hidden_size)).numpy()

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded_input = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            ).to(device)
            model_output = model(**encoded_input)
            # Move each batch to the CPU right away to free device memory.
            chunks.append(model_output.last_hidden_state[:, 0, :].cpu())
    return torch.cat(chunks).numpy()


def find_duplicates_with_rubert(df, window_days=2, similarity_threshold=0.8):
    """Drop near-duplicate news items that were published close together in time.

    Rows are ordered by publication time. Every row that has not yet been
    assigned to a group is compared, by cosine similarity of its embedding,
    with all other unassigned rows published within ``window_days`` days
    before or after it. Rows scoring above ``similarity_threshold`` form a
    group: the earliest row of the group is kept, the others are dropped.

    Args:
        df: DataFrame with string columns ``date``, ``time`` and ``text``.
        window_days: Half-width of the comparison window, in days.
        similarity_threshold: Cosine similarity above which two texts are
            treated as duplicates.

    Returns:
        A new DataFrame, sorted by publication time and keeping the original
        index labels, with the duplicates removed. ``df`` is not modified.

    Raises:
        ValueError: If ``embed_text`` does not return exactly one embedding
            per row.
    """
    if df.empty:
        # Nothing to compare, and the tokenizer cannot encode an empty batch.
        return df.copy()

    df = df.copy()
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
    df = df.sort_values('datetime')
    df['processed_text'] = df['text'].apply(preprocess_text)

    # One embedding per row, in the (sorted) row order of ``df``.
    embeddings = np.asarray(embed_text(df['processed_text'].tolist()))
    if len(embeddings) != len(df):
        raise ValueError(
            f"embed_text returned {len(embeddings)} embeddings for {len(df)} rows"
        )

    # Everything below works with row POSITIONS in the sorted frame. The
    # embeddings are ordered by position, so index labels (which keep their
    # pre-sort values) must not be used to look embeddings up.
    datetimes = df['datetime']
    half_window = timedelta(days=window_days)
    row_count = len(df)
    processed = np.zeros(row_count, dtype=bool)
    is_duplicate = np.zeros(row_count, dtype=bool)
    group_count = 0

    for pos in range(row_count):
        if processed[pos]:
            continue

        current_date = datetimes.iloc[pos]
        in_window = (
            (datetimes >= current_date - half_window) &
            (datetimes <= current_date + half_window)
        ).to_numpy() & ~processed
        window_pos = np.flatnonzero(in_window)
        if len(window_pos) <= 1:
            continue

        similarities = F.cosine_similarity(
            torch.from_numpy(embeddings[pos:pos + 1]),
            torch.from_numpy(embeddings[window_pos]),
        ).numpy()

        similar_pos = window_pos[similarities > similarity_threshold]
        if len(similar_pos) <= 1:
            continue

        # Positions are ascending and the frame is sorted by time, so the
        # first similar row is the earliest one: keep it, drop the rest.
        is_duplicate[similar_pos[1:]] = True
        processed[similar_pos] = True
        group_count += 1

    df_cleaned = df[~is_duplicate].drop(columns=['datetime', 'processed_text'])

    logging.info(f"Найдено {int(is_duplicate.sum())} дубликатов в {group_count} группах")
    logging.info(f"Осталось {len(df_cleaned)} уникальных новостей")

    return df_cleaned


In [None]:
def main(input_path='chunk_news.csv', output_path='test_news.csv',
         window_days=1, similarity_threshold=0.8):
    """Deduplicate a news CSV and write the cleaned copy.

    Args:
        input_path: CSV file to read.
        output_path: CSV file the de-duplicated rows are written to.
        window_days: Half-width, in days, of the duplicate-search window.
        similarity_threshold: Cosine similarity above which two texts are
            treated as duplicates.
    """
    # Read the CSV file
    df = pd.read_csv(input_path)

    # Remove duplicates found within the time window (1 day by default)
    df_cleaned = find_duplicates_with_rubert(
        df, window_days=window_days, similarity_threshold=similarity_threshold
    )

    # Save the result without the index column
    df_cleaned.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
from datetime import timedelta
import re
import logging

# Configure the root logger: INFO level, lines formatted as "<time> - <level> - <message>".
# NOTE(review): this cell repeats the logging/model setup of an earlier cell.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the RuBERT model and its tokenizer
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Pick the compute device: CUDA when torch reports it as available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


def preprocess_text(text):
    """Normalize raw text before embedding; stop words are kept.

    Lower-cases the text and removes every character that is not a Cyrillic
    or Latin letter, a digit, or whitespace.

    Args:
        text: The raw text. ``None`` and NaN (how pandas represents a missing
            cell) are treated as empty text; any other non-string value is
            converted with ``str()``.

    Returns:
        The normalized string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # "ё"/"Ё" sit outside the contiguous а-я / А-Я code-point ranges, so they
    # have to be listed explicitly or they are stripped from Russian words.
    text = re.sub(r'[^а-яА-ЯёЁa-zA-Z0-9\s]', '', text)
    return text


def embed_text(texts, batch_size=32, max_length=512):
    """Embed a list of texts with the module-level RuBERT ``model``.

    Texts that tokenize to more than ``max_length`` tokens are skipped rather
    than truncated. The result still has one row per input text so that
    callers can index it by input position: a skipped text gets an all-zero
    row, whose cosine similarity with any other vector is 0.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts per forward pass; batching keeps a large
            corpus from having to fit into device memory at once.
        max_length: Token limit used both for the skip decision and as the
            tokenizer's explicit truncation limit.

    Returns:
        A NumPy array with one row per input text, in input order, or
        ``None`` when no text is left to embed (empty input or every text
        skipped).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)

    # Skip texts longer than the token limit, remembering where the kept ones sit.
    kept_positions = [
        position for position, text in enumerate(texts)
        if len(tokenizer.tokenize(text)) <= max_length
    ]
    skipped_count = len(texts) - len(kept_positions)

    logging.info(f"Пропущено {skipped_count} текстов из-за превышения {max_length} токенов")

    if not kept_positions:
        return None

    chunks = []
    with torch.no_grad():
        for start in range(0, len(kept_positions), batch_size):
            batch = [texts[position] for position in kept_positions[start:start + batch_size]]
            # The length check above counts tokens without the special tokens
            # the encoder adds, so truncation stays on as a safety net.
            encoded_input = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            ).to(device)
            model_output = model(**encoded_input)
            # Move each batch to the CPU right away to free device memory.
            chunks.append(model_output.last_hidden_state[:, 0, :].cpu())

    kept_embeddings = torch.cat(chunks)
    # Scatter the embeddings back to their input positions; skipped rows stay zero.
    aligned = torch.zeros((len(texts), kept_embeddings.shape[1]), dtype=kept_embeddings.dtype)
    aligned[torch.tensor(kept_positions)] = kept_embeddings
    return aligned.numpy()


def find_duplicates_with_rubert(df, window_days=2, similarity_threshold=0.8):
    """Drop near-duplicate news items that were published close together in time.

    Rows are ordered by publication time. Every row that has not yet been
    assigned to a group is compared, by cosine similarity of its embedding,
    with all other unassigned rows published within ``window_days`` days
    before or after it. Rows scoring above ``similarity_threshold`` form a
    group: the earliest row of the group is kept, the others are dropped.

    Args:
        df: DataFrame with string columns ``date``, ``time`` and ``text``.
        window_days: Half-width of the comparison window, in days.
        similarity_threshold: Cosine similarity above which two texts are
            treated as duplicates.

    Returns:
        A new DataFrame, sorted by publication time and keeping the original
        index labels, with the duplicates removed. When ``embed_text`` yields
        no embeddings at all, every row is returned. ``df`` is not modified.

    Raises:
        ValueError: If ``embed_text`` returns a number of embeddings that
            differs from the number of rows, since rows and embeddings could
            then no longer be matched up.
    """
    df = df.copy()
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
    df = df.sort_values('datetime')
    df['processed_text'] = df['text'].apply(preprocess_text)

    # One embedding per row, in the (sorted) row order of ``df``.
    embeddings = embed_text(df['processed_text'].tolist())

    if embeddings is None:
        logging.warning("Все тексты были пропущены. Нет данных для обработки.")
        # Return the same columns as the normal path, without the helper columns.
        return df.drop(columns=['datetime', 'processed_text'])

    embeddings = np.asarray(embeddings)
    if len(embeddings) != len(df):
        raise ValueError(
            f"embed_text returned {len(embeddings)} embeddings for {len(df)} rows"
        )

    # Everything below works with row POSITIONS in the sorted frame. The
    # embeddings are ordered by position, so index labels (which keep their
    # pre-sort values) must not be used to look embeddings up.
    datetimes = df['datetime']
    half_window = timedelta(days=window_days)
    row_count = len(df)
    processed = np.zeros(row_count, dtype=bool)
    is_duplicate = np.zeros(row_count, dtype=bool)
    group_count = 0

    for pos in range(row_count):
        if processed[pos]:
            continue

        current_date = datetimes.iloc[pos]
        in_window = (
            (datetimes >= current_date - half_window) &
            (datetimes <= current_date + half_window)
        ).to_numpy() & ~processed
        window_pos = np.flatnonzero(in_window)
        if len(window_pos) <= 1:
            continue

        similarities = cosine_similarity(embeddings[pos:pos + 1], embeddings[window_pos])[0]

        similar_pos = window_pos[similarities > similarity_threshold]
        if len(similar_pos) <= 1:
            continue

        # Positions are ascending and the frame is sorted by time, so the
        # first similar row is the earliest one: keep it, drop the rest.
        is_duplicate[similar_pos[1:]] = True
        processed[similar_pos] = True
        group_count += 1

    df_cleaned = df[~is_duplicate].drop(columns=['datetime', 'processed_text'])

    logging.info(f"Найдено {int(is_duplicate.sum())} дубликатов в {group_count} группах")
    logging.info(f"Осталось {len(df_cleaned)} уникальных новостей")

    return df_cleaned


def main(input_path='chunk_news.csv', output_path='test_news.csv',
         window_days=1, similarity_threshold=0.8):
    """Deduplicate a news CSV and write the cleaned copy.

    Args:
        input_path: CSV file to read.
        output_path: CSV file the de-duplicated rows are written to.
        window_days: Half-width, in days, of the duplicate-search window.
        similarity_threshold: Cosine similarity above which two texts are
            treated as duplicates.
    """
    # Read the CSV file
    df = pd.read_csv(input_path)

    # Remove duplicates found within the time window (1 day by default)
    df_cleaned = find_duplicates_with_rubert(
        df, window_days=window_days, similarity_threshold=similarity_threshold
    )

    # Save the result without the index column
    df_cleaned.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()

In [None]:
def main(input_path='chunk_news.csv', output_path='test_news.csv',
         window_days=1, similarity_threshold=0.8):
    """Deduplicate a news CSV and write the cleaned copy.

    Args:
        input_path: CSV file to read.
        output_path: CSV file the de-duplicated rows are written to.
        window_days: Half-width, in days, of the duplicate-search window.
        similarity_threshold: Cosine similarity above which two texts are
            treated as duplicates.
    """
    # Read the CSV file
    df = pd.read_csv(input_path)

    # Remove duplicates found within the time window (1 day by default)
    df_cleaned = find_duplicates_with_rubert(
        df, window_days=window_days, similarity_threshold=similarity_threshold
    )

    # Save the result without the index column
    df_cleaned.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()