In [2]:
import re
import os
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from tqdm import tqdm

In [15]:
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

In [4]:
def get_wordnet_pos(treebank_tag):
    """Конвертирует POS-теги из формата Penn Treebank в формат WordNet"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [None]:
def segment_sentences(text):
    """
    Сегментация текста на предложения с использованием регулярных выражений.
    Обрабатывает случаи с сокращениями (Dr., Mr., etc.) и кавычками.
    """
    text = re.sub(r'\s+', ' ', text).strip()
    sentences = re.split(
        r'(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<![A-Z][a-z]{2}\.)(?<=[.!?])\s+(?=[\"\']*[A-Z])', 
        text
    )
    return sentences

In [None]:
def tokenize(text):
    """
    Токенизация текста с обработкой сложных случаев:
    - email: user@example.com
    - телефонные номера: +7-901-000-00-00, 8(918)3213412
    - эмодзи: :), :(, ;)
    - сокращения: Dr., Mr., etc.
    - математические выражения: a=b*c
    """
    pattern = r"""
        [\w.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+ |
        (?:\+?\d{1,3}[-.\s]?)?                
        \(?\d{3}\)?[-.\s]?                    
        \d{3}[-.\s]?\d{4}                     
        |(?:\+?\d{1,3}[-.\s]?)?               
        \(?\d{3}\)?[-.\s]?                    
        \d{3}[-.\s]?\d{2}[-.\s]?\d{2}         
        |
        [:=;xX][oO\-]?[D\)\]\(\]/\\OpP3]      
        |<3                                   
        |
        \b(?:[a-zA-Z]\s*[=+\-*/^<>()]+\s*[a-zA-Z0-9\s]*)+\b
        |
        \b(?:[A-Z]\.)+                        
        |\b(?:[A-Z][a-z]+\.)(?:[A-Z][a-z]+\.)*  
        |
        \d+(?:[.,]\d+)*                       
        |
        \b\w+(?:'\w+)?\b                      
        |
        [^\w\s]                               
    """
    
    tokens = re.findall(pattern, text, re.VERBOSE)
    return tokens

In [7]:
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [None]:
def process_text(text):
    """
    Полная обработка текста:
    1. Сегментация на предложения
    2. Токенизация каждого предложения
    3. Стемминг и лемматизация токенов
    """
    sentences = segment_sentences(text)
    annotated_sentences = []
    
    for sent in sentences:
        tokens = tokenize(sent)
        pos_tags = nltk.pos_tag(tokens)
        annotated_tokens = []
        
        for token, (word, pos_tag) in zip(tokens, pos_tags):
            stem = stemmer.stem(token) if token.isalpha() else token
            
            if token.isalpha():
                wordnet_pos = get_wordnet_pos(pos_tag)
                lemma = lemmatizer.lemmatize(token, pos=wordnet_pos)
            else:
                lemma = token
                
            annotated_tokens.append((token, stem, lemma))
        
        annotated_sentences.append(annotated_tokens)
    
    return annotated_sentences

In [None]:
def save_annotations(annotations, output_path):
    """
    Сохраняет аннотации в формате TSV:
    token\tstem\tlemma
    (пустая строка между предложениями)
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for i, sent in enumerate(annotations):
            for token, stem, lemma in sent:
                f.write(f"{token}\t{stem}\t{lemma}\n")
            if i < len(annotations) - 1:
                f.write("\n")

In [10]:
train_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv"
test_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv"

train_df = pd.read_csv(train_url, header=None, names=['class', 'title', 'description'])
test_df = pd.read_csv(test_url, header=None, names=['class', 'title', 'description'])

In [11]:
train_df['text'] = train_df['title'] + " " + train_df['description']
test_df['text'] = test_df['title'] + " " + test_df['description']

In [12]:
class_mapping = {
    1: 'World',
    2: 'Sports',
    3: 'Business',
    4: 'Sci_Tech'
}

In [13]:
base_dir = "C:/Users/Paul/Projects/nlp-25/projects/pn-pren/lab1"
for dataset in ['train', 'test']:
    for class_name in class_mapping.values():
        os.makedirs(f"{base_dir}/{dataset}/{class_name}", exist_ok=True)

In [17]:
for df, dataset_type in [(train_df.head(100), 'train'), (test_df.head(100), 'test')]:
    print(f"{dataset_type}:")
    
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        doc_id = f"{idx:06d}"
        class_name = class_mapping[row['class']]
        text = row['text']
        
        annotations = process_text(text)
        
        output_path = f"{base_dir}/{dataset_type}/{class_name}/{doc_id}.tsv"
        save_annotations(annotations, output_path)

train:


100%|██████████| 100/100 [00:00<00:00, 872.36it/s]


test:


100%|██████████| 100/100 [00:00<00:00, 819.15it/s]


In [18]:
text1 = "The river bank was flooded."
text2 = "I deposited money at the bank."
text3 = "He decided to bank on his friend."

for text in [text1, text2, text3]:
    annotations = process_text(text)
    token, stem, lemma = annotations[0][0]
    print(f"{text}")
    print(f"token: '{token}', lemma: '{lemma}' pos_tag: {nltk.pos_tag([token])[0][1]}")

The river bank was flooded.
token: 'The', lemma: 'The' pos_tag: DT
I deposited money at the bank.
token: 'I', lemma: 'I' pos_tag: PRP
He decided to bank on his friend.
token: 'He', lemma: 'He' pos_tag: PRP


In [None]:
text4 = "The bat flew out of the cave."
text5 = "He swung the bat and hit the ball."

for text in [text4, text5]:
    annotations = process_text(text)
    token, stem, lemma = annotations[0][1]
    print(f"{text}")
    print(f"token: '{token}', lemma: '{lemma}' pos_tag: {nltk.pos_tag([token])[0][1]}")

The bat flew out of the cave.
token: 'bat', lemma: 'bat' pos_tag: NN
He swung the bat and hit the ball.
token: 'swung', lemma: 'swing' pos_tag: NN


In [20]:
text6 = "She will lead the team to victory."
text7 = "The pipe was made of lead."

for text in [text6, text7]:
    annotations = process_text(text)
    token, stem, lemma = annotations[0][2]
    print(f"{text}")
    print(f"token: '{token}', lemma: '{lemma}' pos_tag: {nltk.pos_tag([token])[0][1]}")

She will lead the team to victory.
token: 'lead', lemma: 'lead' pos_tag: NN
The pipe was made of lead.
token: 'was', lemma: 'be' pos_tag: VBD
