In [1]:
import re
import os
import pandas as pd
import urllib.request
from pathlib import Path
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [2]:

# NLTK download package name -> resource path probed with nltk.data.find().
NLTK_RESOURCES = {
    "punkt_tab": "tokenizers/punkt_tab",
    "punkt": "tokenizers/punkt",
    "wordnet": "corpora/wordnet",
    "stopwords": "corpora/stopwords",
}


def _nltk_resource_available(resource_path: str) -> bool:
    """Return True if NLTK can locate the resource, unpacked or as a .zip.

    The recorded run of this notebook re-downloaded wordnet although the
    downloader reported it as already up to date, i.e. probing the bare
    directory path alone did not detect the installed package.
    NOTE(review): presumably the package is stored as an archive only --
    confirm against the local nltk_data layout.
    """
    for candidate in (resource_path, f"{resource_path}.zip"):
        try:
            nltk.data.find(candidate)
            return True
        except LookupError:
            continue
    return False


# Download only the packages that are really missing.
for package_name, resource_path in NLTK_RESOURCES.items():
    if not _nltk_resource_available(resource_path):
        print(f"Загрузка {package_name}...")
        nltk.download(package_name, quiet=False)

print("Библиотеки успешно импортированы и ресурсы NLTK загружены")


Загрузка wordnet...
Библиотеки успешно импортированы и ресурсы NLTK загружены


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# AG News dataset (CSV copies hosted on GitHub)
train_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv"
test_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv"

# Local directory that holds the downloaded files
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

# Download each missing file. The payload is first written to "<name>.part"
# and only renamed to its final name once the transfer finished, so an
# interrupted download can never be mistaken for a complete file by the
# exists() check on the next run.
for file_name, url in (("train.csv", train_url), ("test.csv", test_url)):
    target_path = data_dir / file_name
    if target_path.exists():
        continue

    print(f"Скачивание {file_name}...")
    partial_path = target_path.with_name(target_path.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial_path)
        partial_path.replace(target_path)
    finally:
        # Only present here if the download or the rename failed.
        if partial_path.exists():
            partial_path.unlink()
    print(f"{file_name} загружен")

In [4]:
# Locations of the two dataset splits inside data_dir.
train_csv_path, test_csv_path = (
    data_dir / split_file for split_file in ("train.csv", "test.csv")
)

In [5]:
# The files are read with header=None, so the first line is treated as data
# and the three columns are named explicitly.
column_names = ["label", "title", "text"]
csv_options = {"header": None, "names": column_names}

train_df = pd.read_csv(train_csv_path, **csv_options)
test_df = pd.read_csv(test_csv_path, **csv_options)

# Quick size check of both splits, then a preview of the training data.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
train_df.head()


Train shape: (120000, 3)
Test shape: (7600, 3)


Unnamed: 0,label,title,text
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [6]:
# Bring both splits to the same layout: fresh positional index, zero-padded
# string ids, string labels and one combined "title. text" column.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

for split_frame in (train_df, test_df):
    split_frame["doc_id"] = split_frame.index.map(lambda i: f"{i:06d}")
    split_frame["label"] = split_frame["label"].astype(str)

    # Missing titles/bodies become empty strings before concatenation.
    title_part = split_frame["title"].fillna("")
    body_part = split_frame["text"].fillna("")
    split_frame["text_full"] = (title_part + ". " + body_part).str.strip()

# Keep only the three standard columns and expose the combined text as "text".
std_columns = ["doc_id", "label", "text_full"]
train_std = train_df[std_columns].rename(columns={"text_full": "text"})
test_std = test_df[std_columns].rename(columns={"text_full": "text"})

print(train_std.head())
print("Train std shape:", train_std.shape)
print("Test std shape:",  test_std.shape)


   doc_id label                                               text
0  000000     3  Wall St. Bears Claw Back Into the Black (Reute...
1  000001     3  Carlyle Looks Toward Commercial Aerospace (Reu...
2  000002     3  Oil and Economy Cloud Stocks' Outlook (Reuters...
3  000003     3  Iraq Halts Oil Exports from Main Southern Pipe...
4  000004     3  Oil prices soar to all-time record, posing new...
Train std shape: (120000, 3)
Test std shape: (7600, 3)


In [7]:
import re
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer


# Module-level normalisers; the token helpers defined further down
# (stem_token / lemmatize_token) read these two names.
stemmer = SnowballStemmer("english")  # Snowball stemmer configured for English
lemmatizer = WordNetLemmatizer()  # presumably relies on the "wordnet" package fetched earlier -- TODO confirm


In [8]:
# Abbreviations (written without their final period) whose trailing "." must
# not be treated as the end of a sentence.
ABBREVIATIONS = [
    "Mr", "Ms", "Mrs", "Dr", "Prof", "Inc", "Ltd", "Jr", "Sr",
    "U.S", "U.K", "St", "Univ"
]
def protect_abbreviations(text: str):
    """Replace the periods of known abbreviations with a "<DOT>" placeholder.

    For every entry of ABBREVIATIONS that occurs at a word boundary and is
    followed by a period, all of its periods (inner ones and the trailing one)
    are rewritten to "<DOT>", e.g. "Dr." -> "Dr<DOT>" and
    "U.S." -> "U<DOT>S<DOT>".

    Args:
        text: Text to protect.

    Returns:
        The text with abbreviation periods replaced by "<DOT>".
    """
    for abbr in ABBREVIATIONS:
        replacement = abbr.replace(".", "<DOT>") + "<DOT>"
        text = re.sub(
            # re.escape keeps the dots inside "U.S" / "U.K" literal. Unescaped,
            # "." is a wildcard and e.g. "UPS." would be rewritten to "U.S.".
            rf"\b{re.escape(abbr)}\.",
            # A callable replacement is inserted verbatim (no template escapes).
            lambda _match, _replacement=replacement: _replacement,
            text
        )
    return text

# Sentence boundary: a run of whitespace directly preceded by ".", "!" or "?".
sentence_end_re = re.compile(r'(?<=[.!?])\s+')

def restore_abbreviations(text: str):
    """Turn every "<DOT>" placeholder in the text back into a period."""
    pieces = text.split("<DOT>")
    return ".".join(pieces)

def split_to_sentences(text: str):
    """Split text into a list of sentences.

    Non-string input is converted with str(). Whitespace runs are collapsed
    to single spaces, abbreviation periods are shielded before splitting on
    sentence_end_re and put back afterwards. Empty pieces are dropped.

    Returns:
        A list of stripped, non-empty sentence strings ([] for blank input).
    """
    raw = text if isinstance(text, str) else str(text)

    collapsed = re.sub(r'\s+', ' ', raw.strip())
    if collapsed == "":
        return []

    sentences = []
    for chunk in sentence_end_re.split(protect_abbreviations(collapsed)):
        sentence = restore_abbreviations(chunk).strip()
        if sentence:
            sentences.append(sentence)
    return sentences

# Smoke test: abbreviations must not end a sentence, "!" and "." must.
test_text = "Dr. Smith spoke to Ms. Brown. Mrs. Johnson agreed! This works."
demo_sentences = split_to_sentences(test_text)

for sentence in demo_sentences:
    print("→", sentence)

→ Dr. Smith spoke to Ms. Brown.
→ Mrs. Johnson agreed!
→ This works.


In [9]:
import re

# Token grammar, tried left to right at every position (first match wins):
# phone numbers, e-mail addresses, emoticons, numbers, words (Latin/Cyrillic,
# optionally joined by "-" or "'"), and finally any single non-space symbol.
#
# The emoticon branch carries a (?!\w) lookahead: without it the branch, which
# is tried before numbers and words, swallowed the start of ordinary tokens
# ("8pm" -> "8p" + "m", "8/10" -> "8/" + "10", "Note:Done" -> ":D" + "one").
token_pattern = re.compile(
    r"""
    (?:\+?\d[\d\-\(\)\s]{7,}\d)                 # телефоны, напр. +7-901-000-00-00
    | (?:[\w\.-]+@[\w\.-]+\.\w+)                # email, напр. abc@xyz.com
    | (?:[:;=8][\-^']?[)DdpP(/\\](?!\w))        # смайлики :) ;-) :D
    | (?:\d+(?:[.,]\d+)*)                       # числа
    | (?:[A-Za-zА-Яа-яЁё]+(?:[-'][A-Za-zА-Яа-яЁё]+)*)  # слова с дефисами/апострофами
    | (?:[^\s])                                 # одиночный не-пробельный символ
    """,
    re.VERBOSE
)


def tokenize(text: str):
    """Split text into tokens according to token_pattern.

    Args:
        text: Input text; non-string values are converted with str().

    Returns:
        A list of token strings in order of appearance. Whitespace is never
        part of the result except inside a phone-number match.
    """
    if not isinstance(text, str):
        text = str(text)
    # The pattern has no capturing groups, so findall yields the full matches.
    return token_pattern.findall(text)

# Demo: tokenize a sentence and point out the e-mail and phone tokens.
tokens = tokenize("Email me at test@example.com or call +1-202-555-01-23")
print("Все токены:", tokens)
print("\nСпециальные токены:")

phone_like = re.compile(r'\+?\d[\d\-\(\)\s]{7,}\d')
for token in tokens:
    is_email = '@' in token
    # A token is reported as a phone only when it was not already an e-mail.
    is_phone = (not is_email) and phone_like.match(token) is not None
    if is_email:
        print(f"  {token} (EMAIL)")
    if is_phone:
        print(f"  {token} (PHONE)")


Все токены: ['Email', 'me', 'at', 'test@example.com', 'or', 'call', '+1-202-555-01-23']

Специальные токены:
  test@example.com (EMAIL)
  +1-202-555-01-23 (PHONE)


In [10]:
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# NOTE(review): the same two objects are already created, with identical
# arguments, in an earlier cell of this notebook; these lines rebind the names.
stemmer = SnowballStemmer("english")  # read by stem_token()
lemmatizer = WordNetLemmatizer()  # read by lemmatize_token()

def stem_token(token: str) -> str:
    """Return the lower-cased stem of a token.

    Only tokens made up entirely of ASCII letters are passed to the module
    level stemmer; anything else (numbers, punctuation, hyphenated words, ...)
    is just lower-cased.
    """
    lowered = token.lower()
    is_plain_word = re.fullmatch(r"[A-Za-z]+", token) is not None
    return stemmer.stem(lowered) if is_plain_word else lowered

def lemmatize_token(token: str, pos: str = "n") -> str:
    """Return the lower-cased lemma of a token.

    Only tokens made up entirely of ASCII letters are passed to the module
    level lemmatizer (with the given part-of-speech tag, "n" by default);
    anything else is just lower-cased.
    """
    lowered = token.lower()
    is_plain_word = re.fullmatch(r"[A-Za-z]+", token) is not None
    return lemmatizer.lemmatize(lowered, pos=pos) if is_plain_word else lowered

# мини-тест
for t in tokenize("Dogs were running, cats slept."):
    print(t, "->", stem_token(t), "/", lemmatize_token(t))


Dogs -> dog / dog
were -> were / were
running -> run / running
, -> , / ,
cats -> cat / cat
slept -> slept / slept
. -> . / .


In [11]:
def annotate_sentence(sentence: str):
    """Tokenize a sentence and pair every token with its stem and lemma.

    Returns:
        A list of (token, stem, lemma) tuples in token order.
    """
    return [
        (tok, stem_token(tok), lemmatize_token(tok))
        for tok in tokenize(sentence)
    ]

print(annotate_sentence("Dogs were running fast!"))


[('Dogs', 'dog', 'dog'), ('were', 'were', 'were'), ('running', 'run', 'running'), ('fast', 'fast', 'fast'), ('!', '!', '!')]


In [12]:
def annotate_document(text: str):
    """Annotate a document sentence by sentence.

    Returns:
        One list of (token, stem, lemma) tuples per sentence, in document
        order.
    """
    return list(map(annotate_sentence, split_to_sentences(text)))

def doc_to_tsv_str(text: str) -> str:
    """Render an annotated document as tab-separated text.

    Every token becomes one line of the form token<TAB>stem<TAB>lemma.
    Sentences are separated by a single empty line and the result has no
    trailing empty line.
    """
    rows = []
    for annotated_sentence in annotate_document(text):
        rows.extend(
            f"{token}\t{stem}\t{lemma}"
            for token, stem, lemma in annotated_sentence
        )
        # Blank separator after each sentence.
        rows.append("")

    # Whenever rows is non-empty its last element is the separator just
    # appended above, so dropping the final element removes exactly that
    # separator. On an empty list this is a no-op.
    del rows[-1:]
    return "\n".join(rows)



In [13]:
# NOTE(review): this cell repeats the load + standardise steps that already
# run earlier in the notebook; it is kept so the annotation stage starts from
# the raw CSV files, but the per-split logic now lives in a single helper
# instead of being written out twice.
def load_standard_frames(csv_path):
    """Read one raw split and build its standardised view.

    The CSV is read without a header row into label/title/text columns. The
    frame then gets a zero-padded six-digit string "doc_id" derived from the
    row position, a string "label", and "text_full" = "<title>. <text>" with
    missing parts treated as empty strings and surrounding whitespace removed.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        (full_frame, std_frame) where std_frame has exactly the columns
        doc_id, label, text (text being the combined text_full).
    """
    frame = pd.read_csv(csv_path, header=None, names=["label", "title", "text"])
    frame = frame.reset_index(drop=True)

    frame["doc_id"] = frame.index.map(lambda i: f"{i:06d}")
    frame["label"] = frame["label"].astype(str)
    frame["text_full"] = (
        frame["title"].fillna("") + ". " + frame["text"].fillna("")
    ).str.strip()

    std_frame = frame[["doc_id", "label", "text_full"]].rename(
        columns={"text_full": "text"}
    )
    return frame, std_frame


train_df, train_std = load_standard_frames(train_csv_path)
test_df, test_std = load_standard_frames(test_csv_path)

# doc_id is used as the output file name later on, so it must be unique.
assert train_std["doc_id"].is_unique and test_std["doc_id"].is_unique

train_std.head()


Unnamed: 0,doc_id,label,text
0,0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,4,3,"Oil prices soar to all-time record, posing new..."


In [14]:
from pathlib import Path

# Root directory of the annotated corpus (created on first run).
annotated_dir = Path("assets/annotated-corpus")
annotated_dir.mkdir(parents=True, exist_ok=True)

def create_annotated_subset_from_df(df, subset_name: str):
    """Write one annotated TSV file per document of a data split.

    Files are written to <annotated_dir>/<subset_name>/<label>/<doc_id>.tsv,
    with the content produced by doc_to_tsv_str(). Existing files are
    overwritten.

    Args:
        df: DataFrame with the columns "doc_id", "label" and "text".
        subset_name: Name of the split; used as sub-directory name and in the
            progress messages.

    Returns:
        None. Progress is printed every 5000 documents.
    """
    subset_root = annotated_dir / subset_name
    subset_root.mkdir(parents=True, exist_ok=True)

    # Each label directory is created once instead of once per document.
    label_dirs = {}

    # Iterating the three columns avoids building a Series per row, and
    # enumerate() makes the progress counter independent of the frame's index
    # labels (the counter used to be the index label itself).
    columns = zip(df["doc_id"], df["label"], df["text"])
    for position, (doc_id, label, text) in enumerate(columns):
        doc_id, label, text = str(doc_id), str(label), str(text)

        label_dir = label_dirs.get(label)
        if label_dir is None:
            label_dir = subset_root / label
            label_dir.mkdir(parents=True, exist_ok=True)
            label_dirs[label] = label_dir

        tsv_path = label_dir / f"{doc_id}.tsv"
        with open(tsv_path, "w", encoding="utf-8") as f:
            f.write(doc_to_tsv_str(text))

        if position % 5000 == 0:
            print(f"{subset_name}: обработано {position} документов...")

    print(f"Готово: {subset_name} → {subset_root}")


In [16]:
# Annotate both splits, training data first.
for subset_frame, subset_name in ((train_std, "train"), (test_std, "test")):
    create_annotated_subset_from_df(subset_frame, subset_name)


train: обработано 0 документов...
train: обработано 5000 документов...
train: обработано 10000 документов...
train: обработано 15000 документов...
train: обработано 20000 документов...
train: обработано 25000 документов...
train: обработано 30000 документов...
train: обработано 35000 документов...
train: обработано 40000 документов...
train: обработано 45000 документов...
train: обработано 50000 документов...
train: обработано 55000 документов...
train: обработано 60000 документов...
train: обработано 65000 документов...
train: обработано 70000 документов...
train: обработано 75000 документов...
train: обработано 80000 документов...
train: обработано 85000 документов...
train: обработано 90000 документов...
train: обработано 95000 документов...
train: обработано 100000 документов...
train: обработано 105000 документов...
train: обработано 110000 документов...
train: обработано 115000 документов...
Готово: train → assets\annotated-corpus\train
test: обработано 0 документов...
test: обра

In [17]:
import random
from pathlib import Path

# Spot-check one annotated training file. A dedicated seeded generator and the
# lexicographically first file make the preview identical on every run.
SAMPLE_SEED = 42
PREVIEW_LINES = 15

sample_label = random.Random(SAMPLE_SEED).choice(["1", "2", "3", "4"])
sample_dir = annotated_dir / "train" / sample_label
sample_file = min(sample_dir.glob("*.tsv"), default=None)
if sample_file is None:
    # Clear error instead of a bare StopIteration from next().
    raise FileNotFoundError(f"No .tsv files found in {sample_dir}")

print("Файл:", sample_file)
print("--- Содержимое фрагмента ---")
with open(sample_file, "r", encoding="utf-8") as f:
    # zip() stops at end of file, so short files do not print padding lines.
    for _, line in zip(range(PREVIEW_LINES), f):
        print(line.rstrip())


Файл: assets\annotated-corpus\train\3\000000.tsv
--- Содержимое фрагмента ---
Wall	wall	wall
St	st	st
.	.	.
Bears	bear	bear
Claw	claw	claw
Back	back	back
Into	into	into
the	the	the
Black	black	black
(	(	(
Reuters	reuter	reuters
)	)	)
.	.	.

Reuters	reuter	reuters


In [18]:

# (word, first POS tag, second POS tag): the same word is lemmatized under
# both tags to show how much the result depends on the part of speech.
examples = [
    ("left", "n", "v"),        # left / leave
    ("better", "a", "v"),      # good / better
    ("meeting", "n", "v"),     # meeting / meet
    ("building", "n", "v"),    # building / build
    ("written", "a", "v"),     # written / write
    ("lost", "a", "v"),        # lost / lose
    ("bound", "a", "v"),       # bound / bind
    ("closed", "a", "v"),      # closed / close
    ("increased", "a", "v"),   # increased / increase
    ("reduced", "a", "v"),     # reduced / reduce
    ("produced", "a", "v"),    # produced / produce
    ("developed", "a", "v"),   # developed / develop
    ("broken", "a", "v"),      # broken / break
    ("grown", "a", "v"),       # grown / grow
]

# Lemmatize first, print afterwards: the column width can then be derived from
# the longest cell, so the table stays aligned for words longer than 8
# characters (8 remains the minimum width).
comparison_rows = [
    (
        word,
        pos1,
        lemmatizer.lemmatize(word, pos=pos1),
        pos2,
        lemmatizer.lemmatize(word, pos=pos2),
    )
    for word, pos1, pos2 in examples
]
column_width = max(
    [8]
    + [
        len(cell)
        for word, _, lemma_1, _, lemma_2 in comparison_rows
        for cell in (word, lemma_1, lemma_2)
    ]
)

for word, pos1, lemma_1, pos2, lemma_2 in comparison_rows:
    print(
        f"{word:{column_width}} → {pos1}: {lemma_1:{column_width}} | "
        f"{pos2}: {lemma_2:{column_width}}"
    )


left     → n: left     | v: leave   
better   → a: good     | v: better  
meeting  → n: meeting  | v: meet    
building → n: building | v: build   
written  → a: written  | v: write   
lost     → a: lost     | v: lose    
bound    → a: bound    | v: bind    
closed   → a: closed   | v: close   
increased → a: increased | v: increase
reduced  → a: reduced  | v: reduce  
produced → a: produced | v: produce 
developed → a: developed | v: develop 
broken   → a: broken   | v: break   
grown    → a: grown    | v: grow    
