In [1]:
import pandas as pd

splits = {'train': 'plain_text/train-00000-of-00001.parquet',
          'test': 'plain_text/test-00000-of-00001.parquet',
          'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["train"])

In [2]:
COUNT_CORP = 5000
df= pd.concat([df[df['label']==0].head(COUNT_CORP//2), df[df['label']==1].head(COUNT_CORP//2)])

In [3]:
df_test = pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits["test"])
df_test= pd.concat([df_test[df_test['label']==0].head(COUNT_CORP//8), df_test[df_test['label']==1].head(COUNT_CORP//8)])

In [4]:
len(df)

5000

In [5]:
df[df['label']==1]

Unnamed: 0,text,label
12500,Zentropa has much in common with The Third Man...,1
12501,Zentropa is the most original movie I've seen ...,1
12502,Lars Von Trier is never backward in trying out...,1
12503,*Contains spoilers due to me having to describ...,1
12504,That was the first thing that sprang to mind a...,1
...,...,...
14995,No rubbish - no where even near rubbish. Not a...,1
14996,"A weird, witty and wonderful depiction of fami...",1
14997,This first two seasons of this comedy series w...,1
14998,"Typical 90's comedy, situational comedy simila...",1


In [6]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import os

In [7]:
# Скачиваем необходимые ресурсы NLTK
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yaros\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yaros\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yaros\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yaros\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
%load_ext autoreload
%autoreload 2


In [9]:
import re
from sentence_segmenter import SentenceSegmenter, test_problem_cases


test_problem_cases()

=== TEST CASE 1 ===
Input: Hello Dr. Smith! How are you? I'm fine.
Original text: Hello Dr. Smith! How are you? I'm fine.

=== DEBUG PROTECTION PATTERNS ===
Pattern 'common_abbr' matched: 'Dr.'
  Position: 6-9
  Context: ...ello Dr. Smit...
  Dots in match: 1

=== ALL DOTS IN TEXT ===
Dot at position 8: ...Hello Dr. Smith! H...
Dot at position 38: ...? I'm fine....

=== AFTER PROTECTION ===
Protected text: Hello Dr‹DOT› Smith! How are you? I'm fine.
Replacements: {'Dr.': 'Dr‹DOT›'}

=== SENTENCE SPLITTING ===
Position 19: '!' in context 'Smith! How'
  Should split: True

Position 32: '?' in context 'e you? I'm'
  Should split: True

Position 42: '.' in context ' fine.'
  Should split: True

=== FINAL RESULT ===
Sentence 1: Hello Dr. Smith!
Sentence 2: How are you?
Sentence 3: I'm fine.

Output: ['Hello Dr. Smith!', 'How are you?', "I'm fine."]


=== TEST CASE 2 ===
Input: The meeting is at 2:30 p.m. in room 3.14. Don't be late!
Original text: The meeting is at 2:30 p.m. in room 3.14. D

In [10]:
segmentor = SentenceSegmenter(debug=False)

In [11]:
cur = 6
for i, test_case in enumerate(df['text'][cur:cur+1], 1):
    print(f"=== TEST CASE {i} ===")
    print(f"Input: {test_case}")
    sentences = segmentor.segment_text(test_case)
    print(f"Output: {sentences}")

=== TEST CASE 1 ===
Input: Whoever wrote the screenplay for this movie obviously never consulted any books about Lucille Ball, especially her autobiography. I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi. I could write a whole list of factual errors, but it would go on for pages. In all, I believe that Lucille Ball is one of those inimitable people who simply cannot be portrayed by anyone other than themselves. If I were Lucie Arnaz and Desi, Jr., I would be irate at how many mistakes were made in this film. The filmmakers tried hard, but the movie seems awfully sloppy to me.
Output: ['Whoever wrote the screenplay for this movie obviously never consulted any books about Lucille Ball, especially her autobiography.', "I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi.", 'I could write a whole list of factual errors, but it would 

In [12]:
from text_tokenizer import TextTokenizer
class TextProcessor:
    def __init__(self):
        self.tokenizer = TextTokenizer()
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.segmenter = SentenceSegmenter()
        self.stopwords = set(stopwords.words('english'))
        self.stopwords.update(["the", "film", "movie", "like"])

    def process_sentence(self, sentence):
        """Обработка одного предложения"""
        tokens = self.tokenizer.tokenize(sentence)
        tokens = [word for word in tokens if word not in self.stopwords and len(word) > 2]
        processed = []

        for token in tokens:
            # Для слов применяем стемминг и лемматизацию
            if re.match(r'\b[a-zA-Z]+\b', token):
                stem = self.stemmer.stem(token.lower())
                lemma = self.lemmatizer.lemmatize(token.lower())
            else:
                # Для не-слов оставляем как есть
                stem = token
                lemma = token

            processed.append((token, stem, lemma))

        return processed

    def process_text(self, text):
        """Обработка всего текста"""
        sentences = self.segmenter.segment_text(text)
        all_processed = []

        for sentence in sentences:
            processed_sentence = self.process_sentence(sentence)
            all_processed.extend(processed_sentence)
            # Добавляем пустую строку между предложениями
            all_processed.append(('', '', ''))

        return all_processed[:-1]  # Убираем последнюю пустую строку

In [13]:
def create_annotation_files(df, split_name='train'):
    """Создание аннотированных файлов для датасета"""
    processor = TextProcessor()
    base_path = f'../assets/annotated-corpus/{split_name}'

    # Создаем директории
    os.makedirs(base_path, exist_ok=True)

    for idx, row in df.iterrows():
        text = row['text']
        label = 'pos' if row['label'] == 1 else 'neg'

        # Создаем поддиректорию для класса
        label_dir = os.path.join(base_path, label)
        os.makedirs(label_dir, exist_ok=True)

        # Обрабатываем текст
        annotations = processor.process_text(text)

        # Сохраняем в TSV
        file_path = os.path.join(label_dir, f"{idx}.tsv")
        with open(file_path, 'w', encoding='utf-8') as f:
            for token, stem, lemma in annotations:
                if token == '' and stem == '' and lemma == '':
                    f.write('\n')  # Разделитель между предложениями
                else:
                    f.write(f"{token}\t{stem}\t{lemma}\n")

        if idx % 100 == 0:
            print(f"Обработано {idx} документов...")

    print(f"Аннотации сохранены в {base_path}")

# Обрабатываем тренировочные данные
create_annotation_files(df, 'train')

Обработано 0 документов...
Обработано 100 документов...
Обработано 200 документов...
Обработано 300 документов...
Обработано 400 документов...
Обработано 500 документов...
Обработано 600 документов...
Обработано 700 документов...
Обработано 800 документов...
Обработано 900 документов...
Обработано 1000 документов...
Обработано 1100 документов...
Обработано 1200 документов...
Обработано 1300 документов...
Обработано 1400 документов...
Обработано 1500 документов...
Обработано 1600 документов...
Обработано 1700 документов...
Обработано 1800 документов...
Обработано 1900 документов...
Обработано 2000 документов...
Обработано 2100 документов...
Обработано 2200 документов...
Обработано 2300 документов...
Обработано 2400 документов...
Обработано 12500 документов...
Обработано 12600 документов...
Обработано 12700 документов...
Обработано 12800 документов...
Обработано 12900 документов...
Обработано 13000 документов...
Обработано 13100 документов...
Обработано 13200 документов...
Обработано 133

In [14]:
create_annotation_files(df_test, 'test')

Обработано 0 документов...
Обработано 100 документов...
Обработано 200 документов...
Обработано 300 документов...
Обработано 400 документов...
Обработано 500 документов...
Обработано 600 документов...
Обработано 12500 документов...
Обработано 12600 документов...
Обработано 12700 документов...
Обработано 12800 документов...
Обработано 12900 документов...
Обработано 13000 документов...
Обработано 13100 документов...
Аннотации сохранены в ../assets/annotated-corpus/test
