In [1]:
import pandas as pd
import re
import os
import nltk
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lev\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Lev\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
stop_words = set(stopwords.words('english'))

In [3]:
df = pd.read_csv('train.csv', header=None)

text_column = df.columns[2]
texts = df[text_column].astype(str).tolist()

In [4]:
def split_sentences_regex(text):
    pattern = r'\s*[!?.]+\s+'
    sentences = re.split(pattern, text)
    return [s.strip() for s in sentences if s.strip()]

In [14]:
def tokenize_with_entities(text):
    tokens = []
    
    patterns = {
        'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
        
        'phone': r'\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b',
        
        'address': (
            r'\b\d+\s+[A-Z][a-z]+\s+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr)\b|'
            r'(?:Apt|Apartment|Suite|Ste)\.?\s*\d+[A-Z]?\b'
        )
    }
    
    combined_pattern = re.compile(
        f'(?P<email>{patterns["email"]})|(?P<phone>{patterns["phone"]})|(?P<address>{patterns["address"]})',
        re.IGNORECASE
    )
    
    last_end = 0
    
    for match in combined_pattern.finditer(text):
        if match.start() > last_end:
            interim_text = text[last_end:match.start()]
            words = re.findall(r'\b\w+\b', interim_text.lower())
            for word in words:
                tokens.append({'token': word, 'type': 'word'})
        
        entity_type = 'word'
        for group_name in ['email', 'phone', 'address']:
            if match.group(group_name):
                entity_type = group_name
                break
        
        tokens.append({
            'token': match.group().lower() if entity_type in ['word', 'email', 'phone'] else match.group(),
            'type': entity_type
        })
        last_end = match.end()
    
    if last_end < len(text):
        interim_text = text[last_end:]
        words = re.findall(r'\b\w+\b', interim_text.lower())
        for word in words:
            tokens.append({'token': word, 'type': 'word'})
    
    return tokens

In [6]:
def stemming(token_objects):
    return [
        stemmer.stem(item['token']) if item['type'] == 'word' else item['token']
        for item in token_objects
    ]

def lemmatization(token_objects):
    return [
        lemmatizer.lemmatize(item['token']) if item['type'] == 'word' else item['token']
        for item in token_objects
    ]

In [9]:
def generate_tsv_annotation(text, remove_stopwords=True):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    
    sentences = split_sentences_regex(text)
    sentence_annotations = []
    
    for sentence in sentences:
        token_objects = tokenize_with_entities(sentence)
        
        if remove_stopwords:
            token_objects = [
                item for item in token_objects 
                if not (item['type'] == 'word' and item['token'] in stop_words)
            ]
        
        if not token_objects:
            continue
            
        stemmed = stemming(token_objects)
        lemmatized = lemmatization(token_objects)
        
        token_annotations = [
            f"{obj['token']}\t{stem}\t{lemma}"
            for obj, stem, lemma in zip(token_objects, stemmed, lemmatized)
        ]
        
        sentence_annotations.append('\n'.join(token_annotations))
    
    return '\n\n'.join(sentence_annotations)


def process_dataset_to_tsv(df, text_column_index=2, output_dir='assets', remove_stopwords=True):
    os.makedirs(output_dir, exist_ok=True)
    
    text_column = df.columns[text_column_index]
    texts = df[text_column].astype(str).tolist()
    
    for idx, text in enumerate(texts):
        tsv_content = generate_tsv_annotation(text, remove_stopwords=remove_stopwords)
        
        filename = os.path.join(output_dir, f"{idx+1}.tsv")
        
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(tsv_content)
        
        if (idx + 1) % 100 == 0:
            print(f"{idx + 1}/{len(texts)}")
    
    print(f"{len(texts)} файлов сохранено в {output_dir}")

In [10]:
process_dataset_to_tsv(df, text_column_index=2, output_dir='assets', remove_stopwords=False)

100/120000
200/120000
300/120000
400/120000
500/120000
600/120000
700/120000
800/120000
900/120000
1000/120000
1100/120000
1200/120000
1300/120000
1400/120000
1500/120000
1600/120000
1700/120000
1800/120000
1900/120000
2000/120000
2100/120000
2200/120000
2300/120000
2400/120000
2500/120000
2600/120000
2700/120000
2800/120000
2900/120000
3000/120000
3100/120000
3200/120000
3300/120000
3400/120000
3500/120000
3600/120000
3700/120000
3800/120000
3900/120000
4000/120000
4100/120000
4200/120000
4300/120000
4400/120000
4500/120000
4600/120000
4700/120000
4800/120000
4900/120000
5000/120000
5100/120000
5200/120000
5300/120000
5400/120000
5500/120000
5600/120000
5700/120000
5800/120000
5900/120000
6000/120000
6100/120000
6200/120000
6300/120000
6400/120000
6500/120000
6600/120000
6700/120000
6800/120000
6900/120000
7000/120000
7100/120000
7200/120000
7300/120000
7400/120000
7500/120000
7600/120000
7700/120000
7800/120000
7900/120000
8000/120000
8100/120000
8200/120000
8300/120000
8400/120000
8

In [21]:
text = '''Alternative phone: 8(918)3213412. Write to abc@abc.com.
NY office: 123 Fifth Avenue, Apt 45. Call +1 (555) 123-4567!
'''
sentences = split_sentences_regex(text)
sentence_annotations = []

for sentence in sentences:
    token_objects = tokenize_with_entities(sentence)
    print(token_objects)

[{'token': 'alternative', 'type': 'word'}, {'token': 'phone', 'type': 'word'}, {'token': '8(918)3213412', 'type': 'phone'}]
[{'token': 'write', 'type': 'word'}, {'token': 'to', 'type': 'word'}, {'token': 'abc@abc.com', 'type': 'email'}]
[{'token': 'ny', 'type': 'word'}, {'token': 'office', 'type': 'word'}, {'token': '123 Fifth Avenue', 'type': 'address'}, {'token': 'Apt 45', 'type': 'address'}]
[{'token': 'call', 'type': 'word'}, {'token': '1 (555) 123-4567', 'type': 'phone'}]


In [25]:
test_cases = [
    "He banks at the local bank.",
    "The leaves leave the tree.",
    "I saw a saw in the woods.",
    "The fly can fly away.",
    "He wound the bandage around the wound.",
    "She is running fast. The running water is cold.",
    "He draws a drawing every day.",
    "The dogs dog the cat.",
    "The mice mouse around quietly.",
]

for idx, sentence in enumerate(test_cases, 1):
    tokens = tokenize_with_entities(sentence)
    
    lemmas = lemmatization(tokens)
    
    key_words = {'banks','bank','leaves','leave','saw','fly','wound','running','draws','drawing',
                 'dogs','dog','mice','mouse'}
    
    for tok_obj, lemma in zip(tokens, lemmas):
        token = tok_obj['token']
        if token in key_words:
            lemma_as_verb = lemmatizer.lemmatize(token, pos='v')
            lemma_as_noun = lemmatizer.lemmatize(token, pos='n')
            
            if lemma_as_verb != lemma_as_noun:
                print(sentence)
                print(f"{token} -> {lemma}")
                print(f"Как глагол: {lemma_as_verb}, как существительное: {lemma_as_noun}\n")

The leaves leave the tree.
leaves -> leaf
Как глагол: leave, как существительное: leaf

He wound the bandage around the wound.
wound -> wound
Как глагол: wind, как существительное: wound

He wound the bandage around the wound.
wound -> wound
Как глагол: wind, как существительное: wound

She is running fast. The running water is cold.
running -> running
Как глагол: run, как существительное: running

She is running fast. The running water is cold.
running -> running
Как глагол: run, как существительное: running

He draws a drawing every day.
drawing -> drawing
Как глагол: draw, как существительное: drawing

The mice mouse around quietly.
mice -> mouse
Как глагол: mice, как существительное: mouse

