In [21]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Check whether TensorFlow can see a GPU and report the result
gpus = tf.config.list_physical_devices('GPU')
print(f"GPU devices found: {gpus}" if gpus else "GPU not found. Running on CPU.")

# Create the tokenizer (Russian RuBERT checkpoint)
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

def clean_and_tokenize(text):
    """Normalize a raw text line and split it into tokenizer tokens.

    The text is lowercased, then URLs (``http...`` / ``www...``) and
    ``@mentions`` are removed, every character that is neither a word
    character nor whitespace is dropped, and runs of whitespace are collapsed
    to a single space. The cleaned string is tokenized with the module-level
    ``tokenizer``.

    Args:
        text: Raw input string.

    Returns:
        The list of tokens produced by ``tokenizer.tokenize``.
    """
    # (pattern, replacement) pairs, applied in order.
    substitutions = (
        (r'http\S+|www\S+', ''),
        (r'@\w+', ''),
        (r'[^\w\sа-яё]', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.UNICODE)
    return tokenizer.tokenize(cleaned.strip())

def create_training_samples(tokens):
    """Build a next-token training pair from a token sequence.

    Args:
        tokens: Sequence of tokens.

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` is every token except
        the last and ``targets`` is every token except the first, or ``None``
        when the sequence has fewer than two tokens.
    """
    if len(tokens) >= 2:
        inputs, targets = tokens[:-1], tokens[1:]
        return inputs, targets
    return None

def process_file(input_path):
    """Read a UTF-8 text file and turn each line into a training sample.

    Every line is cleaned and tokenized with ``clean_and_tokenize`` and then
    converted to an input/target pair with ``create_training_samples``. Lines
    that do not yield a pair are skipped.

    Args:
        input_path: Path of the text file, one example per line.

    Returns:
        A list of dicts with keys ``'X'`` (input tokens) and ``'Y'`` (target
        tokens).
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    collected = []
    for line in lines:
        pair = create_training_samples(clean_and_tokenize(line))
        if not pair:
            continue
        inputs, targets = pair
        collected.append({'X': inputs, 'Y': targets})
    return collected

def tokens_to_indices(samples, vocab):
    """Map the tokens of every sample to integer ids.

    Args:
        samples: Iterable of dicts with ``'X'`` and ``'Y'`` token sequences.
        vocab: Mapping from token to integer id.

    Returns:
        A tuple ``(X_indices, Y_indices)`` of lists of id lists, in the same
        order as ``samples``. Tokens missing from ``vocab`` are encoded as 0.
    """
    def encode(sequence):
        # Unknown tokens fall back to id 0.
        return [vocab.get(tok, 0) for tok in sequence]

    X_indices = [encode(entry['X']) for entry in samples]
    Y_indices = [encode(entry['Y']) for entry in samples]
    return X_indices, Y_indices

input_file = '/home/assistant/text-autocomplete/data/tweets.txt'

samples = process_file(input_file)
if not samples:
    # Fail early with a clear message; otherwise max() below raises an opaque
    # "max() arg is an empty sequence" error.
    raise ValueError(f"No usable training samples found in {input_file}")

# Build the token vocabulary. Index 0 is reserved for padding; it is also the
# id tokens_to_indices assigns to tokens that are missing from the vocabulary.
all_tokens = [token for sample in samples for token in sample['X']] + [token for sample in samples for token in sample['Y']]
vocab = {token: idx + 1 for idx, token in enumerate(sorted(set(all_tokens)))}

X_indices, Y_indices = tokens_to_indices(samples, vocab)
max_len = max(len(x) for x in X_indices)
X_padded = pad_sequences(X_indices, maxlen=max_len, padding='post')
Y_padded = pad_sequences(Y_indices, maxlen=max_len, padding='post')

num_classes = len(vocab) + 1

# Targets stay as integer class ids of shape (samples, max_len). One-hot
# encoding them would materialize a dense (samples, max_len, num_classes)
# array, which is needlessly large; the sparse loss below consumes the
# integer ids directly and computes the same cross-entropy.

# Train / validation / test split (80 / 10 / 10)
X_train, X_temp, Y_train, Y_temp = train_test_split(X_padded, Y_padded, test_size=0.2, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

embedding_dim = 128

# Build the model: embedding -> LSTM emitting one output per time step ->
# per-step softmax over the vocabulary.
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
# it, and the sequence length is inferred from the data at fit time.
model = Sequential([
    Embedding(input_dim=num_classes, output_dim=embedding_dim),
    LSTM(256, return_sequences=True),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training
model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=64)

# Greedy text generation with the trained model
def generate_text(seed_text, max_length=20):
    """Greedily extend ``seed_text`` with tokens predicted by ``model``.

    The seed is preprocessed with ``clean_and_tokenize`` (the same cleaning
    used to build the training samples), so characters removed during
    training, such as punctuation, are removed from the seed as well. At each
    step the most probable next token is appended.

    Args:
        seed_text: Text to continue.
        max_length: Upper bound on the total number of tokens (seed plus
            generated) in the result. A seed that already has this many
            tokens is returned without generating anything.

    Returns:
        The seed and generated tokens joined by single spaces, or ``''`` when
        the seed contains no tokens after cleaning.
    """
    tokens = clean_and_tokenize(seed_text)
    if not tokens:
        # Nothing to condition on; indexing position -1 of the model output
        # would read the prediction of a padding step.
        return ''

    # Invert the vocabulary once instead of scanning it on every step.
    index_to_token = {idx: token for token, idx in vocab.items()}

    while len(tokens) < max_length:
        # The model was trained on sequences of at most max_len tokens, so
        # feed only the trailing window and read the prediction at the
        # window's last real position.
        context = tokens[-max_len:]
        x_idx = [vocab.get(token, 0) for token in context]
        x_padded = pad_sequences([x_idx], maxlen=max_len, padding='post')
        preds = model.predict(x_padded, verbose=0)[0, len(context) - 1]
        # Id 0 (padding) has no vocabulary entry, so .get() returns None.
        next_token = index_to_token.get(int(np.argmax(preds)))
        if next_token is None or next_token == '[SEP]':
            break
        tokens.append(next_token)
    return ' '.join(tokens)

# Generation example: continue a short Russian seed phrase and print the result
print(generate_text("Привет, как дела"))


2025-09-11 23:47:05.263746: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU devices found: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
