In [2]:
!pip install pymorphy3 nltk scikit-learn torch umap-learn matplotlib



In [3]:
# !unzip 'archive (4).zip'

In [4]:
import os
# from pymorphy3 import MorphAnalyzer
# import re
# import nltk

# nltk.download('stopwords')  # Скачиваем стоп-слова для всех языков
# nltk.download('punkt')      # Токенизатор (может потребоваться)

In [2]:
from nltk.corpus import stopwords
print(stopwords.words('russian')[:10])  # Пример: ['и', 'в', 'во', 'не', 'что', ...]


['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со']


In [5]:
# morph = MorphAnalyzer()

def load_texts_from_folder(folder_path):
    texts = []
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
            texts.append(f.read())
    return texts

# Загрузка данных
pos_texts = load_texts_from_folder("dataset/pos")
neg_texts = load_texts_from_folder("dataset/neg")
neut_texts = load_texts_from_folder("dataset/neu")

# Объединение и маркировка
texts = pos_texts + neg_texts + neut_texts
labels = [1] * len(pos_texts) + [0] * len(neg_texts) + [2] * len(neut_texts)  # 1=pos, 0=neg, 2=neut

In [None]:
import re
import concurrent.futures
from pymorphy3 import MorphAnalyzer
from nltk.corpus import stopwords

# Инициализация объектов
morph = MorphAnalyzer()
stop_words = set(stopwords.words('russian'))

def preprocess(text):
    # Удаление пунктуации с использованием str.translate
    text = text.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'))
    text = text.lower()  # Приведение к нижнему регистру
    words = text.split()

    # Лемматизация с использованием множества для ускорения
    words = [morph.parse(word)[0].normal_form for word in words if word not in stop_words]

    return " ".join(words)

def preprocess_texts(texts):
    """Параллельная обработка всех текстов"""
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return list(executor.map(preprocess, texts))

# Пример использования:
processed_texts = preprocess_texts(texts)


In [None]:
# Сохранение processed_texts в файл (каждый текст на новой строке)
with open('processed_texts.txt', 'w', encoding='utf-8') as f:
    for text in processed_texts:
        f.write(text + '\n')  # Добавляем перенос строки

In [6]:
# 1. Загрузка предобработанных текстов из файла
def load_processed_texts(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        processed_texts = [line.strip() for line in f.readlines()]
    return processed_texts

# Укажите путь к вашему файлу
processed_texts = load_processed_texts('processed_texts (1).txt')

In [7]:
import re
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [8]:
vectorizer = CountVectorizer(
    max_features=5000,
    token_pattern=r'(?u)\b\w+\b',
    ngram_range=(1, 1)
)
X = vectorizer.fit_transform(processed_texts).toarray()
print(f"Размерность векторных представлений: {X.shape}")

Размерность векторных представлений: (131669, 5000)


In [9]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, embedding_dim=500):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, embedding_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded



In [10]:
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train = torch.FloatTensor(X_train).to(device)
X_test = torch.FloatTensor(X_test).to(device)

In [11]:
input_dim = X.shape[1]
model = Autoencoder(input_dim, embedding_dim=500).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 10
batch_size = 32

In [12]:
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for i in range(0, len(X_train), batch_size):
        batch = X_train[i:i+batch_size]
        optimizer.zero_grad()
        outputs = model(batch)
        loss = criterion(outputs, batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    with torch.no_grad():
        test_loss = criterion(model(X_test), X_test).item()

    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(X_train):.4f}, Test Loss: {test_loss:.4f}')


Epoch 1/10, Train Loss: 0.0020, Test Loss: 0.0646
Epoch 2/10, Train Loss: 0.0020, Test Loss: 0.0645
Epoch 3/10, Train Loss: 0.0019, Test Loss: 0.0554
Epoch 4/10, Train Loss: 0.0017, Test Loss: 0.0535
Epoch 5/10, Train Loss: 0.0017, Test Loss: 0.0527
Epoch 6/10, Train Loss: 0.0016, Test Loss: 0.0523
Epoch 7/10, Train Loss: 0.0016, Test Loss: 0.0520
Epoch 8/10, Train Loss: 0.0016, Test Loss: 0.0517
Epoch 9/10, Train Loss: 0.0016, Test Loss: 0.0514
Epoch 10/10, Train Loss: 0.0016, Test Loss: 0.0511


In [13]:
model.eval()
with torch.no_grad():
    embeddings = model.encoder(torch.FloatTensor(X).to(device)).cpu().numpy()

print(f"Размерность эмбеддингов: {embeddings.shape}")

Размерность эмбеддингов: (131669, 500)


In [14]:
torch.save(model.state_dict(), 'autoencoder_model.pth')


In [15]:
with open('text_embeddings.pkl', 'wb') as f:
    pickle.dump({
        'embeddings': embeddings,
        'vocabulary': vectorizer.get_feature_names_out()
    }, f)

In [17]:
import json
import os

# Load the embeddings data
with open('text_embeddings.pkl', 'rb') as f:
    embeddings_data = pickle.load(f)

# Create a list to store the output data in the desired format
output_list = []

# Get all filenames in order (pos first, then neg, then neut)
filenames = []
for folder, label in [('pos', 'positive'), ('neg', 'negative'), ('neu', 'neutral')]:
    folder_path = f"dataset/{folder}"
    for filename in sorted(os.listdir(folder_path)):
        filenames.append({
            "filename": filename,
            "label": label
        })

# Verify we have the same number of files as embeddings
assert len(filenames) == len(embeddings_data['embeddings']), "Mismatch between files and embeddings"

# Create the output list
for i, (file_info, embedding) in enumerate(zip(filenames, embeddings_data['embeddings'])):
    output_list.append({
        "filename": file_info["filename"],
        "label": file_info["label"],
        "embedding": embedding.tolist()  # Convert numpy array to list
    })


with open('ae_embeddings.json', 'w', encoding='utf-8') as f:
    json.dump(output_list, f, ensure_ascii=False, indent=4)

print(f"Data saved in final_embeddings.json with {len(output_list)} entries")

Data saved in final_embeddings.json with 131669 entries


In [2]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import json

def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)  
    return pd.DataFrame(data)
df = load_data('ae_embeddings.json')
df.head()

Unnamed: 0,filename,label,embedding
0,1000023-0.txt,positive,"[-0.010793164372444153, 0.04824411869049072, 0..."
1,1000023-1.txt,positive,"[0.029604390263557434, 0.5547586679458618, 0.3..."
2,1000023-10.txt,positive,"[-0.11132340878248215, 0.21457207202911377, 0...."
3,1000023-2.txt,positive,"[0.2803587317466736, -0.07406213879585266, -0...."
4,1000023-4.txt,positive,"[-0.0559392049908638, 0.463557630777359, -0.20..."


In [9]:
X.shape

(131669, 500)

In [3]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
X = np.array(df['embedding'].tolist())
y = df['label_encoded'].values

In [4]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [10]:
lr = LogisticRegression(max_iter=10)
lr.fit(X_train, y_train)
print("Logistic Regression Test Results:")
print(classification_report(y_test, lr.predict(X_test)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Test Results:
              precision    recall  f1-score   support

           0       0.65      0.51      0.57      2993
           1       0.44      0.16      0.24      3643
           2       0.77      0.94      0.85     13115

    accuracy                           0.73     19751
   macro avg       0.62      0.54      0.55     19751
weighted avg       0.69      0.73      0.69     19751



In [5]:
lr_balanced = LogisticRegression(max_iter=20, class_weight='balanced')
lr_balanced.fit(X_train, y_train)
print("Balanced Logistic Regression Test Results:")
print(classification_report(y_test, lr_balanced.predict(X_test)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Balanced Logistic Regression Test Results:
              precision    recall  f1-score   support

           0       0.52      0.72      0.61      2993
           1       0.33      0.47      0.39      3643
           2       0.89      0.71      0.79     13115

    accuracy                           0.66     19751
   macro avg       0.58      0.63      0.59     19751
weighted avg       0.73      0.66      0.69     19751



In [16]:
dt = DecisionTreeClassifier(
    random_state=42,
    max_depth=30,         
    min_samples_split=40,
    max_leaf_nodes=100      
)
dt.fit(X_train, y_train)
print("Decision Tree Test Results:")
print(classification_report(y_test, dt.predict(X_test)))

Decision Tree Test Results:
              precision    recall  f1-score   support

           0       0.45      0.23      0.31      2993
           1       0.33      0.01      0.02      3643
           2       0.70      0.96      0.81     13115

    accuracy                           0.68     19751
   macro avg       0.49      0.40      0.38     19751
weighted avg       0.59      0.68      0.59     19751



In [6]:
dt_balanced = DecisionTreeClassifier(random_state=42,
    max_depth=500,         
    min_samples_split=100,
    max_leaf_nodes=750,  
    class_weight='balanced')
dt_balanced.fit(X_train, y_train)
print("Balanced Decision Tree Test Results:")
print(classification_report(y_test, dt_balanced.predict(X_test)))

Balanced Decision Tree Test Results:
              precision    recall  f1-score   support

           0       0.30      0.49      0.37      2993
           1       0.23      0.43      0.30      3643
           2       0.83      0.49      0.61     13115

    accuracy                           0.48     19751
   macro avg       0.45      0.47      0.43     19751
weighted avg       0.64      0.48      0.52     19751



In [4]:
X = np.array(df['embedding'].tolist(), dtype=np.float32)  # Add dtype=np.float32
y = df['label_encoded'].values

In [5]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [6]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
val_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
test_dataset = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))

In [7]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )
    
    def forward(self, x):
        return self.layers(x)

In [8]:
input_size = X_train.shape[1]
num_classes = len(le.classes_)
batch_size = 32
epochs = 10
learning_rate = 0.001

In [9]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [10]:
class_counts = np.bincount(y_train)
class_weights = 1. / class_counts
class_weights = torch.tensor(class_weights, dtype=torch.float32)
criterion = nn.CrossEntropyLoss(weight=class_weights)
model_balanced = NeuralNetwork(input_size, num_classes)
optimizer = optim.Adam(model_balanced.parameters(), learning_rate)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [22]:
model_balanced = model_balanced.to(device)
criterion = criterion.to(device)
optimizer = optim.Adam(model_balanced.parameters(), learning_rate)

In [23]:
for epoch in range(epochs):
    model_balanced.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model_balanced(inputs)  # Теперь всё на GPU
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    model_balanced.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # Не забываем!
            
            outputs = model_balanced(inputs)
            val_loss += criterion(outputs, labels).item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
    
    val_acc = correct / len(val_dataset)
    print(f"Epoch {epoch+1}/{epochs} | Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.4f}")

Epoch 1/10 | Val Loss: 501.4197 | Val Accuracy: 0.6713
Epoch 2/10 | Val Loss: 504.1313 | Val Accuracy: 0.6861
Epoch 3/10 | Val Loss: 497.3885 | Val Accuracy: 0.6699
Epoch 4/10 | Val Loss: 495.4450 | Val Accuracy: 0.6686
Epoch 5/10 | Val Loss: 494.1007 | Val Accuracy: 0.6749
Epoch 6/10 | Val Loss: 493.9441 | Val Accuracy: 0.6699
Epoch 7/10 | Val Loss: 493.9346 | Val Accuracy: 0.6708
Epoch 8/10 | Val Loss: 495.8670 | Val Accuracy: 0.6610
Epoch 9/10 | Val Loss: 494.5276 | Val Accuracy: 0.6550
Epoch 10/10 | Val Loss: 496.8359 | Val Accuracy: 0.6388


In [25]:
model_balanced.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model_balanced(inputs)
        preds = outputs.argmax(dim=1)
        
        # Переносим тензоры на CPU перед преобразованием в numpy
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

print("\nNeural Network Test Results:")
print(classification_report(all_labels, all_preds))


Neural Network Test Results:
              precision    recall  f1-score   support

           0       0.50      0.75      0.60      2993
           1       0.31      0.47      0.38      3643
           2       0.90      0.67      0.77     13115

    accuracy                           0.65     19751
   macro avg       0.57      0.63      0.58     19751
weighted avg       0.73      0.65      0.67     19751



In [12]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import time
import numpy as np

class BidirectionalLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size,
            hidden_size // 2,
            num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_prob if num_layers > 1 else 0
        )
        
        self.batch_norm = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        
        self.init_weights()

    def init_weights(self):
        for name, param in self.lstm.named_parameters():
            if 'weight_ih' in name:
                nn.init.xavier_normal_(param.data)
            elif 'weight_hh' in name:
                nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)

    def forward(self, x):
        x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        out = lstm_out[:, -1, :]
        out = self.batch_norm(out)
        out = self.fc(out)
        return out

def initialize_model(X_train, y_train, num_classes):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    model_config = {
        'input_size': X_train.shape[1],
        'hidden_size': 128,
        'num_layers': 2,
        'output_size': num_classes,
        'dropout_prob': 0.4
    }
    
    model = BidirectionalLSTM(**model_config).to(device)
    
    training_config = {
        'learning_rate': 0.00001,
        'weight_decay': 1e-4,
        'clip_value': 0.5
    }
    
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=training_config['learning_rate'],
        weight_decay=training_config['weight_decay']
    )
    
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    criterion = nn.CrossEntropyLoss(weight=class_weights)
    
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)
    
    return device, model, optimizer, criterion, scheduler

def train_model(model, train_loader, val_loader, optimizer, criterion, device, 
                scheduler=None, epochs=20, clip_value=0.5, patience=3):
    best_val_loss = float('inf')
    no_improve = 0
    
    for epoch in range(epochs):
        start_time = time.time()
        model.train()
        train_loss = 0
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * inputs.size(0)
        
        model.eval()
        val_loss = 0
        correct = 0
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                correct += (preds == labels).sum().item()
                
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
        train_loss /= len(train_loader.dataset)
        val_loss /= len(val_loader.dataset)
        val_acc = correct / len(val_loader.dataset)
        
        if scheduler:
            scheduler.step(val_loss)
        
        epoch_time = time.time() - start_time
        print(f'Epoch {epoch+1}/{epochs} | Time: {epoch_time:.2f}s')
        print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
        print(f'Val Accuracy: {val_acc:.4f}')
        print(classification_report(all_labels, all_preds, digits=4))
        print('-' * 60)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improve = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

def evaluate_model(model, test_loader, device):
    model.load_state_dict(torch.load('best_model.pth'))
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    print("\nFinal Test Results:")
    print(classification_report(all_labels, all_preds, digits=4))
    return all_preds, all_labels



if __name__ == "__main__":
    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
    val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.long))
    test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

    batch_size = 32
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    device, model, optimizer, criterion, scheduler = initialize_model(
        X_train=X_train,
        y_train=y_train,
        num_classes=len(np.unique(y_train))
    )
    
    train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        scheduler=scheduler,
        epochs=20,
        clip_value=0.5,
        patience=5
    )
    
    test_preds, test_labels = evaluate_model(model, test_loader, device)


Epoch 1/20 | Time: 8.42s
Train Loss: 1.0330 | Val Loss: 0.8998
Val Accuracy: 0.6082
              precision    recall  f1-score   support

           0     0.3834    0.7432    0.5058      2963
           1     0.3074    0.3585    0.3310      3704
           2     0.8756    0.6482    0.7450     13083

    accuracy                         0.6082     19750
   macro avg     0.5221    0.5833    0.5273     19750
weighted avg     0.6952    0.6082    0.6314     19750

------------------------------------------------------------
Epoch 2/20 | Time: 9.38s
Train Loss: 0.9120 | Val Loss: 0.8533
Val Accuracy: 0.6252
              precision    recall  f1-score   support

           0     0.3977    0.7830    0.5275      2963
           1     0.3198    0.3539    0.3360      3704
           2     0.8879    0.6663    0.7613     13083

    accuracy                         0.6252     19750
   macro avg     0.5352    0.6011    0.5416     19750
weighted avg     0.7078    0.6252    0.6465     19750

---------