In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import json

In [2]:
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)  
    return pd.DataFrame(data)

df = load_data('/kaggle/input/kinopoisk-embeddings/embeddings.json')

In [3]:
df.head()

Unnamed: 0,filename,label,embedding
0,1000083-0.txt,negative,"[0.0019, 0.0049, 0.0015, 0.0018, 0.0018, 0.001..."
1,1000083-1.txt,negative,"[0.001, 0.0007, 0.0008, 0.0006, 0.0012, 0.0007..."
2,1000125-3.txt,negative,"[0.001, 0.0024, 0.0013, 0.0013, 0.0012, 0.0015..."
3,1000125-4.txt,negative,"[0.0, 0.0033, 0.0014, 0.0014, 0.0007, 0.0016, ..."
4,1000125-6.txt,negative,"[0.0004, 0.0026, 0.0004, 0.0016, 0.0036, 0.003..."


In [4]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

In [5]:
X = np.array(df['embedding'].tolist())
y = df['label_encoded'].values

In [10]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [9]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
print("Logistic Regression Test Results:")
print(classification_report(y_test, lr.predict(X_test)))

Logistic Regression Test Results:
              precision    recall  f1-score   support

           0       0.76      0.18      0.30      2921
           1       0.58      0.02      0.04      3780
           2       0.69      1.00      0.81     13050

    accuracy                           0.69     19751
   macro avg       0.68      0.40      0.39     19751
weighted avg       0.68      0.69      0.59     19751



In [10]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
print("Decision Tree Test Results:")
print(classification_report(y_test, dt.predict(X_test)))

Decision Tree Test Results:
              precision    recall  f1-score   support

           0       0.30      0.29      0.29      2921
           1       0.24      0.24      0.24      3780
           2       0.73      0.73      0.73     13050

    accuracy                           0.57     19751
   macro avg       0.42      0.42      0.42     19751
weighted avg       0.57      0.57      0.57     19751



In [11]:
lr_balanced = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_balanced.fit(X_train, y_train)
print("Balanced Logistic Regression Test Results:")
print(classification_report(y_test, lr_balanced.predict(X_test)))

Balanced Logistic Regression Test Results:
              precision    recall  f1-score   support

           0       0.52      0.73      0.61      2921
           1       0.35      0.42      0.38      3780
           2       0.87      0.75      0.80     13050

    accuracy                           0.68     19751
   macro avg       0.58      0.63      0.60     19751
weighted avg       0.72      0.68      0.69     19751



In [12]:
dt_balanced = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt_balanced.fit(X_train, y_train)
print("Balanced Decision Tree Test Results:")
print(classification_report(y_test, dt_balanced.predict(X_test)))

Balanced Decision Tree Test Results:
              precision    recall  f1-score   support

           0       0.29      0.31      0.30      2921
           1       0.23      0.25      0.24      3780
           2       0.73      0.70      0.71     13050

    accuracy                           0.55     19751
   macro avg       0.42      0.42      0.42     19751
weighted avg       0.57      0.55      0.56     19751



In [None]:
X = np.array(df['embedding'].tolist(), dtype=np.float32)  # Add dtype=np.float32
y = df['label_encoded'].values

In [12]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [13]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
val_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
test_dataset = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))

In [19]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )
    
    def forward(self, x):
        return self.layers(x)

In [8]:
input_size = X_train.shape[1]
num_classes = len(le.classes_)
batch_size = 32
epochs = 20
learning_rate = 0.001

In [21]:
model = NeuralNetwork(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [23]:
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
    
    val_acc = correct / len(val_dataset)
    print(f"Epoch {epoch+1}/{epochs} | Val Accuracy: {val_acc:.4f}")

Epoch 1/20 | Val Accuracy: 0.7364
Epoch 2/20 | Val Accuracy: 0.7442
Epoch 3/20 | Val Accuracy: 0.7475
Epoch 4/20 | Val Accuracy: 0.7470
Epoch 5/20 | Val Accuracy: 0.7480
Epoch 6/20 | Val Accuracy: 0.7487
Epoch 7/20 | Val Accuracy: 0.7477
Epoch 8/20 | Val Accuracy: 0.7473
Epoch 9/20 | Val Accuracy: 0.7484
Epoch 10/20 | Val Accuracy: 0.7475
Epoch 11/20 | Val Accuracy: 0.7460
Epoch 12/20 | Val Accuracy: 0.7456
Epoch 13/20 | Val Accuracy: 0.7447
Epoch 14/20 | Val Accuracy: 0.7438
Epoch 15/20 | Val Accuracy: 0.7424
Epoch 16/20 | Val Accuracy: 0.7388
Epoch 17/20 | Val Accuracy: 0.7407
Epoch 18/20 | Val Accuracy: 0.7400
Epoch 19/20 | Val Accuracy: 0.7367
Epoch 20/20 | Val Accuracy: 0.7356


In [24]:
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        preds = outputs.argmax(dim=1)
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

print("\nNeural Network Test Results:")
print(classification_report(all_labels, all_preds))


Neural Network Test Results:
              precision    recall  f1-score   support

           0       0.61      0.61      0.61      2921
           1       0.43      0.22      0.29      3780
           2       0.80      0.92      0.86     13050

    accuracy                           0.74     19751
   macro avg       0.61      0.58      0.58     19751
weighted avg       0.70      0.74      0.71     19751



In [32]:
class_counts = np.bincount(y_train)
class_weights = 1. / class_counts
class_weights = torch.tensor(class_weights, dtype=torch.float32)
criterion_b = nn.CrossEntropyLoss(weight=class_weights)
model_balanced = NeuralNetwork(input_size, num_classes)
optimizer = optim.Adam(model_balanced.parameters(), lr=0.00001)

In [33]:
for epoch in range(epochs):
    model_balanced.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model_balanced(inputs)
        loss = criterion_b(outputs, labels)
        loss.backward()
        optimizer.step()
    
    
    model_balanced.eval()
    val_loss = 0
    correct_balanced = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model_balanced(inputs)
            val_loss += criterion_b(outputs, labels).item()
            preds_balanced = outputs.argmax(dim=1)
            correct_balanced += (preds_balanced == labels).sum().item()
    
    val_acc = correct_balanced / len(val_dataset)
    print(f"Epoch {epoch+1}/{epochs} | Val Accuracy: {val_acc:.4f}")

Epoch 1/20 | Val Accuracy: 0.6595
Epoch 2/20 | Val Accuracy: 0.6595
Epoch 3/20 | Val Accuracy: 0.6595
Epoch 4/20 | Val Accuracy: 0.6596
Epoch 5/20 | Val Accuracy: 0.6838
Epoch 6/20 | Val Accuracy: 0.7185
Epoch 7/20 | Val Accuracy: 0.7139
Epoch 8/20 | Val Accuracy: 0.6977
Epoch 9/20 | Val Accuracy: 0.6905
Epoch 10/20 | Val Accuracy: 0.6892
Epoch 11/20 | Val Accuracy: 0.6920
Epoch 12/20 | Val Accuracy: 0.6848
Epoch 13/20 | Val Accuracy: 0.6892
Epoch 14/20 | Val Accuracy: 0.6866
Epoch 15/20 | Val Accuracy: 0.6870
Epoch 16/20 | Val Accuracy: 0.6905
Epoch 17/20 | Val Accuracy: 0.6884
Epoch 18/20 | Val Accuracy: 0.6883
Epoch 19/20 | Val Accuracy: 0.6790
Epoch 20/20 | Val Accuracy: 0.6855


In [34]:
model_balanced.eval()
all_preds_balanced = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model_balanced(inputs)
        preds_balanced = outputs.argmax(dim=1)
        all_preds_balanced.extend(preds_balanced.numpy())
        all_labels.extend(labels.numpy())

print("\nNeural Network Test Results:")
print(classification_report(all_labels, all_preds_balanced))


Neural Network Test Results:
              precision    recall  f1-score   support

           0       0.49      0.75      0.59      2921
           1       0.29      0.23      0.26      3780
           2       0.85      0.80      0.82     13050

    accuracy                           0.68     19751
   macro avg       0.54      0.59      0.56     19751
weighted avg       0.69      0.68      0.68     19751



In [29]:
print("\nModel Comparison:")
print(f"Logistic Regression Accuracy: {lr.score(X_test, y_test):.4f}")
print(f"Decision Tree Accuracy: {dt.score(X_test, y_test):.4f}")
print(f"Neural Network Accuracy: {(np.array(all_preds) == np.array(all_labels)).mean():.4f}") 


Model Comparison:
Logistic Regression Accuracy: 0.6894
Decision Tree Accuracy: 0.5690
Neural Network Accuracy: 0.7395


In [35]:
print("\nBalanced Model Comparison:")
print(f"Logistic Regression Accuracy: {lr_balanced.score(X_test, y_test):.4f}")
print(f"Decision Tree Accuracy: {dt_balanced.score(X_test, y_test):.4f}")
print(f"Neural Network Accuracy: {(np.array(all_preds_balanced) == np.array(all_labels)).mean():.4f}") 


Balanced Model Comparison:
Logistic Regression Accuracy: 0.6811
Decision Tree Accuracy: 0.5548
Neural Network Accuracy: 0.6832


Using Minority class oversampling

In [None]:
unique_classes, class_counts = np.unique(y_train, return_counts=True)
max_count = np.max(class_counts)

X_train_balanced = []
y_train_balanced = []

for cls in unique_classes:
    cls_indices = np.where(y_train == cls)[0]
    n_samples = max_count - len(cls_indices)
    if n_samples > 0:
        selected_indices = np.random.choice(cls_indices, size=n_samples, replace=True)
        X_train_balanced.append(X_train[selected_indices])
        y_train_balanced.append(y_train[selected_indices])
    X_train_balanced.append(X_train[cls_indices])
    y_train_balanced.append(y_train[cls_indices])

X_train_balanced = np.concatenate(X_train_balanced)
y_train_balanced = np.concatenate(y_train_balanced)
shuffle_idx = np.random.permutation(len(X_train_balanced))
X_train_balanced = X_train_balanced[shuffle_idx]
y_train_balanced = y_train_balanced[shuffle_idx]

In [56]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_balanced, y_train_balanced)
print("Logistic Regression Test Results:")
print(classification_report(y_test, lr.predict(X_test)))

Logistic Regression Test Results:
              precision    recall  f1-score   support

           0       0.53      0.73      0.61      2921
           1       0.35      0.44      0.39      3780
           2       0.88      0.74      0.80     13050

    accuracy                           0.68     19751
   macro avg       0.59      0.64      0.60     19751
weighted avg       0.73      0.68      0.70     19751



In [59]:
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')
dt.fit(X_train_balanced, y_train_balanced)
print("Balanced Decision Tree Test Results:")
print(classification_report(y_test, dt.predict(X_test)))

Balanced Decision Tree Test Results:
              precision    recall  f1-score   support

           0       0.27      0.28      0.28      2921
           1       0.22      0.24      0.23      3780
           2       0.72      0.70      0.71     13050

    accuracy                           0.55     19751
   macro avg       0.40      0.41      0.40     19751
weighted avg       0.56      0.55      0.55     19751



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report
import time
import numpy as np


class BidirectionalLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

       
        self.lstm = nn.LSTM(
            input_size,
            hidden_size // 2, 
            num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_prob if num_layers > 1 else 0
        )
        
       
        self.batch_norm = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        
      
        self.init_weights()

    def init_weights(self):
        for name, param in self.lstm.named_parameters():
            if 'weight_ih' in name:
                nn.init.xavier_normal_(param.data)
            elif 'weight_hh' in name:
                nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)

    def forward(self, x):
        
        x = x.unsqueeze(1)
        
       
        lstm_out, _ = self.lstm(x) 
        
        
        out = lstm_out[:, -1, :]
        
       
        out = self.batch_norm(out)
        
   
        out = self.fc(out)
        return out


def initialize_model(X_train, num_classes):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    model_config = {
        'input_size': X_train.shape[1],
        'hidden_size': 128,
        'num_layers': 2,
        'output_size': num_classes,
        'dropout_prob': 0.1
    }
    
    model = BidirectionalLSTM(**model_config).to(device)
    
    training_config = {
        'learning_rate': 0.0001,
        'weight_decay': 1e-4,
        'clip_value': 0.5
    }
    
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=training_config['learning_rate'],
        weight_decay=training_config['weight_decay']
    )
    
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)
    
    return device, model, optimizer, criterion, scheduler


def train_model(model, train_loader, val_loader, optimizer, criterion, device, 
                scheduler=None, epochs=20, clip_value=0.5, patience=3):
    best_val_loss = float('inf')
    no_improve = 0
    
    for epoch in range(epochs):
        start_time = time.time()
        model.train()
        train_loss = 0
        
       
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * inputs.size(0)
        
       
        model.eval()
        val_loss = 0
        correct = 0
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                correct += (preds == labels).sum().item()
                
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        
      
        train_loss /= len(train_loader.dataset)
        val_loss /= len(val_loader.dataset)
        val_acc = correct / len(val_loader.dataset)
        
   
        if scheduler:
            scheduler.step(val_loss)
        
       
        epoch_time = time.time() - start_time
        print(f'Epoch {epoch+1}/{epochs} | Time: {epoch_time:.2f}s')
        print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
        print(f'Val Accuracy: {val_acc:.4f}')
        print(classification_report(all_labels, all_preds, digits=4))
        print('-' * 60)
        
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            no_improve = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break


def evaluate_model(model, test_loader, device):
    model.load_state_dict(torch.load('best_model.pth'))
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    print("\nFinal Test Results:")
    print(classification_report(all_labels, all_preds, digits=4))
    return all_preds, all_labels


if __name__ == "__main__":
    train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
    val_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
    test_dataset = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))

  
    batch_size = 64
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    

    device, model, optimizer, criterion, scheduler = initialize_model(
        X_train=X_train,
        num_classes=len(np.unique(y_train)))
    
    train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        scheduler=scheduler,
        epochs=20,
        clip_value=0.5,
        patience=3
    )
    
    test_preds, test_labels = evaluate_model(model, test_loader, device)

Epoch 1/20 | Time: 7.06s
Train Loss: 0.7384 | Val Loss: 0.6366
Val Accuracy: 0.7417
              precision    recall  f1-score   support

           0     0.6162    0.6195    0.6179      3012
           1     0.4723    0.0918    0.1538      3713
           2     0.7776    0.9552    0.8573     13025

    accuracy                         0.7417     19750
   macro avg     0.6221    0.5555    0.5430     19750
weighted avg     0.6956    0.7417    0.6885     19750

------------------------------------------------------------
Epoch 2/20 | Time: 6.79s
Train Loss: 0.6164 | Val Loss: 0.6265
Val Accuracy: 0.7460
              precision    recall  f1-score   support

           0     0.6517    0.5963    0.6227      3012
           1     0.4687    0.1772    0.2572      3713
           2     0.7876    0.9427    0.8582     13025

    accuracy                         0.7460     19750
   macro avg     0.6360    0.5721    0.5794     19750
weighted avg     0.7069    0.7460    0.7093     19750

---------

  model.load_state_dict(torch.load('best_model.pth'))



Final Test Results:
              precision    recall  f1-score   support

           0     0.6360    0.6029    0.6190      2921
           1     0.4774    0.1735    0.2546      3780
           2     0.7881    0.9425    0.8584     13050

    accuracy                         0.7451     19751
   macro avg     0.6338    0.5730    0.5773     19751
weighted avg     0.7061    0.7451    0.7074     19751



In [None]:
class_counts = np.bincount(y_train)
class_weights = 1. / class_counts
class_weights = torch.tensor(class_weights, dtype=torch.float32)