In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import json

In [2]:
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)  
    return pd.DataFrame(data)

df = load_data('/kaggle/input/kinopoisk-embeddings/embeddings.json')

In [3]:
df.head()

Unnamed: 0,filename,label,embedding
0,1000083-0.txt,negative,"[0.0019, 0.0049, 0.0015, 0.0018, 0.0018, 0.001..."
1,1000083-1.txt,negative,"[0.001, 0.0007, 0.0008, 0.0006, 0.0012, 0.0007..."
2,1000125-3.txt,negative,"[0.001, 0.0024, 0.0013, 0.0013, 0.0012, 0.0015..."
3,1000125-4.txt,negative,"[0.0, 0.0033, 0.0014, 0.0014, 0.0007, 0.0016, ..."
4,1000125-6.txt,negative,"[0.0004, 0.0026, 0.0004, 0.0016, 0.0036, 0.003..."


In [4]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

In [5]:
X = np.array(df['embedding'].tolist())
y = df['label_encoded'].values

In [6]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [9]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
print("Logistic Regression Test Results:")
print(classification_report(y_test, lr.predict(X_test)))

Logistic Regression Test Results:
              precision    recall  f1-score   support

           0       0.71      0.20      0.31      2921
           1       0.66      0.01      0.02      3780
           2       0.69      1.00      0.81     13050

    accuracy                           0.69     19751
   macro avg       0.69      0.40      0.38     19751
weighted avg       0.69      0.69      0.59     19751



In [10]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
print("Decision Tree Test Results:")
print(classification_report(y_test, dt.predict(X_test)))

Decision Tree Test Results:
              precision    recall  f1-score   support

           0       0.30      0.29      0.29      2921
           1       0.24      0.24      0.24      3780
           2       0.73      0.73      0.73     13050

    accuracy                           0.57     19751
   macro avg       0.42      0.42      0.42     19751
weighted avg       0.57      0.57      0.57     19751



In [None]:
X = np.array(df['embedding'].tolist(), dtype=np.float32)  # Add dtype=np.float32
y = df['label_encoded'].values

In [15]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [16]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
val_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
test_dataset = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))

In [23]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )
    
    def forward(self, x):
        return self.layers(x)

In [24]:
input_size = X_train.shape[1]
num_classes = len(le.classes_)
batch_size = 32
epochs = 20
learning_rate = 0.001

In [25]:
model = NeuralNetwork(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [19]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [26]:
for epoch in range(epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
    
    val_acc = correct / len(val_dataset)
    print(f"Epoch {epoch+1}/{epochs} | Val Accuracy: {val_acc:.4f}")

Epoch 1/20 | Val Accuracy: 0.7364
Epoch 2/20 | Val Accuracy: 0.7393
Epoch 3/20 | Val Accuracy: 0.7426
Epoch 4/20 | Val Accuracy: 0.7482
Epoch 5/20 | Val Accuracy: 0.7504
Epoch 6/20 | Val Accuracy: 0.7506
Epoch 7/20 | Val Accuracy: 0.7485
Epoch 8/20 | Val Accuracy: 0.7471
Epoch 9/20 | Val Accuracy: 0.7476
Epoch 10/20 | Val Accuracy: 0.7469
Epoch 11/20 | Val Accuracy: 0.7445
Epoch 12/20 | Val Accuracy: 0.7427
Epoch 13/20 | Val Accuracy: 0.7444
Epoch 14/20 | Val Accuracy: 0.7397
Epoch 15/20 | Val Accuracy: 0.7419
Epoch 16/20 | Val Accuracy: 0.7394
Epoch 17/20 | Val Accuracy: 0.7388
Epoch 18/20 | Val Accuracy: 0.7366
Epoch 19/20 | Val Accuracy: 0.7367
Epoch 20/20 | Val Accuracy: 0.7333


In [27]:
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        preds = outputs.argmax(dim=1)
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

print("\nNeural Network Test Results:")
print(classification_report(all_labels, all_preds))


Neural Network Test Results:
              precision    recall  f1-score   support

           0       0.61      0.59      0.60      2921
           1       0.41      0.23      0.30      3780
           2       0.81      0.91      0.86     13050

    accuracy                           0.73     19751
   macro avg       0.61      0.58      0.58     19751
weighted avg       0.70      0.73      0.71     19751



In [11]:
print("\nModel Comparison:")
print(f"Logistic Regression Accuracy: {lr.score(X_test, y_test):.4f}")
print(f"Decision Tree Accuracy: {dt.score(X_test, y_test):.4f}")


Model Comparison:
Logistic Regression Accuracy: 0.6888
Decision Tree Accuracy: 0.5690


In [None]:
print("\nNeural Network Test Results:")
print(classification_report(all_labels, all_preds))
print(f"Neural Network Accuracy: {(np.array(all_preds) == np.array(all_labels)).mean():.4f}")  # Add this line


Neural Network Test Results:
              precision    recall  f1-score   support

           0       0.61      0.59      0.60      2921
           1       0.41      0.23      0.30      3780
           2       0.81      0.91      0.86     13050

    accuracy                           0.73     19751
   macro avg       0.61      0.58      0.58     19751
weighted avg       0.70      0.73      0.71     19751

Neural Network Accuracy: 0.7341
