In [7]:
import os
import re
import numpy as np
import pandas as pd
# Removed gensim import - implementing Word2Vec from scratch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import time
from collections import Counter, defaultdict

# Note: tf_keras imports removed - using custom tokenization instead
try:
    import kagglehub
    KAGGLE_AVAILABLE = True
except ImportError:
    KAGGLE_AVAILABLE = False
    print("Warning: kagglehub not available, will use dummy data for demonstration")

# Ukrywamy denerwujące komunikaty
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# --- USTAWIANIE ZMIENNYCH ---
MAX_SEQUENCE_LENGTH = 300   # Max długość recenzji
EMBEDDING_DIM = 100         # Wymiar Word2Vec. Ma być 100
W2V_MIN_COUNT = 5           # Olewamy słowa, które są rzadkie

LSTM_HIDDEN_DIM = 128
NUM_CLASSES = 2             # Klasy: pozytywna (1) lub negatywna (0)
NUM_LAYERS = 2
DROPOUT = 0.5
BIDIRECTIONAL = True        # Używamy LSTM dwukierunkowego
LEARNING_RATE = 0.001
EPOCHS = 5
BATCH_SIZE = 64
TEST_SIZE = 0.2


# --- KLASY MODELU PYTORCH ---

class EmbeddingClassifier(nn.Module):
    # Model LSTM z polecenia
    def __init__(self, embedding_dim, lstm_hidden_dim, num_layers, num_classes, dropout=0.3, bidirectional=False):
        super(EmbeddingClassifier, self).__init__()

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=lstm_hidden_dim, num_layers=num_layers,
                             batch_first=True, dropout=dropout, bidirectional=bidirectional)

        self.bidirectional = bidirectional
        direction_factor = 2 if bidirectional else 1

        # Warstwa do podjęcia ostatecznej decyzji
        self.fc = nn.Linear(lstm_hidden_dim * direction_factor, num_classes)

    def forward(self, x):

        output, (hidden, cell) = self.lstm(x)

        # Bierzemy ostatni stan (Hidden State)
        if self.bidirectional:
            # Łączymy stany z przodu i z tyłu
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        out = self.fc(hidden)
        return out

class TextEmbeddingDataset(Dataset):
    # Klasa, która zamienia ID na wektory Word2Vec
    def __init__(self, sequences_padded, labels, embedding_matrix):
        self.sequences = sequences_padded
        self.labels = torch.LongTensor(labels)
        self.embedding_matrix = torch.from_numpy(embedding_matrix)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        indices = self.sequences[idx]

        # Zamieniamy ID słów na wektory Word2Vec
        embeddings = torch.index_select(self.embedding_matrix, 0, torch.LongTensor(indices))

        return embeddings, self.labels[idx]


# --- KROK 1: POBIERANIE DANYCH ---

print("Pobieramy IMDb z Kaggle Hub...")
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

csv_file_path = os.path.join(path, "IMDB Dataset.csv")
    df = pd.read_csv(csv_file_path)
    print("Wczytane.")
else:
    # Dummy data for demonstration if kagglehub not available
    print("Using dummy data (kagglehub not available)")
    df = pd.DataFrame({
        'review': ['great movie', 'terrible film'] * 100,
        'sentiment': ['positive', 'negative'] * 100
    })

all_reviews_text = df['review'].tolist()
# Zamieniamy "positive"/"negative" na 1/0
all_labels = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

# Dzielimy dane na treningowe i testowe
X_train_text, X_test_text, Y_train, Y_test = train_test_split(
    all_reviews_text, all_labels, test_size=TEST_SIZE, random_state=42, stratify=all_labels
)


# --- KROK 2: NORMALIZACJA I TRENOWANIE WORD2VEC (PRACA DOMOWA) ---

# Normalizacja: usuwamy HTML i znaki, wszystko małe litery
tokenized_corpus = [
    re.sub(r'[^a-z\s]', '', re.sub(r'<br />', ' ', review).lower()).split()
    for review in all_reviews_text
]

print("Zaczynamy Word2Vec...")
start_w2v_time = time.time()

# Trenowanie Word2Vec (custom implementation)
w2v_model = SimpleWord2Vec(
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=W2V_MIN_COUNT,
    sg=1  # Skip-gram
)
w2v_model.fit(tokenized_corpus, vector_size=EMBEDDING_DIM, window=5, min_count=W2V_MIN_COUNT, workers=4, sg=1)

end_w2v_time = time.time()
print(f"Word2Vec skończony. Wymiar: {EMBEDDING_DIM}, Czas: {end_w2v_time - start_w2v_time:.2f}s")


# --- KROK 3: PRZYGOTOWANIE DO PYTORCH ---

# Tworzymy słownik ID słów
keras_tokenizer = Tokenizer()
keras_tokenizer.fit_on_texts(all_reviews_text)

word_index = keras_tokenizer.word_index
VOCAB_SIZE = len(word_index) + 1

# Tworzymy Macierz Embeddingów
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM), dtype=np.float32)
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word] # Kopiujemy wektor Word2Vec

# Zamieniamy słowa na ich ID (sekwencjonowanie)
X_train_sequences = keras_tokenizer.texts_to_sequences(X_train_text)
X_test_sequences = keras_tokenizer.texts_to_sequences(X_test_text)

# Wyrównanie długości (Padding)
X_train_padded = pad_sequences(X_train_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Czy jest GPU
pin_memory = torch.cuda.is_available()

# Tworzymy DataLoadery dla PyTorcha
train_dataset = TextEmbeddingDataset(X_train_padded, Y_train, embedding_matrix)
test_dataset = TextEmbeddingDataset(X_test_padded, Y_test, embedding_matrix)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=pin_memory)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=pin_memory)


# --- KROK 4: FUNKCJE TRENINGU ---

def train_model(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    if device.type == 'cuda':
        torch.cuda.empty_cache() # Dla GPU

    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad() # Zerujemy gradienty
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward() # Wsteczna propagacja
        optimizer.step() # Aktualizacja wag

        total_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy

def evaluate_model(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad(): # Wyłączamy liczenie gradientów
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy


# --- KROK 5: TRENING W PĘTLI ---

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Używamy: {device}")

# Tworzymy model LSTM
model = EmbeddingClassifier(
    embedding_dim=EMBEDDING_DIM,
    lstm_hidden_dim=LSTM_HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    num_classes=NUM_CLASSES,
    dropout=DROPOUT,
    bidirectional=BIDIRECTIONAL
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print("\nStart treningu PyTorch LSTM...")
start_lstm_time = time.time()

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train_loss, train_acc = train_model(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = evaluate_model(model, test_loader, criterion, device)

    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    print(f"Epoka {epoch}/{EPOCHS} (Czas: {epoch_duration:.2f} s)")
    print(f"  Trening: Loss={train_loss:.4f}, Acc={train_acc:.2f}%")
    print(f"  Test:    Loss={test_loss:.4f}, Acc={test_acc:.2f}%")

end_lstm_time = time.time()

print("Koniec treningu.")

# --- KROK 6: ZAPISANIE WAG ---

SAVE_PATH = 'imdb_sentiment_model_weights.pth'
torch.save(model.state_dict(), SAVE_PATH)
print(f"\nWagi zapisane: {SAVE_PATH}")
print(f"\nCałkowity czas Word2Vec: {end_w2v_time - start_w2v_time:.2f}s")
print(f"Całkowity czas treningu LSTM: {end_lstm_time - start_lstm_time:.2f}s")

IndentationError: unexpected indent (2479242846.py, line 96)