In [1]:
print(2)

2


In [21]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import os

In [22]:
data = pd.read_csv('resume.csv')

# Text preprocessing
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

data['isi'] = data['isi'].apply(clean_text)
data['Hasil Ringkasan'] = data['Hasil Ringkasan'].apply(clean_text)

In [23]:
data = data.dropna(subset=['isi', 'Hasil Ringkasan'])

# Define maximum lengths
max_article_length = 200
max_summary_length = 50

In [24]:
class Vocab:
    def __init__(self, texts, max_size=10000):
        self.word2idx = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
        self.idx2word = {0: '<pad>', 1: '<sos>', 2: '<eos>', 3: '<unk>'}
        self.build_vocab(texts, max_size)

    def build_vocab(self, texts, max_size):
        word_counts = Counter()
        for text in texts:
            word_counts.update(text.split())
        for word, _ in word_counts.most_common(max_size - 4):
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def __len__(self):
        return len(self.word2idx)

In [25]:
texts = data['isi'].tolist() + data['Hasil Ringkasan'].tolist()
vocab = Vocab(texts)

In [26]:
def tokenize(text, vocab, max_length):
    tokens = [vocab.word2idx.get(word, vocab.word2idx['<unk>']) for word in text.split()]
    tokens = [vocab.word2idx['<sos>']] + tokens + [vocab.word2idx['<eos>']]
    if len(tokens) < max_length:
        tokens += [vocab.word2idx['<pad>']] * (max_length - len(tokens))
    else:
        tokens = tokens[:max_length]
    return tokens

data['article_tokens'] = data['isi'].apply(lambda x: tokenize(x, vocab, max_article_length))
data['summary_tokens'] = data['Hasil Ringkasan'].apply(lambda x: tokenize(x, vocab, max_summary_length))

In [27]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [28]:
class SummarizationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        article = self.data.iloc[idx]['article_tokens']
        summary = self.data.iloc[idx]['summary_tokens']
        return torch.tensor(article), torch.tensor(summary)

In [29]:
train_dataset = SummarizationDataset(train_data)
val_dataset = SummarizationDataset(val_data)

In [30]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [31]:
train_dataset

<__main__.SummarizationDataset at 0x7e32bbe5e530>

In [32]:
def load_glove_embeddings(file_path, vocab, embedding_dim):
    embeddings = np.zeros((len(vocab), embedding_dim))
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in vocab.word2idx:
                idx = vocab.word2idx[word]
                embeddings[idx] = np.array(vector, dtype=np.float32)
    return torch.tensor(embeddings, dtype=torch.float32)

embedding_dim = 200
glove_embeddings = load_glove_embeddings('/content/vectors_200d.txt', vocab, embedding_dim)

In [33]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, num_layers=1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_size, embedding_dim, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output)
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.shape[0]
        target_len = target.shape[1]
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(self.device)

        hidden, cell = self.encoder(source)

        input = target[:, 0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(input.unsqueeze(1), hidden, cell)
            outputs[:, t] = output.squeeze(1)
            teacher_force = np.random.rand() < teacher_forcing_ratio
            top1 = output.argmax(2)
            input = target[:, t] if teacher_force else top1.squeeze(1)

        return outputs

In [34]:
input_size = len(vocab)
hidden_size = 256
output_size = len(vocab)

encoder = Encoder(input_size, embedding_dim, hidden_size)
decoder = Decoder(output_size, embedding_dim, hidden_size)
model = Seq2Seq(encoder, decoder, torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
model = model.to(model.device)

In [35]:
model.encoder.embedding.weight.data.copy_(glove_embeddings)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=0.00001)

In [36]:
def train(model, dataloader, criterion, optimizer):
    model.train()
    epoch_loss = 0
    for source, target in dataloader:
        source, target = source.to(model.device), target.to(model.device)

        optimizer.zero_grad()

        output = model(source, target)

        output = output[:, 1:].contiguous().view(-1, output.shape[-1])
        target = target[:, 1:].contiguous().view(-1)

        loss = criterion(output, target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

# Evaluation Loop
def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for source, target in dataloader:
            source, target = source.to(model.device), target.to(model.device)

            output = model(source, target, teacher_forcing_ratio=0)

            output = output[:, 1:].contiguous().view(-1, output.shape[-1])
            target = target[:, 1:].contiguous().view(-1)

            loss = criterion(output, target)

            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

In [38]:
num_epochs = 1000
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, criterion, optimizer)
    val_loss = evaluate(model, val_loader, criterion)
    print(f'Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

Epoch: 1, Train Loss: 9.1993, Val Loss: 9.2071
Epoch: 2, Train Loss: 9.1995, Val Loss: 9.2067
Epoch: 3, Train Loss: 9.1977, Val Loss: 9.2064
Epoch: 4, Train Loss: 9.1965, Val Loss: 9.2060
Epoch: 5, Train Loss: 9.1967, Val Loss: 9.2056
Epoch: 6, Train Loss: 9.1971, Val Loss: 9.2053
Epoch: 7, Train Loss: 9.1958, Val Loss: 9.2097
Epoch: 8, Train Loss: 9.1954, Val Loss: 9.2096
Epoch: 9, Train Loss: 9.1938, Val Loss: 9.2094
Epoch: 10, Train Loss: 9.1939, Val Loss: 9.2063
Epoch: 11, Train Loss: 9.1935, Val Loss: 9.2064
Epoch: 12, Train Loss: 9.1915, Val Loss: 9.2027
Epoch: 13, Train Loss: 9.1907, Val Loss: 9.2027
Epoch: 14, Train Loss: 9.1902, Val Loss: 9.2020
Epoch: 15, Train Loss: 9.1915, Val Loss: 9.2031
Epoch: 16, Train Loss: 9.1911, Val Loss: 9.2002
Epoch: 17, Train Loss: 9.1893, Val Loss: 9.1985
Epoch: 18, Train Loss: 9.1902, Val Loss: 9.1993
Epoch: 19, Train Loss: 9.1884, Val Loss: 9.2000
Epoch: 20, Train Loss: 9.1870, Val Loss: 9.1999
Epoch: 21, Train Loss: 9.1850, Val Loss: 9.1994
E

KeyboardInterrupt: 

# TESTING THE GENERATION SUMMARIZATION

In [39]:
import torch
import torch.nn.functional as F

def generate_summary(
    model,
    article,
    vocab,
    max_summary_length,
    temperature=1.0,
    top_k=5,
    top_p=0.9,
    repetition_penalty=1.2
):
    model.eval()
    with torch.no_grad():
        # Prepare the input article
        article = torch.tensor(article).unsqueeze(0).to(model.device)
        hidden, cell = model.encoder(article)

        # Initialize the decoding process
        input = torch.tensor([vocab.word2idx['<sos>']]).unsqueeze(0).to(model.device)
        summary = []
        token_counts = {}  # Track token frequencies to penalize repetitions

        for step in range(max_summary_length):
            # Decoder forward pass
            output, hidden, cell = model.decoder(input, hidden, cell)

            # Extract logits for the current token
            logits = output[0, 0, :] / temperature  # Scale logits with temperature
            probabilities = F.softmax(logits, dim=-1)

            # Apply repetition penalty
            for token_idx in summary:
                probabilities[token_idx] /= repetition_penalty

            # Top-p sampling (nucleus sampling)
            sorted_probs, sorted_indices = torch.sort(probabilities, descending=True)
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
            cutoff_idx = (cumulative_probs > top_p).nonzero(as_tuple=True)[0][0]
            top_p_probs = sorted_probs[:cutoff_idx + 1]
            top_p_indices = sorted_indices[:cutoff_idx + 1]
            top_p_probs = top_p_probs / top_p_probs.sum()  # Renormalize
            predicted_word_index = torch.multinomial(top_p_probs, 1).item()

            # Debugging: Log probabilities and predictions
            print(f"Step {step}:")
            print(f"  Probabilities (top 10): {sorted_probs[:10]}")
            print(f"  Predicted Word Index: {predicted_word_index} ({vocab.idx2word[predicted_word_index]})")

            # Append the predicted token to the summary
            summary.append(predicted_word_index)
            token_counts[predicted_word_index] = token_counts.get(predicted_word_index, 0) + 1

            # Update input for the next step
            input = torch.tensor([predicted_word_index]).unsqueeze(0).to(model.device)

            # Stop decoding if <eos> token is predicted
            if predicted_word_index == vocab.word2idx['<eos>']:
                break

        # Convert word indices back to words
        summary = [vocab.idx2word[idx] for idx in summary]
        return ' '.join(summary)

# Example inference
article = "kamu ini kok gitu?"
article_tokens = tokenize(article, vocab, max_article_length)

# Generate summary
summary = generate_summary(
    model,
    article_tokens,
    vocab,
    max_summary_length=50,
    temperature=0.7,
    top_k=5,
    top_p=0.9,
    repetition_penalty=1.2
)
print(f"Summary: {summary}")


Step 0:
  Probabilities (top 10): tensor([0.0455, 0.0398, 0.0381, 0.0295, 0.0283, 0.0277, 0.0275, 0.0244, 0.0187,
        0.0139], device='cuda:0')
  Predicted Word Index: 3 (<unk>)
Step 1:
  Probabilities (top 10): tensor([0.0525, 0.0410, 0.0384, 0.0336, 0.0282, 0.0279, 0.0254, 0.0252, 0.0233,
        0.0180], device='cuda:0')
  Predicted Word Index: 17 (dapat)
Step 2:
  Probabilities (top 10): tensor([0.0744, 0.0487, 0.0383, 0.0337, 0.0274, 0.0268, 0.0233, 0.0217, 0.0211,
        0.0208], device='cuda:0')
  Predicted Word Index: 344 (mencari)
Step 3:
  Probabilities (top 10): tensor([0.0994, 0.0459, 0.0456, 0.0300, 0.0273, 0.0243, 0.0242, 0.0205, 0.0191,
        0.0190], device='cuda:0')
  Predicted Word Index: 21 (secara)
Step 4:
  Probabilities (top 10): tensor([0.1117, 0.0530, 0.0437, 0.0359, 0.0273, 0.0244, 0.0229, 0.0208, 0.0204,
        0.0170], device='cuda:0')
  Predicted Word Index: 98 (sebuah)
Step 5:
  Probabilities (top 10): tensor([0.1046, 0.0566, 0.0380, 0.0367, 0.0295,