# **Proyek Analisis Big Data - Fesmaro 2025**

# **Modeling**

- Make model
- Visualisasi model (torchviz) opsional kalo ada yang lain 
- Training (CrossEntropy, Accuracy, Validation Accuracy)
- Visualisasi Training
- Evaluation (pakai classification report, ROC AUC)
- Confusion Matrix
- Save Model
- Inference Model (dalam dashboard)

## BERT-LSTM-GCN (Ensemble Method)

In [1]:
import pandas as pd

# Load the training, validation and test splits from ./data in one pass.
final_df, df_val, df_test = map(
    pd.read_csv,
    ('./data/final_df.csv', './data/df_val.csv', './data/df_test.csv'),
)

In [2]:
# The training frame stores its text under 'lemmatized_text'; give it the
# same 'text' column name that the other splits are indexed with below.
final_df.rename(columns={'lemmatized_text': 'text'}, inplace=True)

# Keep only the two columns used downstream, in the same order for every split.
final_df, df_val, df_test = (
    frame[['label', 'text']] for frame in (final_df, df_val, df_test)
)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformers import BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
import spacy
import os
import pickle

# Graph Convolutional Network (GCN) building block
class GraphConvolution(nn.Module):
    """Single graph-convolution layer: ``adj @ (input @ weight) + bias``.

    Args:
        in_features: Size of each input node feature vector.
        out_features: Size of each output node feature vector.
        bias: When True (default) a learnable bias of size ``out_features``
            is added to every node; otherwise ``self.bias`` is ``None``.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Storage is allocated uninitialised; reset_parameters() fills it.
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-uniform initialise the weight and zero the bias (if any)."""
        nn.init.xavier_uniform_(self.weight)
        if self.bias is None:
            return
        nn.init.zeros_(self.bias)

    def forward(self, input, adj):
        """Apply the layer.

        Args:
            input: Node features, ``[batch_size, seq_len, in_features]``.
            adj: Adjacency matrices, ``[batch_size, seq_len, seq_len]``.

        Returns:
            Tensor of shape ``[batch_size, seq_len, out_features]``.
        """
        # Replicate the shared weight (as a view) once per batch element so
        # the projection can be done with a batched matmul.
        batched_weight = self.weight.unsqueeze(0).expand(input.size(0), -1, -1)
        projected = torch.bmm(input, batched_weight)

        # Aggregate the projected features along the graph edges.
        aggregated = torch.bmm(adj, projected)

        return aggregated if self.bias is None else aggregated + self.bias

# Fusion model: BERT + BiLSTM + GCN
class BERTBiLSTMGCNModel(nn.Module):
    """Classifier that fuses three views of a text.

    1. BERT token embeddings -> BiLSTM -> attention-weighted pooling.
    2. BERT token embeddings -> stacked graph convolutions -> max pooling.
    3. A small vector of hand-crafted linguistic features.

    Each view is projected to ``hidden_dim``, the three projections are
    concatenated, fused by a linear layer and classified.

    Args:
        bert_model_name: Name or path passed to ``BertModel.from_pretrained``.
        hidden_dim: Width of the LSTM (per direction), GCN and fusion layers.
        gcn_layers: Number of graph-convolution layers. At least one layer is
            always created.
        lstm_layers: Number of stacked BiLSTM layers.
        dropout: Dropout probability after fusion (and between LSTM layers
            when ``lstm_layers > 1``).
        num_classes: Number of output logits.
        linguistic_feat_dim: Length of the linguistic feature vector.
    """

    def __init__(self, bert_model_name='bert-base-uncased', hidden_dim=128, gcn_layers=2,
                 lstm_layers=1, dropout=0.2, num_classes=2, linguistic_feat_dim=9):
        super().__init__()

        # Pre-trained BERT encoder.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.bert_dim = self.bert.config.hidden_size

        # Optional: freeze BERT to make training cheaper.
        # for param in self.bert.parameters():
        #     param.requires_grad = False

        # BiLSTM over the BERT token embeddings. nn.LSTM only applies dropout
        # between stacked layers, so it is disabled for a single layer.
        self.lstm = nn.LSTM(
            input_size=self.bert_dim,
            hidden_size=hidden_dim,
            num_layers=lstm_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if lstm_layers > 1 else 0
        )

        # GCN stack: the first layer maps bert_dim -> hidden_dim, the rest
        # keep hidden_dim.
        self.gcn_layers = nn.ModuleList()
        self.gcn_layers.append(GraphConvolution(self.bert_dim, hidden_dim))
        for _ in range(1, gcn_layers):
            self.gcn_layers.append(GraphConvolution(hidden_dim, hidden_dim))

        # Scores each BiLSTM time step for attention pooling
        # (BiLSTM output width is hidden_dim * 2).
        self.attention = nn.Linear(hidden_dim * 2, 1)

        # Per-branch projections to a common width.
        self.bilstm_classifier = nn.Linear(hidden_dim * 2, hidden_dim)
        self.gcn_classifier = nn.Linear(hidden_dim, hidden_dim)
        self.linguistic_proj = nn.Linear(linguistic_feat_dim, hidden_dim)

        # Fusion layer and final classifier.
        self.fusion = nn.Linear(hidden_dim * 3, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_classes)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, adj_matrix, linguistic_features):
        """Compute class logits.

        Args:
            input_ids: ``[batch_size, seq_len]`` BERT token ids.
            attention_mask: ``[batch_size, seq_len]`` with 1 for real tokens
                and 0 for padding. May be ``None``, in which case no
                position is treated as padding during pooling.
            adj_matrix: ``[batch_size, seq_len, seq_len]`` adjacency matrices.
                NOTE(review): rows/columns are consumed as if they line up
                with BERT token positions - confirm the caller builds them
                that way.
            linguistic_features: ``[batch_size, linguistic_feat_dim]``.

        Returns:
            Logits of shape ``[batch_size, num_classes]``.
        """
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_outputs.last_hidden_state  # [batch_size, seq_len, bert_dim]

        # True where a position is padding; broadcastable over the feature axis.
        if attention_mask is not None:
            pad_positions = (attention_mask == 0).unsqueeze(-1)  # [batch_size, seq_len, 1]
        else:
            pad_positions = None

        # --- BiLSTM branch -------------------------------------------------
        lstm_output, _ = self.lstm(sequence_output)  # [batch_size, seq_len, hidden_dim*2]

        attention_scores = self.attention(lstm_output)  # [batch_size, seq_len, 1]
        if pad_positions is not None:
            # Padding must not receive attention weight. A large finite
            # negative (rather than -inf) keeps softmax NaN-free even if a
            # row were fully masked.
            attention_scores = attention_scores.masked_fill(
                pad_positions, torch.finfo(attention_scores.dtype).min
            )
        attention_weights = F.softmax(attention_scores, dim=1)
        lstm_pooled = torch.sum(attention_weights * lstm_output, dim=1)  # [batch_size, hidden_dim*2]

        # --- GCN branch ----------------------------------------------------
        gcn_output = sequence_output
        for gcn_layer in self.gcn_layers:
            gcn_output = F.leaky_relu(gcn_layer(gcn_output, adj_matrix))

        if pad_positions is not None:
            # Exclude padded positions from the max so they cannot win it.
            gcn_output = gcn_output.masked_fill(
                pad_positions, torch.finfo(gcn_output.dtype).min
            )
        gcn_pooled = torch.max(gcn_output, dim=1)[0]  # [batch_size, hidden_dim]

        # --- Linguistic branch ---------------------------------------------
        ling_features = F.leaky_relu(self.linguistic_proj(linguistic_features))  # [batch_size, hidden_dim]

        # Project every branch to the same width.
        bilstm_features = F.leaky_relu(self.bilstm_classifier(lstm_pooled))  # [batch_size, hidden_dim]
        gcn_features = F.leaky_relu(self.gcn_classifier(gcn_pooled))  # [batch_size, hidden_dim]

        # Fuse all three branches.
        combined = torch.cat([bilstm_features, gcn_features, ling_features], dim=1)  # [batch_size, hidden_dim*3]
        fused = F.leaky_relu(self.fusion(combined))  # [batch_size, hidden_dim]
        fused = self.dropout(fused)

        return self.classifier(fused)  # [batch_size, num_classes]

# Feature extraction
class FeatureExtractor:
    """Turn raw review text into the inputs the fusion model consumes.

    Produces BERT token ids / attention masks, a normalised dependency-graph
    adjacency matrix (via spaCy) and a 9-dimensional vector of hand-crafted
    linguistic features.

    Args:
        bert_model_name: Tokenizer name or path for
            ``BertTokenizer.from_pretrained``.
        max_length: Token budget; BERT inputs are padded/truncated to it and
            adjacency matrices are ``max_length x max_length``.
        cache_dir: Optional directory where dependency graphs are cached as
            pickle files. Created if missing.
    """

    # Small sentiment lexicons. Frozen sets are built once and give O(1)
    # membership tests.
    _POSITIVE_WORDS = frozenset([
        'good', 'great', 'excellent', 'amazing', 'wonderful', 'best', 'love',
        'perfect', 'recommend', 'happy', 'awesome',
    ])
    _NEGATIVE_WORDS = frozenset([
        'bad', 'poor', 'terrible', 'horrible', 'worst', 'waste', 'disappointed',
        'disappointing', 'difficult', 'hate', 'problem', 'issue', 'fail',
    ])

    def __init__(self, bert_model_name='bert-base-uncased', max_length=128, cache_dir=None):
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.nlp = spacy.load("en_core_web_sm")
        self.max_length = max_length
        self.cache_dir = cache_dir

        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

    @staticmethod
    def _as_text(text):
        """Coerce a cell value to ``str``; ``None`` and NaN become ``''``."""
        # NaN is the only float that is not equal to itself; pandas uses it
        # for empty CSV cells.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        return str(text)

    def prepare_bert_inputs(self, texts, batch=True):
        """Tokenise ``texts`` for BERT.

        Args:
            texts: A single string or any iterable of texts (list, pandas
                Series, ...).
            batch: Unused; kept for backward compatibility.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` tensors of
            shape ``[num_texts, max_length]``.
        """
        # The tokenizer expects a str or a list of str, so materialise any
        # other iterable and clean missing values.
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = [self._as_text(item) for item in texts]

        encodings = self.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask'],
        }

    def build_dependency_graph(self, text, use_cache=True):
        """Build a symmetrically normalised dependency adjacency matrix.

        Nodes are the first ``max_length`` spaCy tokens of ``text[:1000]``.
        Every token is linked to its syntactic head (undirected), self-loops
        are added for all ``max_length`` positions, and the result is
        normalised as ``D^-1/2 A D^-1/2``.

        Args:
            text: Input text; ``None``/NaN is treated as an empty string.
            use_cache: Read/write the on-disk cache when ``cache_dir`` is set.

        Returns:
            Float tensor of shape ``[max_length, max_length]``.
        """
        text = self._as_text(text)

        cache_file = None
        if self.cache_dir and use_cache:
            import hashlib
            # max_length is part of the key because it fixes the matrix
            # size; a graph cached under another max_length must not be
            # reused.
            cache_key = f"{self.max_length}:{text}".encode('utf-8')
            text_hash = hashlib.md5(cache_key).hexdigest()
            cache_file = os.path.join(self.cache_dir, f"dep_graph_{text_hash}.pkl")

            if os.path.exists(cache_file):
                # NOTE: pickle executes arbitrary code on load - only point
                # cache_dir at a directory this process controls.
                with open(cache_file, 'rb') as f:
                    return pickle.load(f)

        # Parse with spaCy (text is truncated for efficiency).
        doc = self.nlp(text[:1000])
        n = min(len(doc), self.max_length)

        adjacency_matrix = np.zeros((self.max_length, self.max_length))

        # Undirected edge between each token and its head, when both fit in
        # the max_length budget.
        for token in doc:
            if token.i < n and token.head.i < n:
                adjacency_matrix[token.i, token.head.i] = 1
                adjacency_matrix[token.head.i, token.i] = 1

        # Self-loops.
        adjacency_matrix = adjacency_matrix + np.eye(self.max_length)

        # Symmetric normalisation D^-1/2 A D^-1/2, done by row/column scaling
        # instead of materialising the diagonal matrices.
        rowsum = adjacency_matrix.sum(1)
        d_inv_sqrt = np.power(rowsum, -0.5).flatten()
        d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
        normalized_adj = adjacency_matrix * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]

        result = torch.FloatTensor(normalized_adj)

        if cache_file is not None:
            with open(cache_file, 'wb') as f:
                pickle.dump(result, f)

        return result

    def extract_linguistic_features(self, text):
        """Compute nine normalised surface/sentiment features for ``text``.

        Order of the returned vector: text length, word count, average word
        length, exclamation marks, question marks, uppercase-word ratio,
        positive-lexicon hits, negative-lexicon hits, positive/negative ratio.
        Every feature is clipped and scaled into ``[0, 1]``.

        Args:
            text: Input text; ``None``/NaN is treated as an empty string.

        Returns:
            Float tensor of shape ``[9]``.
        """
        text = self._as_text(text)
        words = text.split()
        lowered_words = text.lower().split()

        # Text statistics.
        text_length = min(len(text), 1000) / 1000
        word_count = min(len(words), 200) / 200
        avg_word_length = min(np.mean([len(word) for word in words]) if words else 0, 20) / 20

        # Emphasis indicators.
        exclamation_count = min(text.count('!'), 10) / 10
        question_count = min(text.count('?'), 10) / 10
        uppercase_words = min(sum(1 for word in words if word.isupper() and len(word) > 1), 20)
        # Share of (capped) all-caps words among all words; +1 avoids a
        # division by zero. The raw count is used here, not a rescaled one.
        uppercase_ratio = min(uppercase_words / (len(words) + 1), 1.0)

        # Sentiment lexicon hits (capped at 20 each).
        positive_hits = min(sum(1 for word in lowered_words if word in self._POSITIVE_WORDS), 20)
        negative_hits = min(sum(1 for word in lowered_words if word in self._NEGATIVE_WORDS), 20)
        # Add-one smoothed positive/negative ratio, clipped to 10 then scaled.
        sentiment_word_ratio = min((positive_hits + 1) / (negative_hits + 1), 10) / 10

        return torch.FloatTensor([
            text_length,
            word_count,
            avg_word_length,
            exclamation_count,
            question_count,
            uppercase_ratio,
            positive_hits / 20,
            negative_hits / 20,
            sentiment_word_ratio
        ])

    def batch_extract_features(self, texts):
        """Extract all model inputs for a collection of texts.

        Args:
            texts: Iterable of texts (list, pandas Series, ...).

        Returns:
            Dict with ``'input_ids'``, ``'attention_mask'``,
            ``'adj_matrices'`` (``[N, max_length, max_length]``) and
            ``'linguistic_features'`` (``[N, 9]``).
        """
        # Materialise once so every extractor sees the same, cleaned sequence.
        texts = [self._as_text(item) for item in texts]

        bert_inputs = self.prepare_bert_inputs(texts)
        adj_matrices = torch.stack([self.build_dependency_graph(text) for text in texts])
        linguistic_features = torch.stack(
            [self.extract_linguistic_features(text) for text in texts]
        )

        return {
            'input_ids': bert_inputs['input_ids'],
            'attention_mask': bert_inputs['attention_mask'],
            'adj_matrices': adj_matrices,
            'linguistic_features': linguistic_features
        }

# Dataset backed by a FeatureExtractor
class AmazonReviewDataset(Dataset):
    """Torch dataset that yields model-ready features for each review.

    Args:
        reviews: Sequence of review texts (list, pandas Series, ...).
        labels: Sequence of integer class labels, aligned with ``reviews``.
        feature_extractor: Object providing ``prepare_bert_inputs``,
            ``build_dependency_graph``, ``extract_linguistic_features`` and
            ``batch_extract_features``.
        precompute: When True, extract features for all reviews up front
            instead of lazily in ``__getitem__``.
    """

    def __init__(self, reviews, labels, feature_extractor, precompute=False):
        # Materialise to lists so ``[idx]`` is always positional. On a pandas
        # Series ``[idx]`` is label-based and breaks for any index that is not
        # 0..n-1 (e.g. after filtering or a train/validation split).
        self.reviews = list(reviews)
        self.labels = list(labels)
        self.feature_extractor = feature_extractor
        self.precomputed_features = None

        if precompute:
            print("Precomputing features...")
            self.precomputed_features = self.feature_extractor.batch_extract_features(self.reviews)

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask``,
        ``adj_matrix``, ``linguistic_features`` and ``label`` for item ``idx``.
        """
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        if self.precomputed_features is not None:
            cached = self.precomputed_features
            return {
                'input_ids': cached['input_ids'][idx],
                'attention_mask': cached['attention_mask'][idx],
                'adj_matrix': cached['adj_matrices'][idx],
                'linguistic_features': cached['linguistic_features'][idx],
                'label': label
            }

        text = self.reviews[idx]

        # Extract features for this single text on demand.
        bert_inputs = self.feature_extractor.prepare_bert_inputs([text], batch=False)
        adj_matrix = self.feature_extractor.build_dependency_graph(text)
        linguistic_features = self.feature_extractor.extract_linguistic_features(text)

        return {
            # Drop the leading batch dimension of size 1 added by the tokenizer.
            'input_ids': bert_inputs['input_ids'].squeeze(0),
            'attention_mask': bert_inputs['attention_mask'].squeeze(0),
            'adj_matrix': adj_matrix,
            'linguistic_features': linguistic_features,
            'label': label
        }

# Training and evaluation routines
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            adj_matrix, linguistic_features)`` and returning logits.
        dataloader: Iterable of dict batches with keys ``'input_ids'``,
            ``'attention_mask'``, ``'adj_matrix'``, ``'linguistic_features'``
            and ``'label'``.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)`` where the loss is averaged over batches and
        the accuracy over samples.
    """
    batch_keys = ('input_ids', 'attention_mask', 'adj_matrix',
                  'linguistic_features', 'label')

    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch in dataloader:
        # Move every tensor of the batch to the target device.
        input_ids, attention_mask, adj_matrix, linguistic_features, labels = (
            batch[key].to(device) for key in batch_keys
        )

        # Forward pass.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, adj_matrix, linguistic_features)
        loss = criterion(logits, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Running metrics.
        running_loss += loss.item()
        predicted = torch.max(logits, 1)[1]
        n_seen += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    accuracy = n_correct / n_seen
    return running_loss / len(dataloader), accuracy

def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating any parameters.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            adj_matrix, linguistic_features)`` and returning logits.
        dataloader: Iterable of dict batches with keys ``'input_ids'``,
            ``'attention_mask'``, ``'adj_matrix'``, ``'linguistic_features'``
            and ``'label'``.
        criterion: Loss taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_loss, accuracy)`` where the loss is averaged over batches and
        the accuracy over samples.
    """
    batch_keys = ('input_ids', 'attention_mask', 'adj_matrix',
                  'linguistic_features', 'label')

    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, adj_matrix, linguistic_features, labels = (
                batch[key].to(device) for key in batch_keys
            )

            logits = model(input_ids, attention_mask, adj_matrix, linguistic_features)
            loss = criterion(logits, labels)

            running_loss += loss.item()
            predicted = torch.max(logits, 1)[1]
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    accuracy = n_correct / n_seen
    return running_loss / len(dataloader), accuracy

# Main training entry point
def train_model(train_reviews, train_labels, val_reviews, val_labels,
                epochs=5, batch_size=16, learning_rate=2e-5, cache_dir='./feature_cache',
                save_path='best_model.pth'):
    """Train a ``BERTBiLSTMGCNModel`` and checkpoint the best epoch.

    Args:
        train_reviews: Training texts (list, pandas Series, ...).
        train_labels: Integer labels aligned with ``train_reviews``.
        val_reviews: Validation texts.
        val_labels: Integer labels aligned with ``val_reviews``.
        epochs: Number of passes over the training data.
        batch_size: Batch size for both dataloaders.
        learning_rate: AdamW learning rate.
        cache_dir: Directory handed to ``FeatureExtractor`` for caching.
        save_path: File the ``state_dict`` is written to whenever validation
            accuracy improves.

    Returns:
        The model as it is after the final epoch. This is not necessarily the
        best one; the best weights are in ``save_path``.
    """
    # Materialise the inputs so the datasets index them by position. A pandas
    # Series is indexed by label and would fail on a non-default index.
    train_reviews, train_labels = list(train_reviews), list(train_labels)
    val_reviews, val_labels = list(val_reviews), list(val_labels)

    # Feature extractor shared by both datasets (and their cache).
    feature_extractor = FeatureExtractor(cache_dir=cache_dir)

    # Datasets.
    train_dataset = AmazonReviewDataset(train_reviews, train_labels, feature_extractor)
    val_dataset = AmazonReviewDataset(val_reviews, val_labels, feature_extractor)

    # Dataloaders; only the training data is shuffled.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    # Model on GPU when available.
    # device = torch_directml.device()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BERTBiLSTMGCNModel().to(device)

    # Optimizer and loss.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_acc = 0

    for epoch in range(epochs):
        train_loss, train_acc = train_epoch(model, train_dataloader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, val_dataloader, criterion, device)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        # Checkpoint whenever validation accuracy improves.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), save_path)
            print("Saved best model!")

    return model

# Chunked preprocessing for large datasets
def process_large_dataset(reviews, labels, batch_size=10000, cache_dir='./feature_cache'):
    """Extract features chunk by chunk and save each chunk to disk.

    Processing in chunks of ``batch_size`` keeps only one chunk's features in
    memory at a time. Chunk ``k`` is written to ``{cache_dir}/batch_{k}.pt``
    as a dict with ``input_ids``, ``attention_mask``, ``adj_matrices``,
    ``linguistic_features`` and ``labels``.

    Args:
        reviews: Texts to process (list, pandas Series, ...).
        labels: Integer labels aligned with ``reviews``.
        batch_size: Number of reviews per saved chunk.
        cache_dir: Output directory, also used as the extractor's cache.
    """
    feature_extractor = FeatureExtractor(cache_dir=cache_dir)

    # Materialise so slices are plain lists. A slice of a pandas Series keeps
    # its original labels, which torch.tensor cannot index positionally.
    reviews = list(reviews)
    labels = list(labels)

    # Ceiling division, so an exact multiple of batch_size is not reported
    # as one extra batch.
    total_batches = (len(reviews) + batch_size - 1) // batch_size

    for batch_no, start in enumerate(range(0, len(reviews), batch_size)):
        stop = min(start + batch_size, len(reviews))
        batch_reviews = reviews[start:stop]
        batch_labels = labels[start:stop]

        print(f"Processing batch {batch_no + 1}/{total_batches}")

        # Extract (and, inside the extractor, cache) the features.
        batch_features = feature_extractor.batch_extract_features(batch_reviews)

        # Persist this chunk.
        torch.save({
            'input_ids': batch_features['input_ids'],
            'attention_mask': batch_features['attention_mask'],
            'adj_matrices': batch_features['adj_matrices'],
            'linguistic_features': batch_features['linguistic_features'],
            'labels': torch.tensor(batch_labels, dtype=torch.long)
        }, f"{cache_dir}/batch_{batch_no}.pt")

    print("Preprocessing complete!")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Inputs and targets for the training and validation splits
# (label convention: 1 = positive, 0 = negative).
x_train, y_train = final_df['text'], final_df['label']
x_val, y_val = df_val['text'], df_val['label']

# Train the fusion model for five epochs.
model = train_model(x_train, y_train, x_val, y_val, epochs=5)

# For large datasets, pre-extract the features in chunks instead:
# process_large_dataset(all_reviews, all_labels)

print("Training complete!")



KeyboardInterrupt: 

## BERT-LSTM-CNN (Ensemble Method)

## BERT

## LSTM

## CNN

## GCN