In [1]:
!pip install PySastrawi

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl.metadata (892 bytes)
Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0


In [2]:
# General Libraries
import pandas as pd
import numpy as np
import re
import multiprocessing
import time
import nltk
import torch
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
from torch.utils.data import DataLoader, TensorDataset
from torch import nn, optim

# Machine Learning Libraries
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# NLTK Downloads
nltk.download('punkt')
nltk.download('stopwords')

# === Define Device ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Load Dataset ===
data_path = "/kaggle/input/korlantas-polri-application-reviews/ulasan-korlantas.csv"
data = pd.read_csv(data_path)
print("\nDataset loaded successfully!\n")
print("Head of dataset before preprocessing:")
print(data.head(), "\n")  # Show dataset head before preprocessing
data.dropna(subset=['content'], inplace=True)

# === Sentiment Mapping ===
def rating_to_sentiment(rating):
    if rating <= 2:
        return 'negative'
    elif rating == 3:
        return 'neutral'
    else:
        return 'positive'

data['label'] = data['score'].apply(rating_to_sentiment)

# === Initialize Stemmer ===
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# === Stopwords Setup ===
try:
    stop_words = set(stopwords.words('indonesian'))
except OSError:
    print("Indonesian stopwords not found in NLTK, defaulting to English stopwords.")
    stop_words = set(stopwords.words('english'))

# === Preprocessing Function ===
def preprocess_text(text):
    try:
        text = re.sub(r'[^a-zA-Z\s]', '', text, re.I | re.A)  # Remove non-alphabet characters
        text = text.lower()  # Lowercase the text
        tokens = nltk.word_tokenize(text)  # Tokenize the text
        tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords

        # Apply stemming using multiprocessing
        with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
            tokens = pool.map(stemmer.stem, tokens)

        return ' '.join(tokens)
    except Exception as e:
        print(f"Error in preprocessing text: {text}\nException: {e}")
        return ""

# Apply preprocessing to the 'content' column
start_time = time.time()
data['cleaned_content'] = data['content'].apply(preprocess_text)
print(f"\nPreprocessing complete in {time.time() - start_time:.2f} seconds!\n")
print("Head of dataset after preprocessing:")
print(data.head(), "\n")  # Show dataset head after preprocessing

# === Word2Vec Embedding ===
sentences = [nltk.word_tokenize(row) for row in data['cleaned_content'] if row]
w2v_model = Word2Vec(sentences, vector_size=300, window=5, min_count=5, workers=multiprocessing.cpu_count())
w2v_model.train(sentences, total_examples=len(sentences), epochs=10)
print("\nWord2Vec model trained successfully!\n")

def text_to_vector(text):
    tokens = nltk.word_tokenize(text)
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)

# Convert text to vectors
data['vectors'] = data['cleaned_content'].apply(text_to_vector)
print("\nWord2Vec embedding complete!\n")

# === Split Dataset ===
X = np.array(data['vectors'].tolist())
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# === SMOTE Oversampling ===
print("\nData before SMOTE oversampling:")
print(f"X_train shape: {X_train.shape}, y_train distribution:\n{y_train.value_counts()}\n")

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nSMOTE complete!\n")
print("Data after SMOTE oversampling:")
print(f"X_train_smote shape: {X_train_smote.shape}, y_train_smote distribution:\n{y_train_smote.value_counts()}\n")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Dataset loaded successfully!

Head of dataset before preprocessing:
        userName  score                   at  \
0   arum diajeng      3  2023-12-29 11:12:17   
1    Iga Dwiyana      5  2023-12-29 09:56:34   
2    Abdul Fatah      1  2023-12-29 09:44:19   
3  Erdo Prasetya      5  2023-12-29 08:21:18   
4    ilham yusuf      4  2023-12-29 08:04:46   

                                             content  
0  saya kemarin udah tes psi, tes kesehatan dan b...  
1              Cepat sekali 3hari sudah diterima sim  
2  Ini gimana ,kok sulit sekali untuk verifikasi ...  
3  Mantap sih dr test sampai diterima simnya hany...  
4  Untuk pembuatan sim baru secara online belum b...   


Preprocessing complete in 1805.04 seconds!

Head of dataset after pre

In [3]:
# === Machine Learning Models ===
def train_ml_models(X_train, y_train, X_test, y_test):
    models = {
        'SVM': SVC(kernel='linear', probability=True, random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'LightGBM': lgb.LGBMClassifier(random_state=42)
    }

    results = {}

    for model_name, model in models.items():
        print(f"\nTraining {model_name}...\n")
        # Train the model
        model.fit(X_train, y_train)

        # Train evaluation
        y_train_pred = model.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_loss = None
        if hasattr(model, 'predict_proba'):
            y_train_pred_proba = model.predict_proba(X_train)
            train_loss = log_loss(y_train, y_train_pred_proba, labels=['negative', 'neutral', 'positive'])

        # Validation (Test) evaluation
        y_test_pred = model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_loss = None
        if hasattr(model, 'predict_proba'):
            y_test_pred_proba = model.predict_proba(X_test)
            test_loss = log_loss(y_test, y_test_pred_proba, labels=['negative', 'neutral', 'positive'])

        print(f"Train Accuracy for {model_name}: {train_accuracy:.4f}")
        if train_loss is not None:
            print(f"Train Log Loss for {model_name}: {train_loss:.4f}")
        else:
            print(f"Train Log Loss for {model_name}: Not available (no predict_proba method)")

        print(f"Validation Accuracy for {model_name}: {test_accuracy:.4f}")
        if test_loss is not None:
            print(f"Validation Log Loss for {model_name}: {test_loss:.4f}")
        else:
            print(f"Validation Log Loss for {model_name}: Not available (no predict_proba method)")

        # Additional metrics (confusion matrix and classification report)
        cm = confusion_matrix(y_test, y_test_pred, labels=['negative', 'neutral', 'positive'])
        print(f"\nConfusion Matrix for {model_name}:\n")
        print(cm)

        report = classification_report(y_test, y_test_pred, target_names=['negative', 'neutral', 'positive'])
        print(f"\nClassification Report for {model_name}:\n")
        print(report)

        # Save results
        results[model_name] = {
            'train_accuracy': train_accuracy,
            'train_loss': train_loss,
            'validation_accuracy': test_accuracy,
            'validation_loss': test_loss,
            'confusion_matrix': cm,
            'classification_report': report
        }

    return results

print("\n=== Training Machine Learning Models ===\n")
ml_results = train_ml_models(X_train_smote, y_train_smote, X_test, y_test)



=== Training Machine Learning Models ===


Training SVM...

Train Accuracy for SVM: 0.6653
Train Log Loss for SVM: 0.7676
Validation Accuracy for SVM: 0.6625
Validation Log Loss for SVM: 0.7438

Confusion Matrix for SVM:

[[579 295  65]
 [ 65 126  33]
 [ 57 160 620]]

Classification Report for SVM:

              precision    recall  f1-score   support

    negative       0.83      0.62      0.71       939
     neutral       0.22      0.56      0.31       224
    positive       0.86      0.74      0.80       837

    accuracy                           0.66      2000
   macro avg       0.64      0.64      0.61      2000
weighted avg       0.77      0.66      0.70      2000


Training Random Forest...

Train Accuracy for Random Forest: 0.9996
Train Log Loss for Random Forest: 0.1436
Validation Accuracy for Random Forest: 0.7280
Validation Log Loss for Random Forest: 0.6930

Confusion Matrix for Random Forest:

[[757 100  82]
 [133  39  52]
 [118  59 660]]

Classification Report for Rand

In [4]:
# === Unified Hyperparameter Configuration ===
hyperparams = {
    "input_size": 300,        # Input vector size
    "hidden_size": 256,       # Hidden size for LSTM
    "num_filters": 256,       # Number of filters for CNN
    "kernel_sizes": [3],      # Kernel sizes for CNN
    "hidden_units": 256,      # Fully connected layer units for CNN
    "output_size": len(data['label'].unique()),  # Number of output classes
    "dropout": 0.3,           # Dropout rate
    "learning_rate": 1e-3,    # Learning rate
    "batch_size": 64,         # Batch size for DataLoader
    "num_epochs": 50,         # Number of epochs for training
    "num_layers": 3           # Number of layers for LSTM
}

# === Convert Data to Tensors ===
X_train_tensor = torch.tensor(X_train_smote, dtype=torch.float32)
y_train_tensor = torch.tensor(pd.Categorical(y_train_smote).codes, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(pd.Categorical(y_test).codes, dtype=torch.long).to(device)

# === DataLoader ===
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=hyperparams["batch_size"], shuffle=True)

val_dataset = TensorDataset(X_test_tensor, y_test_tensor)
val_loader = DataLoader(val_dataset, batch_size=hyperparams["batch_size"], shuffle=False)


In [5]:
# === Define LSTM Model ===
class SentimentLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout):
        super(SentimentLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add sequence dimension
        lstm_out, (h_n, _) = self.lstm(x)
        output = self.fc(self.dropout(h_n[-1]))
        return output

# === Train LSTM Model ===
def train_model(model, train_loader, val_loader, optimizer, loss_fn, scheduler, num_epochs):
    for epoch in range(num_epochs):
        # === Training Phase ===
        model.train()
        total_loss, correct = 0, 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = loss_fn(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            correct += (outputs.argmax(1) == y_batch).sum().item()

        train_loss = total_loss / len(train_loader)
        train_acc = correct / len(train_loader.dataset)

        # Step the scheduler after each epoch
        scheduler.step()

        # === Validation Phase ===
        model.eval()
        val_loss, val_correct = 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = loss_fn(outputs, y_batch)
                val_loss += loss.item()
                val_correct += (outputs.argmax(1) == y_batch).sum().item()

        val_loss /= len(val_loader)
        val_acc = val_correct / len(val_loader.dataset)

        # Print Epoch Results
        print(f"Epoch {epoch + 1}/{num_epochs}:")
        print(f"    Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
        print(f"    Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

# === Evaluate LSTM Model ===
def evaluate_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        predictions = outputs.argmax(1)
        acc = (predictions == y_test).float().mean().item()
        cm = confusion_matrix(y_test.cpu(), predictions.cpu())
        cr = classification_report(y_test.cpu(), predictions.cpu(), target_names=['negative', 'neutral', 'positive'])
    return acc, cm, cr

# Initialize LSTM Model
lstm_model = SentimentLSTM(
    input_size=hyperparams["input_size"],
    hidden_size=hyperparams["hidden_size"],
    output_size=hyperparams["output_size"],
    num_layers=hyperparams["num_layers"],
    dropout=hyperparams["dropout"]
).to(device)
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=hyperparams["learning_rate"])
lstm_loss_fn = nn.CrossEntropyLoss()
lstm_scheduler = optim.lr_scheduler.StepLR(lstm_optimizer, step_size=10, gamma=0.1)

# Train LSTM
print("\n=== Training LSTM Model ===\n")
train_model(lstm_model, train_loader, val_loader, lstm_optimizer, lstm_loss_fn, lstm_scheduler, hyperparams["num_epochs"])

# Evaluate LSTM
print("\n=== Evaluating LSTM Model ===\n")
accuracy, confusion_mat, class_report = evaluate_model(lstm_model, X_test_tensor, y_test_tensor)
print(f"LSTM Test Accuracy: {accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_mat)
print("\nClassification Report:")
print(class_report)


# === Define CNN Model ===
class SentimentCNN(nn.Module):
    def __init__(self, input_size, num_filters, kernel_sizes, hidden_units, output_size, dropout):
        super(SentimentCNN, self).__init__()
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=input_size,
                      out_channels=num_filters,
                      kernel_size=k,
                      padding=k // 2)  # Add padding to maintain dimensions
            for k in kernel_sizes
        ])
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(num_filters * len(kernel_sizes), hidden_units)
        self.dropout1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        x = x.unsqueeze(1).permute(0, 2, 1)  # Reshape for CNN
        convs_out = []
        for conv in self.convs:
            conv_out = torch.relu(conv(x))
            pooled_out = self.global_pool(conv_out).squeeze(2)
            convs_out.append(pooled_out)

        concatenated = torch.cat(convs_out, dim=1)
        x = self.dropout1(torch.relu(self.fc1(concatenated)))
        output = self.fc2(x)
        return output

# === Train CNN Model ===
def train_cnn_model(model, train_loader, val_loader, optimizer, loss_fn, num_epochs):
    for epoch in range(num_epochs):
        # === Training Phase ===
        model.train()
        running_loss, correct_predictions = 0.0, 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            correct_predictions += (outputs.argmax(1) == labels).sum().item()

        train_loss = running_loss / len(train_loader.dataset)
        train_acc = correct_predictions / len(train_loader.dataset)

        # === Validation Phase ===
        model.eval()
        val_running_loss, val_correct_predictions = 0.0, 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = loss_fn(outputs, labels)

                val_running_loss += loss.item() * inputs.size(0)
                val_correct_predictions += (outputs.argmax(1) == labels).sum().item()

        val_loss = val_running_loss / len(val_loader.dataset)
        val_acc = val_correct_predictions / len(val_loader.dataset)

        # Print Epoch Results
        print(f"Epoch {epoch + 1}/{num_epochs}:")
        print(f"    Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
        print(f"    Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

# === Evaluate CNN Model ===
def evaluate_cnn(model, X_test_tensor, y_test_tensor):
    model.eval()
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        acc = (predicted == y_test_tensor).sum().item() / len(y_test_tensor)
        cm = confusion_matrix(y_test_tensor.cpu(), predicted.cpu())
        cr = classification_report(y_test_tensor.cpu(), predicted.cpu(), target_names=['negative', 'neutral', 'positive'])
    return acc, cm, cr

# Initialize CNN Model
cnn_model = SentimentCNN(
    input_size=hyperparams["input_size"],
    num_filters=hyperparams["num_filters"],
    kernel_sizes=hyperparams["kernel_sizes"],
    hidden_units=hyperparams["hidden_units"],
    output_size=hyperparams["output_size"],
    dropout=hyperparams["dropout"]
).to(device)
cnn_optimizer = optim.Adam(cnn_model.parameters(), lr=hyperparams["learning_rate"])
cnn_loss_fn = nn.CrossEntropyLoss()

# Train CNN
print("\n=== Training CNN Model ===\n")
train_cnn_model(cnn_model, train_loader, val_loader, cnn_optimizer, cnn_loss_fn, hyperparams["num_epochs"])

# Evaluate CNN
print("\n=== Evaluating CNN Model ===\n")
cnn_acc, cnn_cm, cnn_cr = evaluate_cnn(cnn_model, X_test_tensor, y_test_tensor)
print(f"CNN Test Accuracy: {cnn_acc:.4f}")
print("Confusion Matrix:")
print(cnn_cm)
print("\nClassification Report:")
print(cnn_cr)



=== Training LSTM Model ===

Epoch 1/50:
    Train Loss: 0.8473, Train Accuracy: 0.5982
    Val Loss: 0.7582, Val Accuracy: 0.6145
Epoch 2/50:
    Train Loss: 0.7833, Train Accuracy: 0.6387
    Val Loss: 0.7546, Val Accuracy: 0.6140
Epoch 3/50:
    Train Loss: 0.7726, Train Accuracy: 0.6451
    Val Loss: 0.7773, Val Accuracy: 0.5990
Epoch 4/50:
    Train Loss: 0.7628, Train Accuracy: 0.6514
    Val Loss: 0.6929, Val Accuracy: 0.6770
Epoch 5/50:
    Train Loss: 0.7537, Train Accuracy: 0.6544
    Val Loss: 0.7069, Val Accuracy: 0.6610
Epoch 6/50:
    Train Loss: 0.7491, Train Accuracy: 0.6597
    Val Loss: 0.7185, Val Accuracy: 0.6530
Epoch 7/50:
    Train Loss: 0.7369, Train Accuracy: 0.6636
    Val Loss: 0.7039, Val Accuracy: 0.6655
Epoch 8/50:
    Train Loss: 0.7293, Train Accuracy: 0.6668
    Val Loss: 0.7306, Val Accuracy: 0.6425
Epoch 9/50:
    Train Loss: 0.7231, Train Accuracy: 0.6766
    Val Loss: 0.6830, Val Accuracy: 0.6885
Epoch 10/50:
    Train Loss: 0.7238, Train Accuracy: