Langkah 1: Convert to Vector (TF-IDF, GloVe, Word2Vec)
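- Pada langkah ini, kita memproses teks dan mengubahnya menjadi representasi vektor. Dari tiga pilihan metode (TF-IDF, GloVe, Word2Vec), notebook ini memakai GloVe: teks ditokenisasi, diseragamkan panjangnya dengan padding, lalu setiap kata dipetakan ke vektor GloVe 100 dimensi yang sudah dilatih sebelumnya.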

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Read the pre-split train / validation / test TSV files. The files are read
# with header=None, so the two columns are named here.
train_data, valid_data, test_data = (
    pd.read_csv(tsv_name, sep='\t', header=None, names=['text', 'label'])
    for tsv_name in (
        'train_preprocess.tsv',
        'valid_preprocess.tsv',
        'test_preprocess.tsv',
    )
)

# The vocabulary is fitted on train + validation text only; the test split
# is encoded with that vocabulary but never used to build it.
combined_data = pd.concat([train_data, valid_data])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_data['text'])

# Encode every split as lists of integer word ids.
train_sequences, valid_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['text'])
    for frame in (train_data, valid_data, test_data)
)

# Bring every sequence to a fixed length of max_len ids, padding at the end.
# max_len can be adjusted to fit the dataset.
max_len = 100
train_padded, valid_padded, test_padded = (
    pad_sequences(id_lists, maxlen=max_len, padding='post')
    for id_lists in (train_sequences, valid_sequences, test_sequences)
)

# Load GloVe embeddings
def load_glove_embeddings(file_path, word_index, embedding_dim=100):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each non-blank line of the file is expected to hold a token followed by
    its vector components, separated by whitespace. The file is streamed:
    only vectors for words present in ``word_index`` are parsed and kept.

    Args:
        file_path: Path to the GloVe vectors file (read as UTF-8).
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of components expected per vector.

    Returns:
        A NumPy array of shape ``(len(word_index) + 1, embedding_dim)``.
        Rows for words that do not appear in the file stay all-zero.

    Raises:
        ValueError: If the vector found for a vocabulary word does not have
            exactly ``embedding_dim`` components.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    with open(file_path, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            word = values[0]
            row = word_index.get(word)
            if row is None:
                # Not in our vocabulary: skip without parsing the floats.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # The token itself contained whitespace, so the fields after
                # the first are not all numbers; such a line cannot be
                # matched reliably to a vocabulary word.
                continue
            if coefs.shape[0] != embedding_dim:
                # Fail loudly instead of letting NumPy broadcast a short
                # vector across the whole row.
                raise ValueError(
                    f'{file_path}:{line_number}: expected {embedding_dim} '
                    f'components for {word!r}, found {coefs.shape[0]}'
                )
            embedding_matrix[row] = coefs
    return embedding_matrix

# Vector width used for the embedding matrix; the GloVe file name below
# indicates 100-dimensional vectors.
embedding_dim = 100
embedding_matrix = load_glove_embeddings(
    file_path='glove.6B.100d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)


Langkah 2: Create Model
- Kita akan membuat model DNN menggunakan TensorFlow dan Keras.

In [2]:
# Baseline classifier: frozen pre-trained embeddings -> mean pooling -> MLP.
model = models.Sequential([
    # Declare the input shape explicitly so the model is built up front and
    # summary() can report shapes. This replaces the deprecated
    # `input_length` argument of Embedding.
    layers.Input(shape=(max_len,), dtype='int32'),
    # Load the pre-trained matrix through a Constant initializer rather than
    # the legacy `weights=` keyword, which not every Keras release accepts.
    # trainable=False keeps the vectors fixed during training.
    layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ),
    # NOTE(review): the pooling below averages over all max_len positions,
    # padding included. Consider mask_zero=True on the Embedding layer if
    # padded positions should be ignored -- confirm the effect on accuracy.
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    # Assuming 3 classes: positive, neutral, negative.
    layers.Dense(3, activation='softmax'),
])

# Labels are fed as integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model.summary()




Langkah 3: Model Evaluation
- Pada tahap ini, kita akan melatih model yang sudah dibuat dan melakukan evaluasi terhadap performa model tersebut.

In [3]:
# Encode the string labels as integer class ids (same mapping for every split).
label_mapping = {'positive': 0, 'neutral': 1, 'negative': 2}
train_labels, valid_labels, test_labels = (
    frame['label'].map(label_mapping).astype(int)
    for frame in (train_data, valid_data, test_data)
)

# Fit on the training split; the validation split is scored after each epoch.
history = model.fit(
    train_padded,
    train_labels,
    validation_data=(valid_padded, valid_labels),
    batch_size=32,
    epochs=10,
)

# Score the trained model on the held-out test split and report the result.
loss, accuracy = model.evaluate(test_padded, test_labels)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')


Epoch 1/10
344/344 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - accuracy: 0.5950 - loss: 0.8489 - val_accuracy: 0.7302 - val_loss: 0.6582
Epoch 2/10
344/344 ━━━━━━━━━━━━━━━━━━━━ 0s 793us/step - accuracy: 0.7189 - loss: 0.6668 - val_accuracy: 0.7667 - val_loss: 0.6120
Epoch 3/10
344/344 ━━━━━━━━━━━━━━━━━━━━ 0s 772us/step - accuracy: 0.7441 - loss: 0.6202 - val_accuracy: 0.7778 - val_loss: 0.5873
Epoch 4/10
344/344 ━━━━━━━━━━━━━━━━━━━━ 0s 766us/step - accuracy: 0.7531 - loss: 0.5999 - val_accuracy: 0.7778 - val_loss: 0.5756
Epoch 5/10
344/344 ━━━━━━━━━━━━━━━━━━━━ 0s 789us/step - accuracy: 0.7542 - loss: 0.5935 - val_accuracy: 0.7817 - val_loss: 0.5671
Epoch 6/10
344/344 ━━━━━━━━━━━━━━━━━━━━ 0s 787us/step - accuracy: 0.7572 - loss: 0.5908 - val_accuracy: 0.7841 - val_loss: 0.5581
Epoch 7/10
344 … (output terpotong)

Langkah 4: Hyperparameter Tuning
- Melakukan tuning terhadap hyperparameter untuk meningkatkan kinerja model.

In [5]:
from kerastuner.tuners import RandomSearch

def build_model(hp):
    """Build and compile one candidate classifier for the tuner.

    The architecture is frozen pre-trained embeddings, mean pooling, one
    hidden Dense layer and a 3-way softmax. Two hyperparameters are sampled
    from ``hp``: ``units`` (hidden layer width, 32..512 in steps of 32) and
    ``dropout`` (dropout rate, 0.0..0.5 in steps of 0.1).

    Args:
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        The compiled Keras model.
    """
    model = models.Sequential([
        # Declare the input shape explicitly; this replaces the deprecated
        # `input_length` argument of Embedding.
        layers.Input(shape=(max_len,), dtype='int32'),
        # Load the pre-trained matrix through a Constant initializer rather
        # than the legacy `weights=` keyword, which not every Keras release
        # accepts. trainable=False keeps the vectors fixed.
        layers.Embedding(
            input_dim=len(tokenizer.word_index) + 1,
            output_dim=embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False,
        ),
        layers.GlobalAveragePooling1D(),
        layers.Dense(
            hp.Int('units', min_value=32, max_value=512, step=32),
            activation='relu',
        ),
        layers.Dropout(hp.Float('dropout', 0.0, 0.5, step=0.1)),
        layers.Dense(3, activation='softmax'),
    ])
    # Labels are integer class ids, hence the sparse cross-entropy loss.
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Random search over the space declared in build_model, ranked by validation
# accuracy. max_trials and executions_per_trial set the search budget; tuner
# state is stored under my_dir/sentiment_analysis.
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='my_dir',
    project_name='sentiment_analysis',
)

# Run the search with the same data and fit settings as the baseline model.
tuner.search(
    train_padded,
    train_labels,
    validation_data=(valid_padded, valid_labels),
    batch_size=32,
    epochs=10,
)

# Keep the single best hyperparameter set found by the search.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]


Trial 5 Complete [00h 00m 11s]
val_accuracy: 0.7878307104110718

Best val_accuracy So Far: 0.7920634945233663
Total elapsed time: 00h 00m 54s


Langkah 5: Iterate
- Melakukan iterasi untuk meningkatkan performa model berdasarkan hasil evaluasi dan tuning hyperparameter.

In [None]:
# Build a fresh model from the best hyperparameters found by the tuner.
best_model = tuner.hypermodel.build(best_hps)

# Train it with the same data and fit settings used for the baseline.
best_model.fit(
    train_padded,
    train_labels,
    validation_data=(valid_padded, valid_labels),
    batch_size=32,
    epochs=10,
)

# Score the tuned model on the held-out test split and report the result.
best_loss, best_accuracy = best_model.evaluate(test_padded, test_labels)
print(f'Best Test Loss: {best_loss}')
print(f'Best Test Accuracy: {best_accuracy}')
