## Proyek Analisis Sentimen Dicoding

In [48]:
import pandas as pd

In [49]:
df = pd.read_csv("./dataset/duolingo_review.csv")
df.drop(columns=["userName", "userImage", "replyContent", "repliedAt", "appVersion", "reviewCreatedVersion"], axis=1, inplace=True)
df.head()

Unnamed: 0,reviewId,review,score,thumbsUpCount,at
0,b5691293-970e-422e-a708-208e4cf5744e,Baguss banget Aplikasinya ✨️🔥 sangat membantu ...,5,6,2025-04-23 14:23:11
1,558f02c7-4fa5-4086-b70d-4513b23bc989,"Duolingo memang baguss polll, aku suka banget....",3,36,2025-04-05 12:42:35
2,29e68c3d-cd29-404d-b63b-21cbb0bf06f8,Aplikasi keren saya banyak belajar lewat aplik...,5,43,2025-04-22 05:46:12
3,9803c43e-346d-4ecd-bd50-3d794cb3fc4c,"Untuk Developer, Tolong dong saat kita melakuk...",4,5,2025-04-22 00:24:08
4,a0caa63f-f6e5-49e7-a843-e996e22c4b25,"Suka bgtt sma Duolingo ini, belajar jdi lebih ...",5,18,2025-04-22 19:08:05


### Data Preprocessing

In [50]:
# Clean Dataset
clean_df = df.dropna()

# Drop Duplicate
clean_df = clean_df.drop_duplicates()
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   reviewId       18000 non-null  object
 1   review         18000 non-null  object
 2   score          18000 non-null  int64 
 3   thumbsUpCount  18000 non-null  int64 
 4   at             18000 non-null  object
dtypes: int64(2), object(3)
memory usage: 703.3+ KB


### Text Preprocessing

In [51]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [52]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka
    text = re.sub(r'(.)\1+', r'\1', text)  # menggantikan huruf yang berulang menjadi satu huruf
    
    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text
 
def casefoldingText(text): # Mengubah semua karakter dalam teks menjadi huruf kecil
    text = text.lower()
    return text
 
def tokenizingText(text): # Memecah atau membagi string, teks menjadi daftar token
    text = word_tokenize(text)
    return text
 
def filteringText(text): # Menghapus stopwords dalam teks
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text
 
def stemmingText(text): # Mengurangi kata ke bentuk dasarnya yang menghilangkan imbuhan awalan dan akhiran atau ke akar kata
    # Membuat objek stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
 
    # Memecah teks menjadi daftar kata
    words = text.split()
 
    # Menerapkan stemming pada setiap kata dalam daftar
    stemmed_words = [stemmer.stem(word) for word in words]
 
    # Menggabungkan kata-kata yang telah distem
    stemmed_text = ' '.join(stemmed_words)
 
    return stemmed_text
 
def toSentence(list_words): # Mengubah daftar kata menjadi kalimat
    sentence = ' '.join(word for word in list_words)
    return sentence

slangwords = {"@": "di", "abis": "habis", "wtb": "beli", "masi": "masih", "wts": "jual", "wtt": "tukar", "bgt": "banget", "maks": "maksimal"}
def fix_slangwords(text):
    words = text.split()
    fixed_words = []
 
    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)
 
    fixed_text = ' '.join(fixed_words)
    return fixed_text


In [53]:
# Membersihkan teks dan menyimpannya di kolom 'text_clean'
clean_df['text_clean'] = clean_df['review'].apply(cleaningText)
 
# Mengubah huruf dalam teks menjadi huruf kecil dan menyimpannya di 'text_casefoldingText'
clean_df['text_casefoldingText'] = clean_df['text_clean'].apply(casefoldingText)
 
# Mengganti kata-kata slang dengan kata-kata standar dan menyimpannya di 'text_slangwords'
clean_df['text_slangwords'] = clean_df['text_casefoldingText'].apply(fix_slangwords)
 
# Memecah teks menjadi token (kata-kata) dan menyimpannya di 'text_tokenizingText'
clean_df['text_tokenizingText'] = clean_df['text_slangwords'].apply(tokenizingText)
 
# Menghapus kata-kata stop (kata-kata umum) dan menyimpannya di 'text_stopword'
clean_df['text_stopword'] = clean_df['text_tokenizingText'].apply(filteringText)
 
# Menggabungkan token-token menjadi kalimat dan menyimpannya di 'text_akhir'
clean_df['text_akhir'] = clean_df['text_stopword'].apply(toSentence)

In [54]:
clean_df.head()

Unnamed: 0,reviewId,review,score,thumbsUpCount,at,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir
0,b5691293-970e-422e-a708-208e4cf5744e,Baguss banget Aplikasinya ✨️🔥 sangat membantu ...,5,6,2025-04-23 14:23:11,Bagus banget Aplikasinya sangat membantu dalam...,bagus banget aplikasinya sangat membantu dalam...,bagus banget aplikasinya sangat membantu dalam...,"[bagus, banget, aplikasinya, sangat, membantu,...","[bagus, banget, aplikasinya, membantu, mengasa...",bagus banget aplikasinya membantu mengasah ski...
1,558f02c7-4fa5-4086-b70d-4513b23bc989,"Duolingo memang baguss polll, aku suka banget....",3,36,2025-04-05 12:42:35,Duolingo memang bagus pol aku suka banget yang...,duolingo memang bagus pol aku suka banget yang...,duolingo memang bagus pol aku suka banget yang...,"[duolingo, memang, bagus, pol, aku, suka, bang...","[duolingo, bagus, pol, suka, banget, akhir², i...",duolingo bagus pol suka banget akhir² iklanya ...
2,29e68c3d-cd29-404d-b63b-21cbb0bf06f8,Aplikasi keren saya banyak belajar lewat aplik...,5,43,2025-04-22 05:46:12,Aplikasi keren saya banyak belajar lewat aplik...,aplikasi keren saya banyak belajar lewat aplik...,aplikasi keren saya banyak belajar lewat aplik...,"[aplikasi, keren, saya, banyak, belajar, lewat...","[aplikasi, keren, belajar, aplikasi, kekuranga...",aplikasi keren belajar aplikasi kekuranganya s...
3,9803c43e-346d-4ecd-bd50-3d794cb3fc4c,"Untuk Developer, Tolong dong saat kita melakuk...",4,5,2025-04-22 00:24:08,Untuk Developer Tolong dong sat kita melakukan...,untuk developer tolong dong sat kita melakukan...,untuk developer tolong dong sat kita melakukan...,"[untuk, developer, tolong, dong, sat, kita, me...","[developer, tolong, sat, kesalahan, kekurangan...",developer tolong sat kesalahan kekurangan huru...
4,a0caa63f-f6e5-49e7-a843-e996e22c4b25,"Suka bgtt sma Duolingo ini, belajar jdi lebih ...",5,18,2025-04-22 19:08:05,Suka bgt sma Duolingo ini belajar jdi lebih se...,suka bgt sma duolingo ini belajar jdi lebih se...,suka banget sma duolingo ini belajar jdi lebih...,"[suka, banget, sma, duolingo, ini, belajar, jd...","[suka, banget, sma, duolingo, belajar, jdi, se...",suka banget sma duolingo belajar jdi seru muda...


### Labeling

In [55]:
import csv
import requests
from io import StringIO

In [56]:
# Membaca data kamus kata-kata positif dari GitHub
lexicon_positive = dict()
 
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub
 
if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma
 
    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_positive[row[0]] = int(row[1])
        # Menambahkan kata-kata positif dan skornya ke dalam kamus lexicon_positive
else:
    print("Failed to fetch positive lexicon data")
 
# Membaca data kamus kata-kata negatif dari GitHub
lexicon_negative = dict()
 
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub
 
if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma
 
    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_negative[row[0]] = int(row[1])
        # Menambahkan kata-kata negatif dan skornya dalam kamus lexicon_negative
else:
    print("Failed to fetch negative lexicon data")

In [57]:
import nltk
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rizfi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [58]:
def sentiment_analysis_lexicon_indonesia(text):
    #for word in text:
 
    score = 0
    # Inisialisasi skor sentimen ke 0
 
    for word in text:
        # Mengulangi setiap kata dalam teks
 
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]
            # Jika kata ada dalam kamus positif, tambahkan skornya ke skor sentimen
 
    for word in text:
        # Mengulangi setiap kata dalam teks (sekali lagi)
 
        if (word in lexicon_negative):
            score = score + lexicon_negative[word]
            # Jika kata ada dalam kamus negatif, kurangkan skornya dari skor sentimen
 
    polarity=''
    # Inisialisasi variabel polaritas
 
    if (score > 0):
        polarity = 'positive'
        # Jika skor sentimen lebih besar atau sama dengan 0, maka polaritas adalah positif
    elif (score < 0):
        polarity = 'negative'
        # Jika skor sentimen kurang dari 0, maka polaritas adalah negatif
    else:
        polarity = 'neutral'
 
    return score, polarity

In [59]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def sentiment_analysis_lexicon_vader(text):
    scores = sia.polarity_scores(text)
    
    # Extract individual scores
    neg_score = scores["neg"]
    neu_score = scores["neu"]
    pos_score = scores["pos"]
    compound = scores["compound"]
    
    # Determine polarity based on the highest score
    if neu_score > neg_score and neu_score > pos_score:
        polarity = "neutral"
    elif pos_score > neg_score:
        polarity = "positive"
    else:
        polarity = "negative"
    
    return compound, polarity, scores

In [60]:
results = clean_df['text_stopword'].apply(sentiment_analysis_lexicon_indonesia)
# results = clean_df["text_akhir"].apply(sentiment_analysis_lexicon_vader)
results = list(zip(*results))
clean_df['polarity_score'] = results[0]
clean_df['polarity'] = results[1]
clean_df["polarity"].value_counts()

polarity
negative    10057
positive     6670
neutral      1273
Name: count, dtype: int64

In [61]:
clean_df.head()

Unnamed: 0,reviewId,review,score,thumbsUpCount,at,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir,polarity_score,polarity
0,b5691293-970e-422e-a708-208e4cf5744e,Baguss banget Aplikasinya ✨️🔥 sangat membantu ...,5,6,2025-04-23 14:23:11,Bagus banget Aplikasinya sangat membantu dalam...,bagus banget aplikasinya sangat membantu dalam...,bagus banget aplikasinya sangat membantu dalam...,"[bagus, banget, aplikasinya, sangat, membantu,...","[bagus, banget, aplikasinya, membantu, mengasa...",bagus banget aplikasinya membantu mengasah ski...,6,positive
1,558f02c7-4fa5-4086-b70d-4513b23bc989,"Duolingo memang baguss polll, aku suka banget....",3,36,2025-04-05 12:42:35,Duolingo memang bagus pol aku suka banget yang...,duolingo memang bagus pol aku suka banget yang...,duolingo memang bagus pol aku suka banget yang...,"[duolingo, memang, bagus, pol, aku, suka, bang...","[duolingo, bagus, pol, suka, banget, akhir², i...",duolingo bagus pol suka banget akhir² iklanya ...,-2,negative
2,29e68c3d-cd29-404d-b63b-21cbb0bf06f8,Aplikasi keren saya banyak belajar lewat aplik...,5,43,2025-04-22 05:46:12,Aplikasi keren saya banyak belajar lewat aplik...,aplikasi keren saya banyak belajar lewat aplik...,aplikasi keren saya banyak belajar lewat aplik...,"[aplikasi, keren, saya, banyak, belajar, lewat...","[aplikasi, keren, belajar, aplikasi, kekuranga...",aplikasi keren belajar aplikasi kekuranganya s...,7,positive
3,9803c43e-346d-4ecd-bd50-3d794cb3fc4c,"Untuk Developer, Tolong dong saat kita melakuk...",4,5,2025-04-22 00:24:08,Untuk Developer Tolong dong sat kita melakukan...,untuk developer tolong dong sat kita melakukan...,untuk developer tolong dong sat kita melakukan...,"[untuk, developer, tolong, dong, sat, kita, me...","[developer, tolong, sat, kesalahan, kekurangan...",developer tolong sat kesalahan kekurangan huru...,-26,negative
4,a0caa63f-f6e5-49e7-a843-e996e22c4b25,"Suka bgtt sma Duolingo ini, belajar jdi lebih ...",5,18,2025-04-22 19:08:05,Suka bgt sma Duolingo ini belajar jdi lebih se...,suka bgt sma duolingo ini belajar jdi lebih se...,suka banget sma duolingo ini belajar jdi lebih...,"[suka, banget, sma, duolingo, ini, belajar, jd...","[suka, banget, sma, duolingo, belajar, jdi, se...",suka banget sma duolingo belajar jdi seru muda...,12,positive


In [62]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

x = clean_df["text_akhir"]
y = clean_df["polarity"]
encoded_label = label_encoder.fit_transform(y)


x_train, x_test, y_train, y_test = train_test_split(x, encoded_label, test_size=0.2, random_state=42)

#### TFIDF

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_x_train = tfidf.fit_transform(x_train)
tfidf_x_test = tfidf.transform(x_test)

#### Word2Vec

In [64]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import nltk
import numpy as np

nltk.download("punkt")

X_train_processed = [simple_preprocess(text) for text in x_train]
X_test_processed = [simple_preprocess(text) for text in x_test]

w2v_model = Word2Vec(sentences=X_train_processed, vector_size=100, window=5, min_count=1, workers=4)

def get_document_vector(doc, model):
    word_vectors = [model.wv[word] for word in doc if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)
    
x_train_vec = np.array([get_document_vector(doc, w2v_model) for doc in X_train_processed])
x_test_vec = np.array([get_document_vector(doc, w2v_model) for doc in X_test_processed])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rizfi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Bag Of Words

In [65]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

bow_x_train = vectorizer.fit_transform(x_train)
bow_x_test = vectorizer.transform(x_test)

### Modelling

In [66]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB

In [67]:
hdd

NameError: name 'hdd' is not defined

#### Naive Bayes

In [None]:
# Naive Bayes Using TFIDF
naive_bayes_tfidf = BernoulliNB()

naive_bayes_tfidf.fit(tfidf_x_train, y_train)

y_pred_train_nb = naive_bayes_tfidf.predict(tfidf_x_train)
y_pred_test_nb = naive_bayes_tfidf.predict(tfidf_x_test)

accuracy_train_nb = accuracy_score(y_pred_train_nb, y_train)
accuracy_test_nb = accuracy_score(y_pred_test_nb, y_test)
 
print('Naive Bayes (TFIDF) - accuracy_train:', accuracy_train_nb)
print('Naive Bayes (TFIDF) - accuracy_test:', accuracy_test_nb)


Naive Bayes (TFIDF) - accuracy_train: 0.8264583333333333
Naive Bayes (TFIDF) - accuracy_test: 0.735


#### Logistic Regression

In [68]:
from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression()

logistic_regression.fit(x_train_vec, y_train)

y_pred_train_lr = logistic_regression.predict(x_train_vec)
y_pred_test_lr = logistic_regression.predict(x_test_vec)

accuracy_train_lr = accuracy_score(y_pred_train_lr, y_train)
 
accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)
 
print('Logistic Regression - accuracy_train:', accuracy_train_lr)
print('Logistic Regression - accuracy_test:', accuracy_test_lr)

Logistic Regression - accuracy_train: 0.7273611111111111
Logistic Regression - accuracy_test: 0.7266666666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### RNN

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, SimpleRNN, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam


In [None]:
max_words = 5000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

x_train_pad = pad_sequences(x_train_seq, maxlen=max_len, padding="post")
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len, padding="post")

In [None]:
embedding_dim = 128

# model = Sequential([
#     Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
#     SimpleRNN(128, return_sequences=True),
#     Dropout(0.5),
#     SimpleRNN(64, return_sequences=False),
#     BatchNormalization(),
#     Dropout(0.5),
#     Dense(64, activation='relu'),
#     Dense(3, activation='softmax')
# ])

model = Sequential([
    # Embedding with slightly larger dimensions for richer word representation
    Embedding(input_dim=max_words, output_dim=200, input_length=max_len),
    
    # First RNN with increased units and recurrent dropout for regularization
    SimpleRNN(256, return_sequences=True, recurrent_dropout=0.2),
    BatchNormalization(),  # Added before dropout for more stable training
    Dropout(0.3),  # Reduced slightly from 0.5 to avoid underfitting
    
    # Second RNN with additional regularization
    SimpleRNN(128, return_sequences=False, recurrent_dropout=0.2),
    BatchNormalization(),
    Dropout(0.3),
    
    # Multiple dense layers with gradually decreasing units for better feature extraction
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(64, activation='relu'),
    BatchNormalization(),
    
    # Output layer
    Dense(3, activation='softmax')
])

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

model.summary()



In [None]:
class CustomCallback(Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get("accuracy") > 92 and logs.get("val_accuracy") > 0.92):
            print("\nProses training dihentikan karena Akurasi telah melampaui 92%")
            self.model.stop_training = True

In [None]:
custom_callback = CustomCallback()

In [None]:
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint("best_model_rnn.h5", monitor="val_loss", save_best_only=True)

history1 = model.fit(
    x_train_pad,
    y_train, 
    epochs=20, 
    batch_size=64, 
    validation_data=(x_test_pad, y_test),
    verbose=1,
    callbacks=[model_checkpoint, custom_callback]        
)

Epoch 1/20
[1m 49/225[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m27s[0m 155ms/step - accuracy: 0.3439 - loss: 1.4354

KeyboardInterrupt: 

#### LSTM

In [None]:
model_lstm = Sequential([
    # Embedding layer
    Embedding(
        input_dim=max_words, 
        output_dim=embedding_dim, 
        embeddings_regularizer=tf.keras.regularizers.l2(1e-5)
    ),
    
    # First Bidirectional LSTM layer
    Bidirectional(
        LSTM(
            128, 
            return_sequences=True, 
            dropout=0.2,
            kernel_regularizer=tf.keras.regularizers.l2(1e-5)
        )
    ),
    BatchNormalization(),
    
    # Second Bidirectional LSTM layer
    Bidirectional(
        LSTM(
            64, 
            return_sequences=False, 
            dropout=0.2
        )
    ),
    BatchNormalization(),
    Dropout(0.3),
    
    # Fully connected dense layer
    Dense(
        64, 
        activation='relu', 
        kernel_regularizer=tf.keras.regularizers.l2(1e-5)
    ),
    BatchNormalization(),
    Dropout(0.3),
    
    # Output layer
    Dense(3, activation='softmax')
])

optimizer = Adam(learning_rate=0.001, clipnorm=1.0)  # Gradient clipping helps with exploding gradients
model_lstm.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model_lstm.summary()

In [None]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))

In [None]:
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint("best_model_lstm.h5", monitor="val_loss", save_best_only=True)

history2 = model_lstm.fit(
    x_train_pad,
    y_train, 
    epochs=10, 
    batch_size=64, 
    validation_data=(x_test_pad, y_test),
    verbose=1,
    callbacks=[model_checkpoint, custom_callback],
    class_weight=class_weights    
)

Epoch 1/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step - accuracy: 0.9689 - loss: 0.1467



[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 263ms/step - accuracy: 0.9689 - loss: 0.1468 - val_accuracy: 0.8747 - val_loss: 0.5055
Epoch 2/10
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step - accuracy: 0.9459 - loss: 0.2137

In [None]:
test_loss, test_acc = model_lstm.evaluate(x_test_pad, y_test)
train_acc = history2.history["accuracy"][-1]

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 44ms/step - accuracy: 0.8642 - loss: 0.5713
Train Accuracy: 0.9804
Test Accuracy: 0.8669
