## Proyek Analisis Sentimen Dicoding

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./dataset/duolingo_review.csv")
df.drop(columns=["userName", "userImage", "replyContent", "repliedAt", "appVersion", "reviewCreatedVersion"], axis=1, inplace=True)
df.head()

Unnamed: 0,reviewId,review,score,thumbsUpCount,at
0,b5691293-970e-422e-a708-208e4cf5744e,Baguss banget Aplikasinya ✨️🔥 sangat membantu ...,5,6,2025-04-23 14:23:11
1,558f02c7-4fa5-4086-b70d-4513b23bc989,"Duolingo memang baguss polll, aku suka banget....",3,36,2025-04-05 12:42:35
2,29e68c3d-cd29-404d-b63b-21cbb0bf06f8,Aplikasi keren saya banyak belajar lewat aplik...,5,43,2025-04-22 05:46:12
3,9803c43e-346d-4ecd-bd50-3d794cb3fc4c,"Untuk Developer, Tolong dong saat kita melakuk...",4,5,2025-04-22 00:24:08
4,a0caa63f-f6e5-49e7-a843-e996e22c4b25,"Suka bgtt sma Duolingo ini, belajar jdi lebih ...",5,18,2025-04-22 19:08:05


### Data Preprocessing

In [3]:
# Clean Dataset
clean_df = df.dropna()

# Drop Duplicate
clean_df = clean_df.drop_duplicates()
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   reviewId       18000 non-null  object
 1   review         18000 non-null  object
 2   score          18000 non-null  int64 
 3   thumbsUpCount  18000 non-null  int64 
 4   at             18000 non-null  object
dtypes: int64(2), object(3)
memory usage: 703.3+ KB


### Text Preprocessing

In [4]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_

True

In [5]:
def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka
    text = re.sub(r'(.)\1+', r'\1', text)  # menggantikan huruf yang berulang menjadi satu huruf
    
    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text
 
def casefoldingText(text): # Mengubah semua karakter dalam teks menjadi huruf kecil
    text = text.lower()
    return text
 
def tokenizingText(text): # Memecah atau membagi string, teks menjadi daftar token
    text = word_tokenize(text)
    return text
 
def filteringText(text): # Menghapus stopwords dalam teks
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords1 = set(stopwords.words('english'))
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text
 
def stemmingText(text): # Mengurangi kata ke bentuk dasarnya yang menghilangkan imbuhan awalan dan akhiran atau ke akar kata
    # Membuat objek stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
 
    # Memecah teks menjadi daftar kata
    words = text.split()
 
    # Menerapkan stemming pada setiap kata dalam daftar
    stemmed_words = [stemmer.stem(word) for word in words]
 
    # Menggabungkan kata-kata yang telah distem
    stemmed_text = ' '.join(stemmed_words)
 
    return stemmed_text
 
def toSentence(list_words): # Mengubah daftar kata menjadi kalimat
    sentence = ' '.join(word for word in list_words)
    return sentence

slangwords = {"@": "di", "abis": "habis", "wtb": "beli", "masi": "masih", "wts": "jual", "wtt": "tukar", "bgt": "banget", "maks": "maksimal"}
def fix_slangwords(text):
    words = text.split()
    fixed_words = []
 
    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)
 
    fixed_text = ' '.join(fixed_words)
    return fixed_text


In [6]:
# Membersihkan teks dan menyimpannya di kolom 'text_clean'
clean_df['text_clean'] = clean_df['review'].apply(cleaningText)
 
# Mengubah huruf dalam teks menjadi huruf kecil dan menyimpannya di 'text_casefoldingText'
clean_df['text_casefoldingText'] = clean_df['text_clean'].apply(casefoldingText)
 
# Mengganti kata-kata slang dengan kata-kata standar dan menyimpannya di 'text_slangwords'
clean_df['text_slangwords'] = clean_df['text_casefoldingText'].apply(fix_slangwords)
 
# Memecah teks menjadi token (kata-kata) dan menyimpannya di 'text_tokenizingText'
clean_df['text_tokenizingText'] = clean_df['text_slangwords'].apply(tokenizingText)
 
# Menghapus kata-kata stop (kata-kata umum) dan menyimpannya di 'text_stopword'
clean_df['text_stopword'] = clean_df['text_tokenizingText'].apply(filteringText)
 
# Menggabungkan token-token menjadi kalimat dan menyimpannya di 'text_akhir'
clean_df['text_akhir'] = clean_df['text_stopword'].apply(toSentence)

In [7]:
clean_df.head()

Unnamed: 0,reviewId,review,score,thumbsUpCount,at,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir
0,b5691293-970e-422e-a708-208e4cf5744e,Baguss banget Aplikasinya ✨️🔥 sangat membantu ...,5,6,2025-04-23 14:23:11,Bagus banget Aplikasinya sangat membantu dalam...,bagus banget aplikasinya sangat membantu dalam...,bagus banget aplikasinya sangat membantu dalam...,"[bagus, banget, aplikasinya, sangat, membantu,...","[bagus, banget, aplikasinya, membantu, mengasa...",bagus banget aplikasinya membantu mengasah ski...
1,558f02c7-4fa5-4086-b70d-4513b23bc989,"Duolingo memang baguss polll, aku suka banget....",3,36,2025-04-05 12:42:35,Duolingo memang bagus pol aku suka banget yang...,duolingo memang bagus pol aku suka banget yang...,duolingo memang bagus pol aku suka banget yang...,"[duolingo, memang, bagus, pol, aku, suka, bang...","[duolingo, bagus, pol, suka, banget, akhir², i...",duolingo bagus pol suka banget akhir² iklanya ...
2,29e68c3d-cd29-404d-b63b-21cbb0bf06f8,Aplikasi keren saya banyak belajar lewat aplik...,5,43,2025-04-22 05:46:12,Aplikasi keren saya banyak belajar lewat aplik...,aplikasi keren saya banyak belajar lewat aplik...,aplikasi keren saya banyak belajar lewat aplik...,"[aplikasi, keren, saya, banyak, belajar, lewat...","[aplikasi, keren, belajar, aplikasi, kekuranga...",aplikasi keren belajar aplikasi kekuranganya s...
3,9803c43e-346d-4ecd-bd50-3d794cb3fc4c,"Untuk Developer, Tolong dong saat kita melakuk...",4,5,2025-04-22 00:24:08,Untuk Developer Tolong dong sat kita melakukan...,untuk developer tolong dong sat kita melakukan...,untuk developer tolong dong sat kita melakukan...,"[untuk, developer, tolong, dong, sat, kita, me...","[developer, tolong, sat, kesalahan, kekurangan...",developer tolong sat kesalahan kekurangan huru...
4,a0caa63f-f6e5-49e7-a843-e996e22c4b25,"Suka bgtt sma Duolingo ini, belajar jdi lebih ...",5,18,2025-04-22 19:08:05,Suka bgt sma Duolingo ini belajar jdi lebih se...,suka bgt sma duolingo ini belajar jdi lebih se...,suka banget sma duolingo ini belajar jdi lebih...,"[suka, banget, sma, duolingo, ini, belajar, jd...","[suka, banget, sma, duolingo, belajar, jdi, se...",suka banget sma duolingo belajar jdi seru muda...


### Labeling

In [8]:
import csv
import requests
from io import StringIO

In [9]:
# Membaca data kamus kata-kata positif dari GitHub
lexicon_positive = dict()
 
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub
 
if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma
 
    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_positive[row[0]] = int(row[1])
        # Menambahkan kata-kata positif dan skornya ke dalam kamus lexicon_positive
else:
    print("Failed to fetch positive lexicon data")
 
# Membaca data kamus kata-kata negatif dari GitHub
lexicon_negative = dict()
 
response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')
# Mengirim permintaan HTTP untuk mendapatkan file CSV dari GitHub
 
if response.status_code == 200:
    # Jika permintaan berhasil
    reader = csv.reader(StringIO(response.text), delimiter=',')
    # Membaca teks respons sebagai file CSV menggunakan pembaca CSV dengan pemisah koma
 
    for row in reader:
        # Mengulangi setiap baris dalam file CSV
        lexicon_negative[row[0]] = int(row[1])
        # Menambahkan kata-kata negatif dan skornya dalam kamus lexicon_negative
else:
    print("Failed to fetch negative lexicon data")

In [10]:
import nltk
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [11]:
def sentiment_analysis_lexicon_indonesia(text):
    #for word in text:
 
    score = 0
    # Inisialisasi skor sentimen ke 0
 
    for word in text:
        # Mengulangi setiap kata dalam teks
 
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]
            # Jika kata ada dalam kamus positif, tambahkan skornya ke skor sentimen
 
    for word in text:
        # Mengulangi setiap kata dalam teks (sekali lagi)
 
        if (word in lexicon_negative):
            score = score + lexicon_negative[word]
            # Jika kata ada dalam kamus negatif, kurangkan skornya dari skor sentimen
 
    polarity=''
    # Inisialisasi variabel polaritas
 
    if (score > 0):
        polarity = 'positive'
        # Jika skor sentimen lebih besar atau sama dengan 0, maka polaritas adalah positif
    elif (score < 0):
        polarity = 'negative'
        # Jika skor sentimen kurang dari 0, maka polaritas adalah negatif
    else:
        polarity = 'neutral'
 
    return score, polarity

In [12]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

def sentiment_analysis_lexicon_vader(text):
    scores = sia.polarity_scores(text)
    
    # Extract individual scores
    neg_score = scores["neg"]
    neu_score = scores["neu"]
    pos_score = scores["pos"]
    compound = scores["compound"]
    
    # Determine polarity based on the highest score
    if neu_score > neg_score and neu_score > pos_score:
        polarity = "neutral"
    elif pos_score > neg_score:
        polarity = "positive"
    else:
        polarity = "negative"
    
    return compound, polarity, scores

In [13]:
results = clean_df['text_stopword'].apply(sentiment_analysis_lexicon_indonesia)
# results = clean_df["text_akhir"].apply(sentiment_analysis_lexicon_vader)
results = list(zip(*results))
clean_df['polarity_score'] = results[0]
clean_df['polarity'] = results[1]
clean_df["polarity"].value_counts()

polarity
negative    10057
positive     6670
neutral      1273
Name: count, dtype: int64

In [14]:
clean_df.head()

Unnamed: 0,reviewId,review,score,thumbsUpCount,at,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir,polarity_score,polarity
0,b5691293-970e-422e-a708-208e4cf5744e,Baguss banget Aplikasinya ✨️🔥 sangat membantu ...,5,6,2025-04-23 14:23:11,Bagus banget Aplikasinya sangat membantu dalam...,bagus banget aplikasinya sangat membantu dalam...,bagus banget aplikasinya sangat membantu dalam...,"[bagus, banget, aplikasinya, sangat, membantu,...","[bagus, banget, aplikasinya, membantu, mengasa...",bagus banget aplikasinya membantu mengasah ski...,6,positive
1,558f02c7-4fa5-4086-b70d-4513b23bc989,"Duolingo memang baguss polll, aku suka banget....",3,36,2025-04-05 12:42:35,Duolingo memang bagus pol aku suka banget yang...,duolingo memang bagus pol aku suka banget yang...,duolingo memang bagus pol aku suka banget yang...,"[duolingo, memang, bagus, pol, aku, suka, bang...","[duolingo, bagus, pol, suka, banget, akhir², i...",duolingo bagus pol suka banget akhir² iklanya ...,-2,negative
2,29e68c3d-cd29-404d-b63b-21cbb0bf06f8,Aplikasi keren saya banyak belajar lewat aplik...,5,43,2025-04-22 05:46:12,Aplikasi keren saya banyak belajar lewat aplik...,aplikasi keren saya banyak belajar lewat aplik...,aplikasi keren saya banyak belajar lewat aplik...,"[aplikasi, keren, saya, banyak, belajar, lewat...","[aplikasi, keren, belajar, aplikasi, kekuranga...",aplikasi keren belajar aplikasi kekuranganya s...,7,positive
3,9803c43e-346d-4ecd-bd50-3d794cb3fc4c,"Untuk Developer, Tolong dong saat kita melakuk...",4,5,2025-04-22 00:24:08,Untuk Developer Tolong dong sat kita melakukan...,untuk developer tolong dong sat kita melakukan...,untuk developer tolong dong sat kita melakukan...,"[untuk, developer, tolong, dong, sat, kita, me...","[developer, tolong, sat, kesalahan, kekurangan...",developer tolong sat kesalahan kekurangan huru...,-26,negative
4,a0caa63f-f6e5-49e7-a843-e996e22c4b25,"Suka bgtt sma Duolingo ini, belajar jdi lebih ...",5,18,2025-04-22 19:08:05,Suka bgt sma Duolingo ini belajar jdi lebih se...,suka bgt sma duolingo ini belajar jdi lebih se...,suka banget sma duolingo ini belajar jdi lebih...,"[suka, banget, sma, duolingo, ini, belajar, jd...","[suka, banget, sma, duolingo, belajar, jdi, se...",suka banget sma duolingo belajar jdi seru muda...,12,positive


In [59]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

x = clean_df["text_akhir"]
y = clean_df["polarity"]
encoded_label = label_encoder.fit_transform(y)


x_train, x_test, y_train, y_test = train_test_split(x, encoded_label, test_size=0.2, random_state=42)
x_train_70, x_test_30, y_train_70, y_test_30 = train_test_split(x, encoded_label, test_size=0.30, random_state=42)
x_train_80, x_test_20, y_train_80, y_test_20 = train_test_split(x, encoded_label, test_size=0.20, random_state=42)

#### Word2Vec

In [17]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import nltk
import numpy as np

nltk.download("punkt")

X_train_processed = [simple_preprocess(text) for text in x_train]
X_test_processed = [simple_preprocess(text) for text in x_test]

w2v_model = Word2Vec(sentences=X_train_processed, vector_size=100, window=5, min_count=1, workers=4)

def get_document_vector(doc, model):
    word_vectors = [model.wv[word] for word in doc if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)
    
x_train_vec = np.array([get_document_vector(doc, w2v_model) for doc in X_train_processed])
x_test_vec = np.array([get_document_vector(doc, w2v_model) for doc in X_test_processed])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Modelling

In [None]:
from sklearn.metrics import accuracy_score

#### Pipeline

In [20]:
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
    
# Helper function to create Word2Vec features
def word2vec_features(texts, model):
    features = []
    for text in texts:
        # Tokenize and average word vectors
        words = text.split()  # simple tokenization (you can use more advanced tokenizers)
        word_vectors = [model.wv[word] for word in words if word in model.wv]
        
        # If no words in the document are in the Word2Vec model, append a zero vector
        if word_vectors:
            features.append(np.mean(word_vectors, axis=0))
        else:
            features.append(np.zeros(model.vector_size))  # Zero vector for empty word vectors
    
    return np.array(features)

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return word2vec_features(X, self.model)

# ---- TF-IDF Pipelines ----
tfidf_nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

tfidf_lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=1000, class_weight=class_weights))
])

tfidf_rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(class_weight=class_weights))
])

# ---- BoW   Pipeline ----
bow_nb_pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('nb', MultinomialNB())
])

bow_lr_pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=1000, class_weight=class_weights))
])

bow_rf_pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('rf', RandomForestClassifier(class_weight=class_weights))
])

# ---- Word2Vec Pipelines ----
word2vec_lr_pipeline = Pipeline([
    ('word2vec', Word2VecTransformer(w2v_model)),  # You can optimize the model by using pre-trained Word2Vec
    ('lr', LogisticRegression(max_iter=1000, class_weight=class_weights))
])

word2vec_rf_pipeline = Pipeline([
    ('word2vec', Word2VecTransformer(w2v_model)),  # You can optimize the model by using pre-trained Word2Vec
    ('rf', RandomForestClassifier(class_weight=class_weights))
])

# Parameter Grid
param_grids = {
    'TF-IDF + Naive Bayes': {
        'tfidf__ngram_range': [(1,1), (1,2), (2, 2)],
        'tfidf__min_df': [1, 5, 10],
        'tfidf__max_df': [0.85, 0.9, 1.0],
        'nb__alpha': [0.5, 1.0]
    },
    'TF-IDF + Logistic Regression': {
        'tfidf__ngram_range': [(1,1), (1,2)],
        'tfidf__min_df': [1, 5],
        'tfidf__max_df': [0.9, 1.0],
        'lr__C': [0.1, 1, 10]
    },
    'TF-IDF + Random Forest': {
        'tfidf__ngram_range': [(1,1)],
        'rf__n_estimators': [100, 200],
        'rf__max_depth': [None, 10, 20]
    },
    'BoW + Naive Bayes': {
        'bow__ngram_range': [(1,1), (1,2), (2, 2)],
        'bow__min_df': [1, 5, 10],
        'bow__max_df': [0.85, 0.9, 1.0],
        'nb__alpha': [0.5, 1.0]
    },
    'BoW + Logistic Regression': {
        'bow__ngram_range': [(1,1), (1,2)],
        'bow__min_df': [1, 5],
        'bow__max_df': [0.9, 1.0],
        'lr__C': [0.1, 1, 10]
    },
    'BoW + Random Forest': {
        'bow__ngram_range': [(1,1)],
        'rf__n_estimators': [100, 200],
        'rf__max_depth': [None, 10, 20]
    },
    'Word2Vec + Logistic Regression': {
        'lr__C': [0.1, 1, 10]
    },
    'Word2Vec + Random Forest': {
        'rf__n_estimators': [100, 200],
        'rf__max_depth': [None, 10, 20]
    }
}

# ---- Training each model ----
pipelines = {
    'TF-IDF + Naive Bayes': tfidf_nb_pipeline,
    'TF-IDF + Logistic Regression': tfidf_lr_pipeline,
    'TF-IDF + Random Forest': tfidf_rf_pipeline,
    'BoW + Naive Bayes': bow_nb_pipeline,
    'BoW + Logistic Regression': bow_lr_pipeline,
    'BoW + Random Forest': bow_rf_pipeline,
    'Word2Vec + Logistic Regression': word2vec_lr_pipeline,
    'Word2Vec + Random Forest': word2vec_rf_pipeline
}

In [72]:
from sklearn.model_selection import GridSearchCV
import joblib
import os

# Create a directory to store saved models
os.makedirs("saved_models", exist_ok=True)

results = []

def train_and_evaluate_with_gridsearch(pipeline, name, param_grid, X_train, X_test, y_train, y_test, split_ratio):
    print(f"Training with {split_ratio} split...")
    grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)

    print(f"Evaluating with {split_ratio} split...")
    best_params = grid.best_params_
    best_cv_score = grid.best_score_
    test_accuracy = grid.score(X_test, y_test)

    print(f"Best Parameters: {best_params}")
    print(f"Best CV Accuracy: {best_cv_score}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Store results for the current split
    results.append({
        "name" : f"{split_ratio}_{name}",
        "split_ratio": split_ratio,
        "best_params": best_params,
        "cv_score": best_cv_score,
        "test_accuracy": test_accuracy,
        "estimator": grid.best_estimator_
    })

for name, pipeline in pipelines.items():
    print(f"\nTraining and evaluating pipeline: {name}...")

    # Evaluate for 70:30 split
    train_and_evaluate_with_gridsearch(pipeline, name, param_grids[name], x_train_70, x_test_30, y_train_70, y_test_30, "70:30")

    # Evaluate for 80:20 split
    train_and_evaluate_with_gridsearch(pipeline, name, param_grids[name], x_train_80, x_test_20, y_train_80, y_test_20, "80:20")

# Train and evaluate each pipeline
# for name, pipeline in pipelines.items():
#     print(f"Training {name}...")
#     grid = GridSearchCV(pipeline, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
#     grid.fit(x_train, y_train)

#     print(f"Evaluating {name}...")
#     best_params = grid.best_params_
#     best_cv_score = grid.best_score_
#     test_accuracy = grid.score(x_test, y_test)

#     print("Best Parameters:", best_params)
#     print("Best CV Accuracy:", best_cv_score)
#     print(f"Test Accuracy: {test_accuracy:.4f}")
    
#     # Save the model
#     model_filename = f"saved_models/{name}_model.pkl"
#     joblib.dump(grid.best_estimator_, model_filename)

#     # Store results
#     results.append({
#         "name": name,
#         "model_path": model_filename,
#         "best_params": best_params,
#         "cv_score": best_cv_score,
#         "test_accuracy": test_accuracy,
#         "estimator": grid.best_estimator_ 
#     })



Training and evaluating pipeline: TF-IDF + Naive Bayes...
Training with 70:30 split...
Evaluating with 70:30 split...
Best Parameters: {'nb__alpha': 0.5, 'tfidf__max_df': 0.85, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}
Best CV Accuracy: 0.7585714285714286
Test Accuracy: 0.7622
Training with 80:20 split...
Evaluating with 80:20 split...
Best Parameters: {'nb__alpha': 0.5, 'tfidf__max_df': 0.85, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2)}
Best CV Accuracy: 0.7593055555555556
Test Accuracy: 0.7656

Training and evaluating pipeline: TF-IDF + Logistic Regression...
Training with 70:30 split...
Evaluating with 70:30 split...
Best Parameters: {'lr__C': 10, 'tfidf__max_df': 0.9, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1)}
Best CV Accuracy: 0.8546031746031748
Test Accuracy: 0.8541
Training with 80:20 split...
Evaluating with 80:20 split...
Best Parameters: {'lr__C': 10, 'tfidf__max_df': 0.9, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1)}
Best CV Accuracy: 0.85875
Test A

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating with 80:20 split...
Best Parameters: {'lr__C': 10}
Best CV Accuracy: 0.6551388888888889
Test Accuracy: 0.6639

Training and evaluating pipeline: Word2Vec + Random Forest...
Training with 70:30 split...
Evaluating with 70:30 split...
Best Parameters: {'rf__max_depth': 20, 'rf__n_estimators': 100}
Best CV Accuracy: 0.6859523809523809
Test Accuracy: 0.6926
Training with 80:20 split...
Evaluating with 80:20 split...
Best Parameters: {'rf__max_depth': 20, 'rf__n_estimators': 200}
Best CV Accuracy: 0.6934722222222223
Test Accuracy: 0.6936


In [83]:
# Sort by best CV accuracy in descending order
top_models = sorted(results, key=lambda x: x['cv_score'], reverse=True)[:3]

# Print top 3 models
print("\nTop 3 Models:")
for i, model_info in enumerate(top_models, 1):
    print(f"\nRank {i}: {model_info['name']}")
    print(f"CV Accuracy: {model_info['cv_score']:.4f}")
    print(f"Test Accuracy: {model_info['test_accuracy']:.4f}")
    print(f"Best Params: {model_info['best_params']}")
    model_name_split = model_info["name"].split(":")
    model_name = "_".join(model_name_split)
    joblib.dump(model_info["estimator"], f"saved_models/{model_name}.pkl")
    # print(f"Model Saved At: {model_info['model_path']}")


Top 3 Models:

Rank 1: 80:20_BoW + Logistic Regression
CV Accuracy: 0.8653
Test Accuracy: 0.8664
Best Params: {'bow__max_df': 0.9, 'bow__min_df': 1, 'bow__ngram_range': (1, 1), 'lr__C': 10}

Rank 2: 80:20_TF-IDF + Logistic Regression
CV Accuracy: 0.8588
Test Accuracy: 0.8594
Best Params: {'lr__C': 10, 'tfidf__max_df': 0.9, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 1)}

Rank 3: 70:30_BoW + Logistic Regression
CV Accuracy: 0.8583
Test Accuracy: 0.8630
Best Params: {'bow__max_df': 0.9, 'bow__min_df': 1, 'bow__ngram_range': (1, 1), 'lr__C': 10}


### Deep Learning

In [23]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional, BatchNormalization, LayerNormalization
from tensorflow.keras.callbacks import  ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2


In [24]:
max_words = 5000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

x_train_pad = pad_sequences(x_train_seq, maxlen=max_len, padding="post")
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len, padding="post")

In [25]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))

In [26]:
class CustomCallback(Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get("accuracy") > 92 and logs.get("val_accuracy") > 0.92):
            print("\nProses training dihentikan karena Akurasi telah melampaui 92%")
            self.model.stop_training = True

In [27]:
custom_callback = CustomCallback()

#### LSTM

In [28]:
embedding_dim = 128

model_lstm = Sequential([
    # Embedding layer
    Embedding(
        input_dim=max_words, 
        output_dim=embedding_dim, 
        embeddings_regularizer=l2(1e-5)
    ),
    
    # First Bidirectional LSTM layer
    Bidirectional(
        LSTM(
            128, 
            return_sequences=True, 
            dropout=0.2,
            kernel_regularizer=l2(1e-5)
        )
    ),
    BatchNormalization(),
    
    # Second Bidirectional LSTM layer
    Bidirectional(
        LSTM(
            64, 
            return_sequences=False, 
            dropout=0.2
        )
    ),
    BatchNormalization(),
    Dropout(0.3),
    
    # Fully connected dense layer
    Dense(
        64, 
        activation='relu', 
        kernel_regularizer=l2(1e-5)
    ),
    BatchNormalization(),
    Dropout(0.3),
    
    # Output layer
    Dense(3, activation='softmax')
])

optimizer = Adam(learning_rate=0.001, clipnorm=1.0)  # Gradient clipping helps with exploding gradients
model_lstm.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model_lstm.summary()

In [29]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))

In [None]:
model_checkpoint = ModelCheckpoint("saved_models/best_model_lstm.h5", monitor="val_loss", save_best_only=True)

history_lstm = model_lstm.fit(
    x_train_pad,
    y_train, 
    epochs=20, 
    batch_size=64, 
    validation_data=(x_test_pad, y_test),
    verbose=1,
    callbacks=[model_checkpoint, custom_callback],
    class_weight=class_weights    
)

Epoch 1/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step - accuracy: 0.4580 - loss: 1.3395



[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 282ms/step - accuracy: 0.4584 - loss: 1.3385 - val_accuracy: 0.5997 - val_loss: 0.8663
Epoch 2/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step - accuracy: 0.7586 - loss: 0.6694



[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 256ms/step - accuracy: 0.7586 - loss: 0.6692 - val_accuracy: 0.7511 - val_loss: 0.5451
Epoch 3/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 259ms/step - accuracy: 0.8196 - loss: 0.5031 - val_accuracy: 0.7783 - val_loss: 0.5565
Epoch 4/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 257ms/step - accuracy: 0.8625 - loss: 0.3924 - val_accuracy: 0.7300 - val_loss: 0.7675
Epoch 5/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 308ms/step - accuracy: 0.8967 - loss: 0.3227 - val_accuracy: 0.7414 - val_loss: 0.7635
Epoch 6/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 332ms/step - accuracy: 0.9044 - loss: 0.2999 - val_accuracy: 0.5528 - val_loss: 1.6489
Epoch 7/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 308ms/step - accuracy: 0.9163 - loss: 0.2818 - val_accuracy: 0.7947 - val_loss: 0.6156
Epoch 8/20
[1m225/22



[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 278ms/step - accuracy: 0.9239 - loss: 0.2527 - val_accuracy: 0.8367 - val_loss: 0.4860
Epoch 9/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 275ms/step - accuracy: 0.9325 - loss: 0.2348 - val_accuracy: 0.8597 - val_loss: 0.5073
Epoch 10/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step - accuracy: 0.9426 - loss: 0.1999



[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 270ms/step - accuracy: 0.9426 - loss: 0.1999 - val_accuracy: 0.8744 - val_loss: 0.4768
Epoch 11/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 267ms/step - accuracy: 0.9537 - loss: 0.1819 - val_accuracy: 0.8525 - val_loss: 0.5689
Epoch 12/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 267ms/step - accuracy: 0.9579 - loss: 0.1568 - val_accuracy: 0.8772 - val_loss: 0.5495
Epoch 13/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 269ms/step - accuracy: 0.9522 - loss: 0.1719 - val_accuracy: 0.8481 - val_loss: 0.6618
Epoch 14/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 267ms/step - accuracy: 0.9587 - loss: 0.1534 - val_accuracy: 0.8581 - val_loss: 0.5830
Epoch 15/20
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 268ms/step - accuracy: 0.9692 - loss: 0.1299 - val_accuracy: 0.8331 - val_loss: 0.6903
Epoch 16/20
[1m

In [31]:
test_loss, test_acc = model_lstm.evaluate(x_test_pad, y_test)
train_acc = history_lstm.history["accuracy"][-1]

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 60ms/step - accuracy: 0.8656 - loss: 0.5691
Train Accuracy: 0.9712
Test Accuracy: 0.8653


#### Transformers

In [32]:
encoding_numbers = list(range(0,len(label_encoder.classes_)))
d = dict(zip(encoding_numbers, label_encoder.inverse_transform(encoding_numbers)))
d

{0: 'negative', 1: 'neutral', 2: 'positive'}

In [33]:
import evaluate

accuracy = evaluate.load("accuracy")

In [34]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [35]:
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

In [36]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)







In [37]:
from datasets import Dataset

# Combine text and label
train_data = Dataset.from_dict({"text": x_train, "label": y_train})
test_data = Dataset.from_dict({"text": x_test, "label": y_test})

# Tokenize
tokenized_train = train_data.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True), batched=True)
tokenized_test = test_data.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True), batched=True)

# Set format for PyTorch
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/14400 [00:00<?, ? examples/s]

Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

In [38]:
import os
from huggingface_hub import login
from dotenv import load_dotenv

load_dotenv()

hf_token = os.getenv("HF_TOKEN")
login(hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [42]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=3, 
    id2label=id2label, 
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir="duolingo-sentiment-distillbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=None,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6773,0.502435,0.806944
2,0.4393,0.442578,0.833889




TrainOutput(global_step=1800, training_loss=0.5283406787448459, metrics={'train_runtime': 32082.3876, 'train_samples_per_second': 0.898, 'train_steps_per_second': 0.056, 'total_flos': 3815129117491200.0, 'train_loss': 0.5283406787448459, 'epoch': 2.0})

In [43]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/rrtxt/duolingo-sentiment-distillbert/commit/c7532bcdc62cff1024aae673868dfc385096f284', commit_message='End of training', commit_description='', oid='c7532bcdc62cff1024aae673868dfc385096f284', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rrtxt/duolingo-sentiment-distillbert', endpoint='https://huggingface.co', repo_type='model', repo_id='rrtxt/duolingo-sentiment-distillbert'), pr_revision=None, pr_num=None)