In [3]:
!pip install gensim
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from gensim.models import Word2Vec




In [4]:
# ==============================================
# 1. LOAD DATA
# ==============================================
df = pd.read_csv("labeling spreadsheet - netflix_reviews_cleaned_final.csv")
df['content'] = df['content'].astype(str)



In [5]:
# ==============================================
# 2. PREPROCESSING
# ==============================================
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words("indonesian"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = text.split()
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [stemmer.stem(w) for w in tokens]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

df['clean_text'] = df['content'].apply(preprocess)
sentences = [t.split() for t in df['clean_text']]



X = df['clean_text']
y = df['score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
# ==============================================
# 4. EKSTRAKSI FITUR
# ==============================================

# (1) TF-IDF
tfidf_vec = TfidfVectorizer()
X_train_tfidf = tfidf_vec.fit_transform(X_train)
X_test_tfidf = tfidf_vec.transform(X_test)

# (2) BoW
bow_vec = CountVectorizer()
X_train_bow = bow_vec.fit_transform(X_train)
X_test_bow = bow_vec.transform(X_test)

# (3) Word2Vec
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1)

def to_w2v(text):
    words = text.split()
    vecs = [w2v_model.wv[w] for w in words if w in w2v_model.wv]
    if len(vecs) == 0:
        return np.zeros(100)
    return np.mean(vecs, axis=0)

X_train_w2v = np.array([to_w2v(t) for t in X_train])
X_test_w2v = np.array([to_w2v(t) for t in X_test])

# (4) GloVe = Word2Vec
def to_glove_mini(text):
    return to_w2v(text)

X_train_glove = X_train_w2v
X_test_glove = X_test_w2v


In [7]:

# ==============================================
# 5. EMPAT PERCOBAAN MODEL
# ==============================================

print("\n=== Percobaan 1: TF-IDF + Naive Bayes ===")
m1 = MultinomialNB()
m1.fit(X_train_tfidf, y_train)
p1 = m1.predict(X_test_tfidf)
print(classification_report(y_test, p1))


print("\n=== Percobaan 2: BoW + Naive Bayes ===")
m2 = MultinomialNB()
m2.fit(X_train_bow, y_train)
p2 = m2.predict(X_test_bow)
print(classification_report(y_test, p2))


print("\n=== Percobaan 3: Word2Vec + SVM ===")
m3 = SVC(kernel="linear")
m3.fit(X_train_w2v, y_train)
p3 = m3.predict(X_test_w2v)
print(classification_report(y_test, p3))


print("\n=== Percobaan 4: GloVe + SVM ===")
m4 = SVC(kernel="linear")
m4.fit(X_train_glove, y_train)
p4 = m4.predict(X_test_glove)
print(classification_report(y_test, p4))



=== Percobaan 1: TF-IDF + Naive Bayes ===
              precision    recall  f1-score   support

           1       0.64      1.00      0.78       113
           2       0.00      0.00      0.00        13
           3       0.00      0.00      0.00        12
           4       0.00      0.00      0.00        11
           5       0.83      0.33      0.47        46

    accuracy                           0.66       195
   macro avg       0.29      0.27      0.25       195
weighted avg       0.57      0.66      0.56       195


=== Percobaan 2: BoW + Naive Bayes ===
              precision    recall  f1-score   support

           1       0.67      0.96      0.79       113
           2       0.00      0.00      0.00        13
           3       0.33      0.08      0.13        12
           4       0.00      0.00      0.00        11
           5       0.72      0.46      0.56        46

    accuracy                           0.67       195
   macro avg       0.35      0.30      0.30     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
