In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import numpy as np
import nltk

In [3]:
# Load the review dataset into a DataFrame. The path is relative, so the CSV
# must sit in the notebook's working directory. Later cells read the 'review'
# and 'sentiment' columns from this frame.
data = pd.read_csv('IMDB Dataset.csv')

In [5]:
# Text preprocessing helpers
def clean_text(text):
    """Normalize a raw review into lowercase words separated by single spaces.

    Processing order:
      1. HTML-like tags (``<...>``) are replaced with a space.
      2. Every character outside ``a-z`` / ``A-Z`` is replaced with a space.
      3. The text is lowercased.
      4. Runs of whitespace are collapsed and the ends are trimmed.

    Tags are replaced with a space rather than deleted: deleting them glued
    together words that were separated only by markup, e.g.
    ``"good<br />movie"`` became the single token ``"goodmovie"``.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string; empty if the input contains no letters.
    """
    text = re.sub(r'<.*?>', ' ', text)  # Tags become a separator, not nothing
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Digits/punctuation become separators
    # Only letters and whitespace remain, so split/join collapses and trims in one step.
    return ' '.join(text.lower().split())

# Stop-word sets that have already been loaded, keyed by language name.
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop words for `language` as a frozenset, loading them once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def tokenize_and_remove_stopwords(text, stop_words=None):
    """Tokenize `text` with NLTK and drop stop words.

    The English stop-word list is loaded on first use and then reused; the
    previous version rebuilt the set from the corpus on every call, which is
    wasted work when the function is applied to every row of a DataFrame.

    NOTE(review): presumably requires the NLTK tokenizer models and the
    'stopwords' corpus to be installed; no download call is visible in this
    notebook - confirm on a fresh kernel.

    Args:
        text: The (already cleaned) string to tokenize.
        stop_words: Optional collection of tokens to remove. Defaults to
            NLTK's English stop-word list.

    Returns:
        A list of the tokens that are not stop words, in their original order.
    """
    if stop_words is None:
        stop_words = _get_stop_words('english')
    return [word for word in word_tokenize(text) if word not in stop_words]

In [7]:
# Text cleaning and tokenization: first normalize each raw review, then split
# the normalized text into tokens with stop words removed.
data['cleaned_review'] = data['review'].map(clean_text)
data['tokenized_review'] = data['cleaned_review'].map(tokenize_and_remove_stopwords)

In [8]:
# Split into train / validation / test.
# Step 1 holds out 20% of the rows; step 2 halves that hold-out, giving an
# 80/10/10 partition. Both steps use random_state=42.
y = data['sentiment']
X = data[['review', 'cleaned_review', 'tokenized_review']]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
)

In [9]:
# Word2Vec model. It is trained on the training split's token lists only, so
# no validation/test text reaches the embeddings. Per gensim's Word2Vec API,
# sg=1 selects skip-gram and min_count=2 ignores words seen fewer than twice.
# vector_size=100 is the embedding width that the feature-building cells
# below must agree with.
w2v_model = Word2Vec(sentences=X_train['tokenized_review'], vector_size=100, window=5, min_count=2, workers=4, sg=1)

In [10]:
# Function that converts a token list into a single Word2Vec feature vector
def text_to_w2v_vector(tokens, model, vector_size):
    """Average the embeddings of the tokens that `model` knows.

    Args:
        tokens: Iterable of word tokens.
        model: Object whose ``wv`` attribute supports ``in`` and ``[]`` lookups
            by word.
        vector_size: Length of the returned vector.

    Returns:
        A NumPy array of length `vector_size` holding the mean of the known
        tokens' vectors, or all zeros when no token is found in ``model.wv``.
    """
    embeddings = model.wv
    known_vectors = [embeddings[token] for token in tokens if token in embeddings]

    averaged = np.zeros(vector_size)
    for vector in known_vectors:
        averaged += vector
    if known_vectors:
        averaged /= len(known_vectors)
    return averaged

In [11]:
# Convert datasets to Word2Vec feature vectors
def build_w2v_features(token_lists, model):
    """Build an array with one averaged Word2Vec vector per token list.

    The vector width is read from ``model.vector_size`` rather than repeated
    as a literal, so it cannot drift from the size the model was trained with.

    Args:
        token_lists: Iterable of token lists, one per document.
        model: Trained Word2Vec model used for the lookups.

    Returns:
        A NumPy array built from the per-document vectors returned by
        ``text_to_w2v_vector``.
    """
    dim = model.vector_size
    return np.array([text_to_w2v_vector(tokens, model, dim) for tokens in token_lists])


X_train_w2v = build_w2v_features(X_train['tokenized_review'], w2v_model)
X_val_w2v = build_w2v_features(X_val['tokenized_review'], w2v_model)
X_test_w2v = build_w2v_features(X_test['tokenized_review'], w2v_model)

In [12]:
# Train the Logistic Regression classifier on the averaged Word2Vec training
# features. fit() returns the estimator itself, so construction and training
# are chained.
model_w2v = LogisticRegression(max_iter=1000).fit(X_train_w2v, y_train)
model_w2v  # keep the fitted estimator as the cell's displayed value

In [13]:
print("Experiment: Word2Vec Features")

# Validation split: predict, then print the heading and the per-class report
y_val_w2v_pred = model_w2v.predict(X_val_w2v)
print("Validation Set Evaluation:", classification_report(y_val, y_val_w2v_pred), sep="\n")

# Test split: same procedure on the held-out test features
y_test_w2v_pred = model_w2v.predict(X_test_w2v)
print("Test Set Evaluation:", classification_report(y_test, y_test_w2v_pred), sep="\n")

Experiment: Word2Vec Features
Validation Set Evaluation:
              precision    recall  f1-score   support

    negative       0.88      0.87      0.87      2499
    positive       0.87      0.88      0.88      2501

    accuracy                           0.88      5000
   macro avg       0.88      0.87      0.87      5000
weighted avg       0.88      0.88      0.87      5000

Test Set Evaluation:
              precision    recall  f1-score   support

    negative       0.87      0.87      0.87      2462
    positive       0.87      0.88      0.88      2538

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000

