In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# unduh stop words (hanya perlu dijalankan sekali)
nltk.download('stopwords')

# memuat dataset
df = pd.read_csv('IMDB Dataset.csv')

print("data awal (5 barisan pertama):")
print(df.head())
print("-" * 50)

#3. pembersihan dan pra-pemrosesan data
#konversi kolom sentiment
df['sentiment'] = df['sentiment'].map({'positive' : 1, 'negative' : 0})

#inisialisasi stop words dan stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

#fungsi untuk membersihkan data
def clean_data(text):
  #menghapus tag HTML
  text = re.sub(r'<.*?>', '', text)

  #mengubah teks menjadi huruf kecil
  text = text.lower()

  #menghapus tanda baca dan karakter non-alfabet
  text = re.sub(r'[^a-z\s]', '', text) # Corrected regex

  #tokenisasi
  tokens = text.split()

  #menghapus stop words
  tokens = [word for word in tokens if word not in stop_words]

  #steming
  tokens = [stemmer.stem(word) for word in tokens]

  #menggabungkan kembali token menjadi string
  return ' '. join(tokens)

#menerapkan fungsi pembersihan ke kolom 'review'
df['clean_review'] = df['review'].apply(clean_data) # Added apply function

print("data setelah pembersihan:")
#tampilkan kolom review asli dan clean_review yang baru
print(df[['review', 'clean_review']].head())

#tampilkan ringkasan data yang sudah bersih
print("-" * 50)
print("informasi dataframe setelah dibersihkan")
df.info()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


data awal (5 barisan pertama):
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
--------------------------------------------------
data setelah pembersihan:
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                        clean_review  
0  one review mention watch oz episod youll hook ...  
1  wonder littl product film techniqu unassum old...  
2  thought w

In [8]:
#import library untuk model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#vektorisasi teks
#inisialisasi TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

#terapkan TF-IDF ke kolom clean review
#'x' akan berisi representasi numerik dari teks
X = tfidf_vectorizer.fit_transform(df['clean_review'])
y = df['sentiment']

#membagi data menjadi data lath dan uji
#bagi data dengan perbandingan 80% dan uji 20%
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("jumlah data latih : ", x_train.shape[0])
print("jumlah data uji : ", x_test.shape[0])
print("-" * 50)

#melatih model logistic regression
#inisialisasi model
model = LogisticRegression()

#latih model menggunakan data latih
model.fit(x_train, y_train)

print("model berhasil dilatih")
print("-" * 50)

#evaluasi model
#lakukan prediksi pada data uji
y_pred = model.predict(x_test)

#hitung akurasi model
accuracy = accuracy_score(y_test, y_pred)
print(f"akurasi model: {accuracy:.4f}")

#tampilkan laporan klasifikasi lengkap
print("laporan klasifikasi:")
print(classification_report(y_test, y_pred))

jumlah data latih :  40000
jumlah data uji :  10000
--------------------------------------------------
model berhasil dilatih
--------------------------------------------------
akurasi model: 0.8838
laporan klasifikasi:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4961
           1       0.87      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



Tahap 1: Memuat dan Membersihkan Data


Tahap 2: Pemodelan dan Evaluasi

Proyek Data Science: Analisis Sentimen Ulasan Film