**DENGAN OPTIMASI**

In [2]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Optimized experiment: TF-IDF (unigrams + bigrams) feeding a Multinomial
# Naive Bayes classifier, with hyperparameters chosen by a 5-fold
# cross-validated grid search. The tuned model is then scored on a held-out
# 20% test split.

# 1. Load the dataset
df = pd.read_csv('dataset_final_tes.csv')

# 2. Build one text feature from the 'name', 'description' and 'themes'
#    columns. Missing values are replaced by empty strings first so a single
#    missing field does not turn the whole concatenated text into NaN.
df['text'] = (
    df['name'].fillna('') + ' ' +
    df['description'].fillna('') + ' ' +
    df['themes'].fillna('')
)

# 3. Features (X) and integer-encoded target (y). The fitted encoder is kept
#    so the report below can show the original genre names.
X = df['text']
le = LabelEncoder()
y = le.fit_transform(df['genre'])

# 4. 80/20 train/test split, stratified on the label and seeded for
#    reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Pipeline: TF-IDF vectorizer + Multinomial Naive Bayes.
pipeline = Pipeline([
    # Unigrams and bigrams. max_features=5000 is only the initial value:
    # the grid search below sets 'tfidf__max_features' for every candidate.
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', MultinomialNB())
])

# 6. Hyperparameter grid: vocabulary size and the NB smoothing parameter alpha
#    (2 x 3 = 6 candidate combinations).
param_grid = {
    'tfidf__max_features': [5000, 10000],
    'clf__alpha': [0.5, 1.0, 1.5]
}

# 7. Grid search with 5-fold cross validation, selecting on accuracy.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Time the search with perf_counter(): it is the clock intended for measuring
# elapsed intervals, whereas time.time() is a wall clock that can be shifted
# by system clock adjustments. Note start_time/end_time are therefore
# relative counter readings, not epoch timestamps.
start_time = time.perf_counter()

# Run the hyperparameter search on the training split only.
grid_search.fit(X_train, y_train)

end_time = time.perf_counter()
processing_duration = end_time - start_time

# Report how long the whole grid search took.
print("Durasi pemrosesan Grid Search (detik):", processing_duration)

# Report the best parameter combination found.
print("Best parameters:", grid_search.best_params_)

# 8. Evaluate the best pipeline found by the search on the held-out test data.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nAkurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Durasi pemrosesan Grid Search (detik): 32.2230167388916
Best parameters: {'clf__alpha': 1.5, 'tfidf__max_features': 10000}

Akurasi: 0.8369351669941061

Classification Report:
              precision    recall  f1-score   support

      Comedy       0.81      0.78      0.80       509
       Drama       0.82      0.79      0.80       509
      Horror       0.88      0.94      0.91       509

    accuracy                           0.84      1527
   macro avg       0.84      0.84      0.84      1527
weighted avg       0.84      0.84      0.84      1527



**TANPA OPTIMASI**

In [2]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Baseline experiment (no tuning): TF-IDF with default settings feeding a
# Multinomial Naive Bayes classifier with default settings, trained once and
# scored on a held-out 20% test split.

# 1. Load the dataset
df = pd.read_csv('dataset_final_tes.csv')

# 2. Build one text feature from the 'name', 'description' and 'themes'
#    columns. Missing values are replaced by empty strings first so a single
#    missing field does not turn the whole concatenated text into NaN.
df['text'] = (
    df['name'].fillna('') + ' ' +
    df['description'].fillna('') + ' ' +
    df['themes'].fillna('')
)

# 3. Features (X) and integer-encoded target (y). The fitted encoder is kept
#    so the report below can show the original genre names.
X = df['text']
le = LabelEncoder()
y = le.fit_transform(df['genre'])

# 4. 80/20 train/test split, stratified on the label and seeded for
#    reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Pipeline: TF-IDF vectorizer + Multinomial Naive Bayes, both with their
#    default parameters.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# 6. Time the single fit with perf_counter(): it is the clock intended for
#    measuring elapsed intervals, whereas time.time() is a wall clock that can
#    be shifted by system clock adjustments. start_time/end_time are therefore
#    relative counter readings, not epoch timestamps.
start_time = time.perf_counter()
pipeline.fit(X_train, y_train)
end_time = time.perf_counter()
print("Durasi pemrosesan fit() (detik):", end_time - start_time)

# 7. Evaluate the fitted pipeline on the held-out test data.
y_pred = pipeline.predict(X_test)
print("\nAkurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Durasi pemrosesan fit() (detik): 0.27808165550231934

Akurasi: 0.822527832351015

Classification Report:
              precision    recall  f1-score   support

      Comedy       0.80      0.77      0.78       509
       Drama       0.81      0.75      0.78       509
      Horror       0.86      0.94      0.90       509

    accuracy                           0.82      1527
   macro avg       0.82      0.82      0.82      1527
weighted avg       0.82      0.82      0.82      1527

