**DENGAN OPTIMASI**

In [3]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Tuned variant: TF-IDF + Logistic Regression with hyperparameters chosen by
# GridSearchCV, then evaluated on a held-out test split.

# 1. Load the dataset.
df = pd.read_csv('dataset_final_tes.csv')

# 2. Build a single text field from the 'name', 'description' and 'themes'
#    columns. Missing values are replaced with '' first so that one empty
#    column does not turn the whole concatenated string into NaN.
df['text'] = (
    df['name'].fillna('') + ' ' +
    df['description'].fillna('') + ' ' +
    df['themes'].fillna('')
)

# 3. Prepare the features (X) and the target (y). The 'genre' labels are
#    integer-encoded; `le` is kept so the class names can be shown in the report.
X = df['text']
le = LabelEncoder()
y = le.fit_transform(df['genre'])

# 4. Split into training and test sets (80%-20%), stratified on the label,
#    with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Build the pipeline: TF-IDF vectorizer + Logistic Regression.
#    The max_features value given here is only a starting value; the grid
#    below overrides it for every candidate.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),  # unigrams and bigrams
    ('clf', LogisticRegression(max_iter=1000))
])

# 6. Hyperparameter grid to search (2 x 3 = 6 candidates).
param_grid = {
    'tfidf__max_features': [5000, 10000],
    'clf__C': [0.1, 1, 10]  # inverse regularisation strength: smaller C = stronger regularisation
}

# 7. Grid search with 5-fold cross validation, scored on accuracy.
#    n_jobs=-1 lets scikit-learn run the fits in parallel.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Time the search with perf_counter(): it is monotonic and high-resolution,
# so the measured interval cannot be distorted by system clock adjustments
# (which can happen with time.time()). The values are only meaningful as a
# difference, not as timestamps.
start_time = time.perf_counter()

# Run the hyperparameter search.
grid_search.fit(X_train, y_train)

# Stop the timer and compute the elapsed time in seconds.
end_time = time.perf_counter()
processing_duration = end_time - start_time

# Report how long the grid search took.
print("Durasi pemrosesan Grid Search (detik):", processing_duration)

# Report the best parameter combination found.
print("Best parameters:", grid_search.best_params_)

# 8. Evaluate on the test split using the best estimator found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nAkurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Durasi pemrosesan Grid Search (detik): 25.629907608032227
Best parameters: {'clf__C': 1, 'tfidf__max_features': 10000}

Akurasi: 0.8356254092992796

Classification Report:
              precision    recall  f1-score   support

      Comedy       0.81      0.78      0.80       509
       Drama       0.80      0.81      0.80       509
      Horror       0.90      0.92      0.91       509

    accuracy                           0.84      1527
   macro avg       0.84      0.84      0.84      1527
weighted avg       0.84      0.84      0.84      1527



**TANPA OPTIMASI**

In [4]:
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Baseline variant: the same TF-IDF + Logistic Regression pipeline, but with no
# hyperparameter search, for comparison against the tuned variant.

# 1. Load the dataset.
df = pd.read_csv('dataset_final_tes.csv')

# 2. Build a single text field from the 'name', 'description' and 'themes'
#    columns. Missing values are replaced with '' first so that one empty
#    column does not turn the whole concatenated string into NaN.
df['text'] = (
    df['name'].fillna('') + ' ' +
    df['description'].fillna('') + ' ' +
    df['themes'].fillna('')
)

# 3. Prepare the features (X) and the target (y). The 'genre' labels are
#    integer-encoded; `le` is kept so the class names can be shown in the report.
X = df['text']
le = LabelEncoder()
y = le.fit_transform(df['genre'])

# 4. Split into training and test sets (80%-20%), stratified on the label,
#    with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Build the pipeline: TF-IDF vectorizer + Logistic Regression, both left at
#    their default settings apart from the raised iteration cap.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),            # default vectorizer parameters
    ('clf', LogisticRegression(max_iter=1000))  # default C=1.0
])

# 6. Time the fit with perf_counter(): it is monotonic and high-resolution,
#    so the measured interval cannot be distorted by system clock adjustments
#    (which can happen with time.time()).
start_time = time.perf_counter()
pipeline.fit(X_train, y_train)
end_time = time.perf_counter()
print("Durasi pemrosesan fit() (detik):", end_time - start_time)

# 7. Evaluate the fitted pipeline on the test split.
y_pred = pipeline.predict(X_test)
print("\nAkurasi:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Durasi pemrosesan fit() (detik): 1.6893579959869385

Akurasi: 0.8356254092992796

Classification Report:
              precision    recall  f1-score   support

      Comedy       0.80      0.78      0.79       509
       Drama       0.80      0.81      0.80       509
      Horror       0.90      0.92      0.91       509

    accuracy                           0.84      1527
   macro avg       0.83      0.84      0.84      1527
weighted avg       0.83      0.84      0.84      1527

