# **Tugas 2**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# ---------------------------------------------------------------------------
# 1. Load and prepare the data
# ---------------------------------------------------------------------------
# Overview of this cell: read the spam CSV, encode the labels, split into
# train/test sets, train one Multinomial Naive Bayes model on CountVectorizer
# features and one on TF-IDF features, then print a side-by-side comparison.
# Everything runs inside one try block so that a missing file (or any other
# failure) is reported as a short message instead of a traceback.
try:
    # This spam.csv file generally needs the 'latin-1' encoding
    file_path = '/content/spam.csv'
    df = pd.read_csv(file_path, encoding='latin-1', usecols=['v1', 'v2'])

    # Rename the columns so they are easier to read
    df.columns = ['label', 'message']

    # Label encoding: ham = 0, spam = 1
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})

    # Display names for the encoded classes, ordered by label value (0, 1).
    # Used for every classification report so text and dict reports agree.
    class_names = ['ham', 'spam']

    print(f"Dataset berhasil dimuat. Total {df.shape[0]} baris.")
    print(df.head())

    # Separate the features (X) from the target (y)
    X = df['message']
    y = df['label']

    # Split into training and test data (80% train, 20% test); stratify keeps
    # the ham/spam ratio the same in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"\nData dibagi: {len(X_train)} latih, {len(X_test)} uji.")

    # ---------------------------------------------------------------------------
    # TASK 1: Model with CountVectorizer
    # ---------------------------------------------------------------------------
    print("\n--- TUGAS 1: CountVectorizer + Multinomial Naive Bayes ---")

    # 1. Initialise the CountVectorizer (with English stop words removed)
    count_vec = CountVectorizer(stop_words='english')

    # 2. Fit on and transform the training data
    X_train_cv = count_vec.fit_transform(X_train)

    # 3. Transform the test data (transform ONLY, never fit on test data)
    X_test_cv = count_vec.transform(X_test)

    # 4. Train the Multinomial Naive Bayes model
    model_cv = MultinomialNB()
    model_cv.fit(X_train_cv, y_train)

    # 5. Predict on the test set
    y_pred_cv = model_cv.predict(X_test_cv)

    # 6. Evaluate the model
    accuracy_cv = accuracy_score(y_test, y_pred_cv)
    report_cv = classification_report(y_test, y_pred_cv, target_names=class_names)
    cm_cv = confusion_matrix(y_test, y_pred_cv)

    print(f"Akurasi (CountVectorizer): {accuracy_cv:.4f}")
    print("Laporan Klasifikasi (CountVectorizer):")
    print(report_cv)
    print("Confusion Matrix (CountVectorizer):")
    print(cm_cv)

    # ---------------------------------------------------------------------------
    # TASK 2: Model with TF-IDF Vectorizer
    # ---------------------------------------------------------------------------
    print("\n--- TUGAS 2: TF-IDF Vectorizer + Multinomial Naive Bayes ---")

    # 1. Initialise the TfidfVectorizer (with English stop words removed)
    tfidf_vec = TfidfVectorizer(stop_words='english')

    # 2. Fit on and transform the training data
    X_train_tfidf = tfidf_vec.fit_transform(X_train)

    # 3. Transform the test data
    X_test_tfidf = tfidf_vec.transform(X_test)

    # 4. Train the Multinomial Naive Bayes model
    model_tfidf = MultinomialNB()
    model_tfidf.fit(X_train_tfidf, y_train)

    # 5. Predict on the test set
    y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

    # 6. Evaluate the model
    accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
    report_tfidf = classification_report(y_test, y_pred_tfidf, target_names=class_names)
    cm_tfidf = confusion_matrix(y_test, y_pred_tfidf)

    print(f"Akurasi (TF-IDF): {accuracy_tfidf:.4f}")
    print("Laporan Klasifikasi (TF-IDF):")
    print(report_tfidf)
    print("Confusion Matrix (TF-IDF):")
    print(cm_tfidf)

    # ---------------------------------------------------------------------------
    # TASK 3: Comparison and conclusion
    # ---------------------------------------------------------------------------
    print("\n--- TUGAS 3: Perbandingan Hasil ---")
    print("======================================================")
    print(" METRIK \t\t| CountVectorizer \t| TF-IDF")
    print("------------------------------------------------------")
    print(f" Akurasi \t\t| {accuracy_cv:.4f} \t\t| {accuracy_tfidf:.4f}")

    # Build the dict-form report once per model. target_names must be passed
    # here as well: without it the dict is keyed by the stringified numeric
    # labels ('0' / '1'), so looking up 'spam' would raise KeyError.
    spam_metrics_cv = classification_report(
        y_test, y_pred_cv, target_names=class_names, output_dict=True
    )['spam']
    spam_metrics_tfidf = classification_report(
        y_test, y_pred_tfidf, target_names=class_names, output_dict=True
    )['spam']

    # F1-score for the 'spam' class (label 1)
    f1_cv_spam = spam_metrics_cv['f1-score']
    f1_tfidf_spam = spam_metrics_tfidf['f1-score']

    print(f" F1-Score (Spam) \t| {f1_cv_spam:.4f} \t\t| {f1_tfidf_spam:.4f}")

    # Recall for the 'spam' class (label 1)
    recall_cv_spam = spam_metrics_cv['recall']
    recall_tfidf_spam = spam_metrics_tfidf['recall']

    print(f" Recall (Spam) \t\t| {recall_cv_spam:.4f} \t\t| {recall_tfidf_spam:.4f}")

    # Precision for the 'spam' class (label 1)
    precision_cv_spam = spam_metrics_cv['precision']
    precision_tfidf_spam = spam_metrics_tfidf['precision']

    print(f" Precision (Spam) \t| {precision_cv_spam:.4f} \t\t| {precision_tfidf_spam:.4f}")
    print("======================================================")


except FileNotFoundError:
    # file_path is assigned before read_csv runs, so it is always bound here.
    print(f"Error: File '{file_path}' tidak ditemukan.")
except Exception as e:
    # Top-level notebook boundary: report any other failure as a message.
    print(f"Terjadi kesalahan: {e}")

Dataset berhasil dimuat. Total 5572 baris.
   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...

Data dibagi: 4457 latih, 1115 uji.

--- TUGAS 1: CountVectorizer + Multinomial Naive Bayes ---
Akurasi (CountVectorizer): 0.9839
Laporan Klasifikasi (CountVectorizer):
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       966
        spam       0.96      0.92      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix (CountVectorizer):
[[960   6]
 [ 12 137]]

--- TUGAS 2: TF-IDF Vectorizer + Multinom