### **Tugas 2**

A. Buatlah model klasifikasi Multinomial Naive Bayes dengan ketentuan:

1. Menggunakan data spam.csv

2. Fitur CountVectorizer dengan mengaktifkan stop_words

3. Evaluasi hasilnya

In [None]:
# Colab-only step: open the browser upload widget so the dataset file can be
# placed in the runtime's working directory before it is read.
from google.colab import files
# NOTE(review): `uploaded` presumably maps each uploaded filename to its raw
# bytes -- confirm against the google.colab documentation before relying on it.
uploaded = files.upload()

Saving spam.csv to spam.csv


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw SMS spam dataset. The file is decoded as latin-1 instead of
# pandas' default encoding.
df = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)


In [None]:
# Step 1: keep only the first two columns and discard everything after them.
# The columns to remove are passed explicitly as labels via `columns=`; the
# previous form handed a whole DataFrame slice to `drop`, which only worked
# because iterating a DataFrame yields its column names.
df = df.drop(columns=df.columns[2:])

In [None]:
# Step 2: give the two remaining columns descriptive names
# (v1 holds the label, v2 holds the SMS text).
df = df.rename(
    {'v1': 'Labels', 'v2': 'SMS'},
    axis='columns',
)

In [None]:
# Step 3: encode the labels as integers: 'ham' -> 0, 'spam' -> 1.
new_labels = {'spam': 1, 'ham': 0}

# Series.map turns every value that is not a key of the mapping into NaN.
# That happens silently when this cell is executed a second time (the column
# already holds 0/1) or when the file contains an unexpected label, and the
# error would only surface later during model fitting. Validate before
# overwriting the column so the original values are kept on failure.
encoded_labels = df['Labels'].map(new_labels)
if encoded_labels.isna().any():
    raise ValueError(
        "Column 'Labels' contains values other than 'spam'/'ham' "
        "(was this cell already executed?). Reload the data before encoding."
    )
df['Labels'] = encoded_labels

In [None]:
# Step 4: separate the inputs (X: SMS text) from the targets (y: encoded label),
# taking the underlying array of each column.
X, y = df['SMS'].values, df['Labels'].values

In [None]:
# --- C. Train/test split ---
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

# Report how many rows ended up in each partition.
print(f"Data Training: {X_train.shape[0]} baris")
print(f"Data Testing: {X_test.shape[0]} baris")

Data Training: 4457 baris
Data Testing: 1115 baris


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# --- A. Build the bag-of-words extractor ---
# stop_words='english' removes common English words such as 'the', 'is', 'a'.
vectorizer = CountVectorizer(stop_words='english')

# --- B. Training features ---
# fit_transform learns the vocabulary from the training messages and returns
# their word-count matrix in a single call.
X_train_vec = vectorizer.fit_transform(X_train)

# --- C. Test features ---
# Only transform is applied here, so the test messages are encoded with the
# vocabulary learned from the training data and nothing from the test set
# leaks into the features.
X_test_vec = vectorizer.transform(X_test)

# Summary of the feature space.
print(f"Jumlah Fitur (Kosakata Unik): {len(vectorizer.get_feature_names_out())}")
print(f"Dimensi Vektor Training: {X_train_vec.shape}")

Jumlah Fitur (Kosakata Unik): 7466
Dimensi Vektor Training: (4457, 7466)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Create the classifier with default settings and train it on the vectorized
# training data. fit() returns the estimator itself, so construction and
# training are chained.
mnb = MultinomialNB().fit(X_train_vec, y_train)

print("Model Multinomial Naive Bayes berhasil dilatih.")

Model Multinomial Naive Bayes berhasil dilatih.


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# --- A. Predict the held-out messages ---
y_pred_test = mnb.predict(X_test_vec)

# --- B. Overall accuracy on the test set ---
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

# --- C. Per-class precision / recall / F1 ---
# target_names follows the label encoding: 0 -> 'ham', 1 -> 'spam'.
report = classification_report(
    y_test,
    y_pred_test,
    target_names=['ham', 'spam'],
)

# Print a small banner with the accuracy, followed by the full report.
print('\n======================================================')
print(f'   HASIL AKURASI DATA UJI: {acc_test:.4f} ({acc_test*100:.2f}%)')
print('======================================================\n')
print("Classification Report:")
print(report)


   HASIL AKURASI DATA UJI: 0.9830 (98.30%)

Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       954
        spam       0.98      0.90      0.94       161

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


In [None]:
# --- Data loading and preprocessing for the TF-IDF experiment ---
# Re-read the CSV from disk, remove the three unnamed extra columns and rename
# v1/v2 to label/message.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# The integer target goes into a separate column; the text label is kept.
df['label_encoded'] = df['label'].map({'ham': 0, 'spam': 1})

# X and y are pandas Series here (message text and encoded label).
X, y = df['message'], df['label_encoded']

# 80/20 split with a fixed seed (random_state=42).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# --- 1. Model with TF-IDF (Term Frequency-Inverse Document Frequency) features ---
print("--- MODEL MULTINOMIAL NAIVE BAYES DENGAN FITUR TF-IDF ---")

# TF-IDF extractor with English stop-word removal enabled.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary/weights on the training messages only, then encode the
# test messages with that fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create and train the classifier; fit() returns the estimator itself.
nb_model_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

# Predictions on the test set; they are scored in the next cell.
y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)


--- MODEL MULTINOMIAL NAIVE BAYES DENGAN FITUR TF-IDF ---


In [None]:

# Record the TF-IDF results.
# The report is kept as a dict (output_dict=True) so individual metrics can be
# looked up by class name later; the accuracy is stored separately.
report_tfidf = classification_report(
    y_test,
    y_pred_tfidf,
    target_names=['ham', 'spam'],
    output_dict=True,
)
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_tfidf)

# Show the accuracy and the human-readable (text) version of the same report.
print(f"Akurasi (TF-IDF): {accuracy_tfidf}")
print("\nClassification Report (TF-IDF):")
print(
    classification_report(
        y_test,
        y_pred_tfidf,
        target_names=['ham', 'spam'],
    )
)


Akurasi (TF-IDF): 0.9668161434977578

Classification Report (TF-IDF):
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [None]:

# --- 2. CountVectorizer baseline for the comparison ---
# The baseline metrics are recomputed here on the same train/test split that
# the TF-IDF model uses, instead of being typed in from an earlier run. The
# previous hard-coded literals were rounded to two decimals and could not be
# reproduced from the cells of this notebook, so the table could silently go
# stale whenever the data or the split changed.
# Imported locally so this cell does not depend on an earlier section's import.
from sklearn.feature_extraction.text import CountVectorizer

# Separate names are used so objects from other cells are not overwritten.
cv_vectorizer = CountVectorizer(stop_words='english')
X_train_cv = cv_vectorizer.fit_transform(X_train)  # learn vocabulary on train only
X_test_cv = cv_vectorizer.transform(X_test)        # reuse the fitted vocabulary

nb_model_cv = MultinomialNB()
nb_model_cv.fit(X_train_cv, y_train)
y_pred_cv = nb_model_cv.predict(X_test_cv)

# Same metric extraction as for TF-IDF: accuracy plus the spam-class
# precision/recall taken from the dict form of the classification report.
report_cv = classification_report(
    y_test, y_pred_cv, target_names=['ham', 'spam'], output_dict=True
)
accuracy_cv = accuracy_score(y_test, y_pred_cv)
precision_spam_cv = report_cv['spam']['precision']
recall_spam_cv = report_cv['spam']['recall']

# Spam-class metrics of the TF-IDF model, read from its stored report dict.
precision_spam_tfidf = report_tfidf['spam']['precision']
recall_spam_tfidf = report_tfidf['spam']['recall']

# --- 3. Side-by-side comparison ---
print("\n=========================================================================")
print("                   PERBANDINGAN HASIL EVALUASI")
print("=========================================================================")
comparison_data = {
    'Metode Fitur': ['CountVectorizer', 'TF-IDF'],
    'Akurasi': [accuracy_cv, accuracy_tfidf],
    'Precision (Spam)': [precision_spam_cv, precision_spam_tfidf],
    'Recall (Spam)': [recall_spam_cv, recall_spam_tfidf]
}
comparison_df = pd.DataFrame(comparison_data)
print(comparison_df)


                   PERBANDINGAN HASIL EVALUASI
      Metode Fitur   Akurasi  Precision (Spam)  Recall (Spam)
0  CountVectorizer  0.983857              0.96       0.920000
1           TF-IDF  0.966816              1.00       0.753333
