# 1.	Buatlah model klasifikasi dengan menggunakan SVM untuk data suara, voice.csv.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Read the voice dataset from disk.
voice_df = pd.read_csv("voice.csv")

# Separate the target column ("label") from the remaining feature columns.
y = voice_df["label"]
X = voice_df.drop(columns="label")

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only, then apply the same
# transformation to both partitions so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train a support vector classifier with a linear kernel.
model = SVC(kernel='linear').fit(X_train, y_train)

# Predict on the training and the test partition.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Accuracy of those predictions against the true labels.
acc_train = accuracy_score(y_train, y_train_pred)
acc_test = accuracy_score(y_test, y_test_pred)

# Print the evaluation results, followed by the test-set classification report.
print(f'Hasil akurasi data train: {acc_train * 100:.2f}%')
print(f'Hasil akurasi data test: {acc_test * 100:.2f}%')
print(f'Laporan klasifikasi data test\n {classification_report(y_test, y_test_pred)}')

Hasil akurasi data train: 97.88%
Hasil akurasi data test: 97.06%
Laporan klasifikasi data test
               precision    recall  f1-score   support

      female       0.97      0.97      0.97       452
        male       0.97      0.97      0.97       499

    accuracy                           0.97       951
   macro avg       0.97      0.97      0.97       951
weighted avg       0.97      0.97      0.97       951



# 2.	Buatlah model klasifikasi Multinomial Naive Bayes dengan ketentuan,
1. Menggunakan data spam.csv
2.	Fitur CountVectorizer dengan mengaktifkan stop_words
3.	Evaluasi hasilnya


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Read the spam dataset; the file is decoded as latin-1.
spam_df = pd.read_csv("spam.csv", encoding="latin-1")

# Give the "v1"/"v2" columns descriptive names.
spam_df = spam_df.rename(columns={"v1": "label", "v2": "message"})

# Target labels and raw message text.
y = spam_df["label"]
X = spam_df["message"]

# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Bag-of-words counts with English stop words removed.
# The vocabulary is learned from the training messages only.
bow = CountVectorizer(stop_words='english')
X_train = bow.fit_transform(X_train)
X_test = bow.transform(X_test)

# Fit the Multinomial Naive Bayes classifier on the count features.
mnb = MultinomialNB().fit(X_train, y_train)

# Predict on the training and the test partition.
y_train_pred_bow = mnb.predict(X_train)
y_test_pred_bow = mnb.predict(X_test)

# Accuracy on each partition, plus the test-set classification report.
acc_train_bow = accuracy_score(y_train, y_train_pred_bow)
acc_test_bow = accuracy_score(y_test, y_test_pred_bow)
report_bow = classification_report(y_test, y_test_pred_bow)

# Print the evaluation results.
print(f'Hasil akurasi data train: {acc_train_bow * 100:.2f}%')
print(f'Hasil akurasi data test: {acc_test_bow * 100:.2f}%')
print(f'Laporan klasifikasi data test\n {report_bow}')

Hasil akurasi data train: 99.46%
Hasil akurasi data test: 98.39%
Laporan klasifikasi data test
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.96      0.92      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# 3.	Buatlah model klasifikasi Multinomial Naive Bayes dengan ketentuan,
1.	Menggunakan data spam.csv
2.	Fitur TF-IDF dengan mengaktifkan stop_words
3.	Evaluasi hasilnya dan bandingkan dengan hasil pada Tugas no 2.
4.	Berikan kesimpulan fitur mana yang terbaik pada kasus data spam.csv


In [None]:
# TASK 3
# Relies on X, y, the scikit-learn imports, and the CountVectorizer results
# (acc_train_bow, acc_test_bow, report_bow) already defined in the notebook session.

from sklearn.feature_extraction.text import TfidfVectorizer

# Same split parameters (test_size, random_state) as the CountVectorizer experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features with English stop words removed.
# The vectorizer is fitted on the training messages only.
tf_idf = TfidfVectorizer(stop_words='english')
X_train = tf_idf.fit_transform(X_train)
X_test = tf_idf.transform(X_test)

# Fit a fresh Multinomial Naive Bayes classifier on the TF-IDF features.
mnb = MultinomialNB().fit(X_train, y_train)

# Predict on the training and the test partition.
y_train_pred_tf_idf = mnb.predict(X_train)
y_test_pred_tf_idf = mnb.predict(X_test)

# Accuracy on each partition, plus the test-set classification report.
acc_train_tf_idf = accuracy_score(y_train, y_train_pred_tf_idf)
acc_test_tf_idf = accuracy_score(y_test, y_test_pred_tf_idf)
report_tf_idf = classification_report(y_test, y_test_pred_tf_idf)

# Print the accuracies of both feature extractors for comparison.
print(f'Hasil akurasi data train menggunakan TF-IDF: {acc_train_tf_idf*100:.2f}%')
print(f'Hasil akurasi data train menggunakan CountVectorizer: {acc_train_bow*100:.2f}%')
print(f'Hasil akurasi data test menggunakan TF-IDF: {acc_test_tf_idf*100:.2f}%')
print(f'Hasil akurasi data test menggunakan CountVectorizer: {acc_test_bow*100:.2f}%')

# Print both test-set classification reports.
print('\n')
print(f'Laporan klasifikasi data test menggunakan TF-IDF\n {report_tf_idf}\n')
print(f'Laporan klasifikasi data test menggunakan CountVectorizer\n {report_bow}')

Hasil akurasi data train menggunakan TF-IDF: 98.38%
Hasil akurasi data train menggunakan CountVectorizer: 99.46%
Hasil akurasi data test menggunakan TF-IDF: 96.68%
Hasil akurasi data test menggunakan CountVectorizer: 98.39%


Laporan klasifikasi data test menggunakan TF-IDF
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Laporan klasifikasi data test menggunakan CountVectorizer
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.96      0.92      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

