Importação das bibliotecas necessárias

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report, confusion_matrix
import zipfile
import requests
from io import BytesIO


1. Coleta de Dados

In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
response = requests.get(url)
with zipfile.ZipFile(BytesIO(response.content)) as thezip:
    with thezip.open('SMSSpamCollection') as thefile:
        df = pd.read_csv(thefile, sep='\t', header=None, names=['label', 'text'])

df['email_id'] = df.index

df.head(5600)


Unnamed: 0,label,text,email_id
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,2
3,ham,U dun say so early hor... U c already then say...,3
4,ham,"Nah I don't think he goes to usf, he lives aro...",4
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,5567
5568,ham,Will ü b going to esplanade fr home?,5568
5569,ham,"Pity, * was in mood for that. So...any other s...",5569
5570,ham,The guy did some bitching but I acted like i'd...,5570


2. Pré-Processamento

In [6]:
# Convertendo as labels para spam (1) e ham (0)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Limpeza dos textos: remoção de caracteres especiais e conversão para minúsculas
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
df['text'] = df['text'].str.lower()

# Tokenização e remoção de stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(df['text'])

3. Treinamento do Modelo

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)

# Modelo Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Modelo SVM
svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X_train, y_train)

4. Validação e Teste

In [12]:
# Avaliação do modelo Naive Bayes
y_pred_nb = nb_model.predict(X_test)
print("Naive Bayes Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Precision:", precision_score(y_test, y_pred_nb, zero_division=0))
print("Recall:", recall_score(y_test, y_pred_nb, zero_division=0))
print("AUC-ROC:", roc_auc_score(y_test, nb_model.predict_proba(X_test)[:, 1]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

Naive Bayes Results:
Accuracy: 0.9659192825112107
Precision: 1.0
Recall: 0.7449664429530202
AUC-ROC: 0.9825753470340574
Confusion Matrix:
 [[966   0]
 [ 38 111]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.74      0.85       149

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [None]:
# Avaliação do modelo SVM
y_pred_svm = svm_model.predict(X_test)
print("\nSVM Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Precision:", precision_score(y_test, y_pred_svm, zero_division=0))
print("Recall:", recall_score(y_test, y_pred_svm, zero_division=0))
print("AUC-ROC:", roc_auc_score(y_test, svm_model.predict_proba(X_test)[:, 1]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))


SVM Results:
Accuracy: 0.9838565022421525
Precision: 1.0
Recall: 0.8791946308724832
AUC-ROC: 0.991086192282574
Confusion Matrix:
 [[966   0]
 [ 18 131]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.88      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



5. Validação Cruzada (Opcional)

In [22]:
# Usando validação cruzada com 5 folds para o modelo Naive Bayes
scores_nb = cross_val_score(nb_model, X, df['label'], cv=5, scoring='accuracy')
print("\nValidação Cruzada (Naive Bayes):")
print("Acurácia média com validação cruzada (5 folds):", scores_nb.mean())

# Usando validação cruzada com 5 folds para o modelo SVM
scores_svm = cross_val_score(svm_model, X, df['label'], cv=5, scoring='accuracy')
print("\nValidação Cruzada (SVM):")
print("Acurácia média com validação cruzada (5 folds):", scores_svm.mean())


Validação Cruzada (Naive Bayes):
Acurácia média com validação cruzada (5 folds): 0.9623107454251233

Validação Cruzada (SVM):
Acurácia média com validação cruzada (5 folds): 0.9752324673338111
