<a href="https://colab.research.google.com/github/pra08528/CODSOFT_ML_04/blob/main/code_sms_spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the SMS Spam Collection dataset archive from Kaggle with the kaggle CLI.
# NOTE(review): presumably needs Kaggle API credentials configured in the runtime — confirm.
!kaggle datasets download -d uciml/sms-spam-collection-dataset

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
Downloading sms-spam-collection-dataset.zip to /content
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 16.3MB/s]


In [3]:
# Unpack the downloaded archive into the current working directory.
# -o overwrites existing files without prompting, so re-running this cell
# (e.g. Restart & Run All when spam.csv is already present) does not stall
# on unzip's interactive "replace?" question.
!unzip -o sms-spam-collection-dataset.zip

Archive:  sms-spam-collection-dataset.zip
  inflating: spam.csv                


In [7]:
import zipfile
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

# Step 1: Extract the Dataset
# NOTE: these are absolute Colab paths; adjust them when running elsewhere.
zip_file_path = '/content/sms-spam-collection-dataset.zip'
extract_folder_path = '/content/sms-spam-collection-dataset/'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder_path)

# Verify the extracted files
extracted_files = os.listdir(extract_folder_path)
print("Extracted files:", extracted_files)

# Step 2: Load the Dataset
csv_file_name = 'spam.csv'  # File name
csv_file_path = os.path.join(extract_folder_path, csv_file_name)

# Fail fast if the expected CSV is missing from the archive.
if not os.path.isfile(csv_file_path):
    raise FileNotFoundError(f"The file {csv_file_path} does not exist.")

# Read errors are deliberately NOT caught here: printing the error and
# carrying on would leave `df` undefined and make the next step fail with an
# unrelated NameError instead of the real cause.
df = (
    pd.read_csv(csv_file_path, encoding='ISO-8859-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

print(df.head())  # Inspect the first few rows

# Step 3: Data Preprocessing
# Encode the labels as integers: spam -> 1, ham -> 0.
label_codes = df['label'].map({'spam': 1, 'ham': 0})

# Series.map yields NaN for any value missing from the mapping; stop here
# rather than silently feeding NaN targets to the classifier.
unmapped = label_codes.isna()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values in dataset: {unexpected}")

df['label'] = label_codes.astype(int)

X = df['message']
y = df['label']

# NOTE(review): the classes are imbalanced; consider passing stratify=y so the
# train and test sets keep the same spam/ham ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Feature Extraction
# Fit the vocabulary/IDF weights on the training split only, then reuse them
# for the test split so no test information leaks into the features.
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Model Training
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Step 6: Evaluation
y_pred = model.predict(X_test_tfidf)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

# Optional: Save and Load the Model
# The classifier only accepts TF-IDF features produced by the fitted
# vectorizer, so the vectorizer is persisted next to it; without it the saved
# model cannot score raw text messages.
joblib.dump(model, 'spam_sms_model.pkl')
joblib.dump(vectorizer, 'spam_sms_vectorizer.pkl')
print("Model saved as 'spam_sms_model.pkl'")

# Load the model (for demonstration purposes)
# SECURITY: joblib.load unpickles arbitrary objects; only load trusted files.
loaded_model = joblib.load('spam_sms_model.pkl')
loaded_vectorizer = joblib.load('spam_sms_vectorizer.pkl')


Extracted files: ['spam.csv']
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Accuracy: 0.9668161434977578
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Model saved as 'spam_sms_model.pkl'
