Step 1: Load Dataset

In [None]:
import pandas as pd

# Read the raw SMS dataset from the working directory (change DATA_FILE to
# point at a different copy). The file is decoded as latin-1 -- presumably
# because it is not valid UTF-8; confirm against the actual file.
DATA_FILE = "spam.csv"
df = pd.read_csv(DATA_FILE, encoding='latin-1')

# Preview the first five rows as a sanity check on the parse.
df.head(5)


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Step 2: Check and Clean Data

In [None]:
# Show the raw column names before changing anything.
print(df.columns)

# Give the two columns we need readable names and discard everything else.
# `rename` ignores keys that are absent, so a file that already uses
# 'label' / 'message' passes through unchanged.
column_names = {"v1": "label", "v2": "message"}
df = df.rename(columns=column_names)[list(column_names.values())]

# Count missing values per remaining column.
print(df.isnull().sum())


Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
label      0
message    0
dtype: int64


Step 3: Prepare Data for Model
Convert the labels to numbers (spam=1, ham=0):

In [None]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
label_to_num = {'ham': 0, 'spam': 1}
label_num = df['label'].map(label_to_num)

# Series.map yields NaN for any value that is not a key of the mapping, which
# would silently turn the target into floats-with-gaps and only fail later
# during model fitting. Stop here instead and name the offending values.
unmapped = label_num.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values (expected 'ham' or 'spam'): {bad_values}")

# Attach the encoded column on a new frame rather than mutating in place, so
# re-running this cell always produces the same result.
df = df.assign(label_num=label_num)


Split data into features and target:

In [None]:
# Features: the SMS text. Target: the numeric label column (0 or 1).
X, y = df['message'], df['label_num']


Split data into train and test sets:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Step 4: Build Pipeline with TF-IDF and Logistic Regression

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Two-stage pipeline: TF-IDF turns each message into a feature vector, then
# logistic regression classifies it. Both estimators use their defaults.
vectorizer = TfidfVectorizer()
classifier = LogisticRegression()
model = make_pipeline(vectorizer, classifier)

# Fit both stages on the training split only.
model.fit(X_train, y_train)


Step 5: Test & Evaluate the Model

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Score the held-out messages with the fitted pipeline.
y_pred = model.predict(X_test)

# Compute the metrics first, then report them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Accuracy: 0.967713004484305

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.99      0.77      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



Step 6: Save your predictions

In [None]:
# Put each test message next to its true and predicted label. `y_test` is
# attached by index; `y_pred` is attached in the order given (it was produced
# from X_test, so the order corresponds).
result = X_test.to_frame().assign(actual=y_test, predicted=y_pred)

# Write the table without the index column.
output_path = "spam_predictions.csv"
result.to_csv(output_path, index=False)
print("✅ Predictions saved to spam_predictions.csv")


✅ Predictions saved to spam_predictions.csv
