<a href="https://colab.research.google.com/github/sandhiya-git-hub/AI-Tasks/blob/main/Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Single seed shared by every stochastic step in this notebook (sampling,
# shuffling, the train/test split and the classifier) so reruns are repeatable.
RANDOM_STATE: int = 44

In [None]:
# Remote tab-separated file; it is read as headerless and the two columns are
# named here.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
data = pd.read_csv(
    url,
    sep='\t',
    header=None,                 # treat the first line as data, not column names
    names=['label', 'message'],
)

In [None]:
# Encode the text label as an integer target: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
data['label_num'] = data['label'].map(label_codes)

# Series.map leaves NaN for any value that is not a key of the dict. Fail here
# with a clear message instead of letting NaN targets reach the stratified
# split / model fit further down.
unmapped_labels = data.loc[data['label_num'].isna(), 'label'].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected label value(s) {unmapped_labels}; expected one of {sorted(label_codes)}."
    )

In [None]:
# Size of the subsample drawn from the full dataset in the next step.
TARGET_N = 2000

# Abort early when the loaded dataset is too small to supply TARGET_N rows.
has_enough_rows = len(data) >= TARGET_N
if not has_enough_rows:
    raise ValueError(f"Dataset has only {len(data)} messages — need at least {TARGET_N}.")

In [None]:
# Draw a TARGET_N-row subsample whose spam/ham split mirrors the full dataset.
prop = data['label'].value_counts(normalize=True)   # fraction of rows per label
n_spam = int(round(prop['spam'] * TARGET_N))
n_ham = TARGET_N - n_spam                           # remainder, so both counts total TARGET_N

is_spam = data['label'] == 'spam'
is_ham = data['label'] == 'ham'
sampled_spam = data[is_spam].sample(n=n_spam, random_state=RANDOM_STATE)
sampled_ham = data[is_ham].sample(n=n_ham, random_state=RANDOM_STATE)

# Stack ham then spam, shuffle every row (frac=1), and renumber the index 0..N-1.
combined = pd.concat([sampled_ham, sampled_spam])
sampled = combined.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

In [None]:
# Features are the raw message text; the target is the 0/1 encoded label.
X, y = sampled['message'], sampled['label_num']

# Hold out exactly 500 messages for testing, stratified on the label so both
# partitions keep the same spam ratio.
X_rest, X_test, y_rest, y_test = train_test_split(
    X,
    y,
    test_size=500,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Everything that was not held out is used for training.
X_train, y_train = X_rest, y_rest

In [None]:
# Sanity-check the split: partition sizes, then the share of spam in each
# (the mean of a 0/1 target is the fraction of 1s).
n_train, n_test = len(X_train), len(X_test)
print("Train size:", n_train, "Test size:", n_test)

train_spam_ratio = y_train.mean()
test_spam_ratio = y_test.mean()
print("Spam ratio in train:", train_spam_ratio, " Spam ratio in test:", test_spam_ratio)

Train size: 1500 Test size: 500
Spam ratio in train: 0.134  Spam ratio in test: 0.134


In [None]:
# TF-IDF bag-of-words features: English stop words removed, vocabulary capped
# at 3000 terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=3000,
)

# Fit the vocabulary/IDF weights on the training messages only, then apply the
# fitted vectorizer (without refitting) to the test messages.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Train the logistic-regression classifier on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(
    max_iter=2000,
    random_state=RANDOM_STATE,
).fit(X_train_tfidf, y_train)

# Leave the fitted estimator as the cell's final expression so its repr is shown.
model

In [None]:
LogisticRegression(max_iter=2000, random_state=44)

In [None]:
# Score the held-out messages and report the fraction classified correctly.
y_pred = model.predict(X_test_tfidf)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("\nModel Accuracy:", acc)


Model Accuracy: 0.916


In [None]:
# How many held-out messages get an individual probability printed below.
N_SAMPLE_MESSAGES = 400

print("\nCategories: ['Not Spam', 'Spam']\n")

# Renumber the test messages 0..N-1 and take the first N_SAMPLE_MESSAGES of
# them; .iloc makes the positional slice explicit.
sample_messages = X_test.reset_index(drop=True).iloc[:N_SAMPLE_MESSAGES]

# X_test_tfidf already holds the TF-IDF row for every message in X_test, in the
# same order, so slice it instead of vectorizing the same text a second time.
sample_vectors = X_test_tfidf[:N_SAMPLE_MESSAGES]

# One row per message; column 1 is the probability of label 1 (spam).
probabilities = model.predict_proba(sample_vectors)


Categories: ['Not Spam', 'Spam']



In [None]:
# Print the spam probability and the resulting call for each sampled message,
# numbering them from 1.
for i, prob in enumerate(probabilities, start=1):
    spam_prob = prob[1]          # column 1 = probability of the spam class
    # A message is called spam only when its probability is strictly above 0.5.
    if spam_prob > 0.5:
        prediction = "SPAM"
    else:
        prediction = "NOT SPAM"
    print(f"Email {i}: Sigmoid Output = {spam_prob:.4f} ({spam_prob*100:.1f}%) -> Prediction: {prediction}")

Email 1: Sigmoid Output = 0.0685 (6.9%) -> Prediction: NOT SPAM
Email 2: Sigmoid Output = 0.0779 (7.8%) -> Prediction: NOT SPAM
Email 3: Sigmoid Output = 0.0510 (5.1%) -> Prediction: NOT SPAM
Email 4: Sigmoid Output = 0.0758 (7.6%) -> Prediction: NOT SPAM
Email 5: Sigmoid Output = 0.0734 (7.3%) -> Prediction: NOT SPAM
Email 6: Sigmoid Output = 0.0381 (3.8%) -> Prediction: NOT SPAM
Email 7: Sigmoid Output = 0.1679 (16.8%) -> Prediction: NOT SPAM
Email 8: Sigmoid Output = 0.0775 (7.7%) -> Prediction: NOT SPAM
Email 9: Sigmoid Output = 0.0520 (5.2%) -> Prediction: NOT SPAM
Email 10: Sigmoid Output = 0.0953 (9.5%) -> Prediction: NOT SPAM
Email 11: Sigmoid Output = 0.0758 (7.6%) -> Prediction: NOT SPAM
Email 12: Sigmoid Output = 0.0451 (4.5%) -> Prediction: NOT SPAM
Email 13: Sigmoid Output = 0.0607 (6.1%) -> Prediction: NOT SPAM
Email 14: Sigmoid Output = 0.0520 (5.2%) -> Prediction: NOT SPAM
Email 15: Sigmoid Output = 0.0600 (6.0%) -> Prediction: NOT SPAM
Email 16: Sigmoid Output = 0.0290

In [None]:
# Show the first few sampled messages next to their true label and the
# model's spam probability.
print("\n" + "="*60 + "\nSample test message, true label, spam-probability:\n")

# Preview at most 5 messages, and never more than were actually sampled, so a
# short sample cannot trigger an out-of-range lookup.
n_preview = min(5, len(sample_messages))
for i in range(n_preview):
    msg = sample_messages.iloc[i]
    # .iloc is positional, so y_test can be read directly; rebuilding a
    # reset-index copy of it on every iteration is unnecessary.
    true_label = "SPAM" if y_test.iloc[i]==1 else "NOT SPAM"
    spam_prob = probabilities[i][1]
    print(f"Message {i+1} (True: {true_label}, Prob(spam)={spam_prob:.3f}):\n  {msg}\n")


Sample test message, true label, spam-probability:

Message 1 (True: NOT SPAM, Prob(spam)=0.069):
  Thanx 4 sending me home...

Message 2 (True: NOT SPAM, Prob(spam)=0.078):
  Then i buy.

Message 3 (True: NOT SPAM, Prob(spam)=0.051):
  His frens go then he in lor. Not alone wif my mum n sis lor.

Message 4 (True: NOT SPAM, Prob(spam)=0.076):
  What year. And how many miles.

Message 5 (True: NOT SPAM, Prob(spam)=0.073):
  What can i do? Might accidant tookplace between somewhere ghodbandar rd. Traffic moves slovely. So plz slip &amp; don't worry.

