In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv("spam.csv", encoding="latin-1")[['v1','v2']]
df.columns = ['label', 'text']
df['label'] = df['label'].map({'ham':0, 'spam':1})

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   int64 
 1   text    5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [5]:
df.head(10)

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [7]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [16]:
model = LogisticRegression(class_weight='balanced')
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

In [17]:
y_pred = model.predict(X_test_vec)

print("Predictions:", y_pred[:15])
print("Actual:", y_test.values[:15])


Predictions: [0 0 0 0 1 0 0 0 0 0 0 1 0 0 0]
Actual: [0 0 1 0 1 0 0 0 0 0 0 1 0 0 0]


In [19]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[954  11]
 [ 14 136]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.93      0.91      0.92       150

    accuracy                           0.98      1115
   macro avg       0.96      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [15]:
# custom message to check prediction
messages = ["You have won a free lottery ticket!", 
            "Hi, can we meet tomorrow for coffee?"]

msgs_vec = vectorizer.transform(messages)
print(model.predict(msgs_vec))

[1 0]
