In [106]:
import pandas as pd 
import matplotlib as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import HashingVectorizer

Using matplotlib backend: tkagg


In [107]:
df = pd.read_csv("spam.csv", encoding="latin1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [108]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [109]:
cleandf = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
cleandf.columns=["spam or not", "message"]
cleandf["spam or not"] = cleandf["spam or not"].replace({"ham":0, "spam":1})
pd.set_option('future.no_silent_downcasting', True)
cleandf = cleandf.dropna();
cleandf.head()


Unnamed: 0,spam or not,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [110]:
print(cleandf.shape)

(5572, 2)


In [111]:
vectorizer = HashingVectorizer(n_features=1024)
hashed_features = vectorizer.transform(cleandf["message"])
hashed_features_dense = hashed_features.toarray()
print(hashed_features_dense.shape)

(5572, 1024)


In [112]:
cleandf.head(10)

Unnamed: 0,spam or not,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [113]:
x = hashed_features_dense
y = cleandf["spam or not"]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


In [114]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [115]:
print(y_train.unique())
print(y_train.dtype)
print(y_test.unique())
print(y_test.dtype)


[1 0]
object
[0 1]
object


In [116]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)
print(y_train.dtype)
print(y_test.dtype)

int64
int64


In [117]:
clf = LogisticRegression(random_state=0, max_iter=200)
clf.fit(X_train_scaled, y_train)


In [118]:
y_pred = clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.97
