In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
#!/bin/bash
!kaggle datasets download hamzajabbarkhan/sms-data-labelled-spam-and-non-spam

Dataset URL: https://www.kaggle.com/datasets/hamzajabbarkhan/sms-data-labelled-spam-and-non-spam
License(s): unknown
Downloading sms-data-labelled-spam-and-non-spam.zip to /content
  0% 0.00/206k [00:00<?, ?B/s]
100% 206k/206k [00:00<00:00, 339MB/s]


In [7]:
!unzip /content/sms-data-labelled-spam-and-non-spam.zip

Archive:  /content/sms-data-labelled-spam-and-non-spam.zip
  inflating: SMSSpamCollection       


In [8]:
#dataset is separated by tab, so we use seperator='\t'
data = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'message'])

### Label:
**Spam**: message is spam <br>
**ham**: message is not spam

In [9]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


**convert our labels to binary variables**, 0 to represent 'ham'(i.e. not spam) and 1 to represent 'spam'

In [11]:
#use '1' for spam and '0' for not spam
data['label'] = data.label.map({'ham':0, 'spam':1})
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['message'],
                                                    data['label'],
                                                    test_size =0.2,
                                                    random_state=42)


print('Number of rows in the total set: {}'.format(data.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4457
Number of rows in the test set: 1115


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

In [25]:
training_data = count_vector.fit_transform(X_train).toarray()
testing_data = count_vector.transform(X_test).toarray()

In [26]:
frequency_matrix = pd.DataFrame(training_data,
                                columns = count_vector.get_feature_names_out())
frequency_matrix.head()

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,02,0207,...,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
testing_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [28]:
clf = LogisticRegression(random_state=0).fit(training_data, y_train)

In [29]:
#predict the value
predictions = clf.predict(testing_data)

In [30]:
predictions

array([0, 0, 0, ..., 0, 0, 0])

### Result

In [31]:
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))
print('\nConfusion Matrix :\n', confusion_matrix(y_test, predictions))

Accuracy score:  0.9883408071748879
Precision score:  1.0
Recall score:  0.912751677852349
F1 score:  0.9543859649122807

Confusion Matrix :
 [[966   0]
 [ 13 136]]
