In [None]:
# SPAM DETECTION
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix 

In [None]:
# Load the SMS Spam Collection: one message per line, stored as "<label>\t<message>",
# so the separator is a tab (sep='\t').
import csv
import io
from google.colab import files

# Opens Colab's upload widget; the result is indexed by uploaded file name and
# holds each file's raw bytes.
uploaded = files.upload()

# The file is plain tab-separated text, not quoted CSV, so quote handling is
# switched off: otherwise a stray double quote inside a message can make the
# parser fold several lines into a single row.
data = pd.read_csv(io.BytesIO(uploaded['SMSSpamCollection']),
                   sep='\t',
                   names=['label', 'message'],
                   quoting=csv.QUOTE_NONE)

Saving SMSSpamCollection to SMSSpamCollection (3)


### Labels
**spam**: the message is spam <br>
**ham**: the message is not spam

In [None]:
# Peek at the first five rows to confirm the two columns were parsed.
data.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Summarise the DataFrame: column dtypes, non-null counts and memory usage.
data.info()
# A bare expression in the middle of a cell is evaluated and then discarded, so
# the shape has to be printed explicitly to appear in the output.
print(data.shape)
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


**Convert the labels to binary variables**: 0 represents 'ham' (i.e. not spam) and 1 represents 'spam'.

In [None]:
# Encode the string labels as integers: 0 for 'ham' (not spam), 1 for 'spam'.
# The identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run: without
# them a second run would map the already-encoded values to NaN.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
data['label'] = data['label'].map(label_codes)
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Preprocessing: hold out 20% of the messages as a test set, using a fixed seed.
from sklearn.model_selection import train_test_split

messages, labels = data['message'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages, labels, test_size=0.2, random_state=0
)

# Report how many rows ended up in each partition.
for template, frame in (
    ('Number of rows in the total set: {}', data),
    ('Number of rows in the training set: {}', X_train),
    ('Number of rows in the test set: {}', X_test),
):
    print(template.format(frame.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4457
Number of rows in the test set: 1115


### Frequency distribution
Our objective here is to convert this collection of text messages into a frequency distribution matrix.

**Note**
- The CountVectorizer method automatically converts all tokenized words to their lower case form so that it does not treat words like 'He' and 'he' differently. It does this using the lowercase parameter which is by default set to True.

- It also ignores all punctuation so that words followed by a punctuation mark (for example: 'hello!') are not treated differently than the same words not prefixed or suffixed by a punctuation mark (for example: 'hello').

In [None]:
# CountVectorizer produces the document-term (token frequency) matrix used in NLP.
# It is only instantiated here, with default settings; it is fitted in a later cell.
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()



In [None]:

# Toy illustration of frequency counting: tally how often each character occurs.
s = "12233455555"
count_dict = {}
for char in s:
    # get() supplies 0 the first time a character is seen, so no separate
    # "first occurrence" branch is needed.
    count_dict[char] = count_dict.get(char, 0) + 1
print(count_dict)
  

{'1': 1, '2': 2, '3': 2, '4': 1, '5': 5}


#### fit_transform( )
Learn the vocabulary dictionary and return the document-term matrix.
#### transform( )
Transform documents to document-term matrix.

In [None]:
# Learn the vocabulary from the training messages and build their count matrix.
# NOTE: .toarray() makes the matrix dense; it is kept because a later cell wraps
# training_data in a DataFrame.
training_data = count_vector.fit_transform(X_train).toarray()

# Encode the test messages with the vocabulary learned above. The vectorizer is
# deliberately not re-fitted on the test set.
testing_data = count_vector.transform(X_test).toarray()

# Only a cell's last expression is displayed, so both shapes are returned together.
training_data.shape, testing_data.shape

(1115, 7793)

In [None]:
# View the training count matrix as a DataFrame whose columns are the vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = count_vector.get_feature_names_out()
else:
    feature_names = count_vector.get_feature_names()

frequency_matrix = pd.DataFrame(training_data, columns=feature_names)
frequency_matrix.head()


Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,0207,02072069400,02073162414,021,03,04,0430,05,050703,0578,06,07,07090201529,07090298926,07099833605,0721072,07734396839,07742676969,0776xxxxxxx,07781482378,07786200117,077xxx,078,07801543489,07808,07808247860,07808726822,07821230901,078498,07880867867,...,yoga,yogasana,yor,you,young,younger,your,youre,yourinclusive,yourjob,yours,yourself,youuuuu,yoville,yoyyooo,yr,yrs,ystrday,yummmm,yummy,yun,yunny,yuo,yuou,yup,yupz,zac,zaher,zealand,zebra,zed,zeros,zhong,zindgi,zoe,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Fit a logistic-regression classifier (random_state pinned to 0) on the
# training count matrix and its labels.
clf = LogisticRegression(random_state=0)
clf.fit(X=training_data, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Predict a label for every message in the held-out test matrix.
predictions = clf.predict(X=testing_data)

In [None]:
# Display the predicted labels for the test messages (0 = ham, 1 = spam).
predictions

# F1 score = 2 * (precision * recall) / (precision + recall)

array([0, 1, 0, ..., 0, 1, 0])

### Result

In [None]:
# Score the test-set predictions with each metric in turn.
metric_report = (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
)
for caption, metric in metric_report:
    print(caption, format(metric(y_test, predictions)))

# Rows are the true classes and columns the predicted classes.
print('\nConfusion Matrix :\n', confusion_matrix(y_test, predictions))

Accuracy score:  0.9802690582959641
Precision score:  0.9791666666666666
Recall score:  0.88125
F1 score:  0.9276315789473684

Confusion Matrix :
 [[952   3]
 [ 19 141]]
