# Text Message Classifier

In [237]:
import pandas as pd
import numpy as np

In [181]:
# Loading data 
raw_data  = pd.read_csv(r"C:\Users\prashantha.v\OneDrive - SLK Software Pvt Ltd\Documents\Assignment_folders\NLP_Assignment\SPAM_text_message_Full_Data.csv")
raw_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [182]:
# Copying data to a new df
df = raw_data.copy()

In [183]:
# General info of the df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [184]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [185]:
# Frequency of the classes
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [186]:
df.sample(5)

Unnamed: 0,Category,Message
3940,ham,Y ü wan to go there? C doctor?
2875,ham,Crucify is c not s. You should have told me ea...
5141,spam,FREE for 1st week! No1 Nokia tone 4 ur mobile ...
2165,ham,"Nothing really, just making sure everybody's u..."
4924,ham,Oh yah... We never cancel leh... Haha


In [187]:
## Vectorizing training text data (i.e documents)
# Using pre-trained word2vec vectorizer
import gensim.downloader as api
word2vec = api.load('word2vec-google-news-300')

In [188]:
# Total words in the vectorizer model
len(word2vec.index_to_key)

3000000

In [189]:
# Number of dimensions in the vectorizer
word2vec.vector_size

300

In [190]:
# sample word-vectors (300D)
sample_word = 'ocean'
sample_word_vec = word2vec[sample_word]
sample_word_vec

array([-0.0859375 ,  0.0859375 , -0.11425781, -0.02062988, -0.05859375,
       -0.12402344, -0.20117188, -0.01318359,  0.20507812,  0.31445312,
       -0.02709961, -0.33007812, -0.13964844, -0.359375  , -0.13769531,
        0.00491333,  0.00564575,  0.05761719, -0.00367737,  0.2734375 ,
       -0.06835938,  0.02478027, -0.08544922, -0.22558594,  0.15039062,
       -0.31835938, -0.01611328,  0.24414062,  0.15136719, -0.3046875 ,
       -0.33984375,  0.17871094, -0.22753906, -0.02416992, -0.06030273,
        0.04345703, -0.05883789, -0.09960938, -0.11572266,  0.078125  ,
       -0.2109375 , -0.00234985,  0.0189209 ,  0.16015625, -0.30664062,
       -0.17871094,  0.00927734,  0.01275635,  0.08154297, -0.03881836,
        0.21679688,  0.19726562,  0.1640625 , -0.23242188, -0.04516602,
        0.25195312,  0.08398438,  0.01721191,  0.33789062, -0.10351562,
       -0.13476562, -0.12695312,  0.12890625, -0.078125  ,  0.19140625,
        0.13769531, -0.29101562,  0.01483154, -0.04907227, -0.12

In [191]:
# checking similarity between two words
w1, w2 = 'electron', 'proton'
print(word2vec.similarity(w1, w2))

0.68709683


In [192]:
# Printing similar words (nearby words in 300D space)
print(word2vec.most_similar(positive=['school'], topn=5))

[('elementary', 0.7868632078170776), ('schools', 0.7411909103393555), ('shool', 0.6692329049110413), ('elementary_schools', 0.6597153544425964), ('kindergarten', 0.6529810428619385)]


In [193]:
# Experimenting with words
word = ''
if word in word2vec.index_to_key:
    print("it's there")
else:
    print("it's not there")

it's not there


In [194]:
# function to vectorize a document
def document_vector (doc, vectorizer):
    l2 = []
    for token in doc.split():
        if token in vectorizer.index_to_key:
            l2.append(word2vec[token])
    doc_vector = np.mean(l2, axis=0)
    if doc_vector.shape != (vectorizer.vector_size, ):
        return np.zeros(vectorizer.vector_size)
    return doc_vector

In [195]:
# Vextorizing the input documents
doc_vct_list = []
for doc in df.Message:
    doc_vct_list.append(document_vector(doc, word2vec))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [200]:
# Model selection and training
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [202]:
M = np.array(doc_vct_list)
n = df.Category

In [203]:
M.shape

(5572, 300)

In [204]:
# Splitting the dataset for training and testing
M_train, M_test, n_train, n_test = train_test_split(M, n, test_size=0.2)

In [205]:
len(M)

5572

In [206]:
classifier_w2v = LogisticRegression()
classifier_w2v.fit(M_train, n_train)

In [207]:
# Prediction on test data
n_pred = classifier_w2v.predict(M_test)

In [208]:
# Model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [209]:
# Metrics calculations for word2vec vectorizer
accuracy1 = accuracy_score(n_test, n_pred)
precision1 = precision_score(n_test, n_pred, average='weighted')
recall1 = recall_score(n_test, n_pred, average='weighted')
f11 = f1_score(n_test, n_pred, average='weighted')

print(f'Accuracy: {accuracy1}, \nPrecision: {precision1}, \nRecall: {recall1}, \nF1 Score: {f11}')

Accuracy: 0.9614349775784753, 
Precision: 0.9606566591315402, 
Recall: 0.9614349775784753, 
F1 Score: 0.9599913764144531


In [248]:
l3 = [""]
l3_doc_vct = []
for example in l3:
    l3_doc_vct.append(document_vector(example, word2vec))

l3_array = np.array(l3_doc_vct)
l3_pred = classifier_w2v.predict(l3_array)
l3_pred

array(['ham'], dtype=object)

array(['ham', 'spam'], dtype=object)

###############################################################################
###############################################################################

Now, we will use Bag of Words as a vectorizer

In [210]:
# Vectorizing training text data (i.e documents)
from sklearn.feature_extraction.text import CountVectorizer

In [211]:
# using bag-of-words model
vectorizer1 = CountVectorizer(min_df=1, ngram_range=(1,1))
X = vectorizer1.fit_transform(df.Message)
y = df.Category

In [212]:
y[0]

'ham'

In [213]:
# All the vocabulary from the vectorizer
vectorizer1.get_feature_names_out()

array(['00', '000', '000pes', ..., 'èn', 'ú1', '〨ud'], dtype=object)

In [214]:
X[0]

<1x8709 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [215]:
X.toarray().shape

(5572, 8709)

In [216]:
print(len(vectorizer1.get_feature_names_out()))

8709


In [217]:
print(len(vectorizer1.vocabulary_))

8709


In [218]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [219]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [220]:
# Prediction on test data
y_pred = classifier.predict(X_test)

In [249]:
new_text = ['Free coupons. Click below']
new_text_vector = vectorizer1.transform(new_text)
classifier.predict(new_text_vector)

array(['ham'], dtype=object)

In [222]:
# Metrics calculations for countvectorizer
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}, \nPrecision: {precision}, \nRecall: {recall}, \nF1 Score: {f1}')

Accuracy: 0.97847533632287, 
Precision: 0.9784371248813666, 
Recall: 0.97847533632287, 
F1 Score: 0.9778620618857253


Since there is a class imbalance in the dataset, we have to take care of it...!!!!

In [223]:
# Same procedure repeated with balanced dataset
df_ham = df[df.Category == 'ham']
df_ham.shape

(4825, 2)

In [224]:
df_spam = df[df.Category == 'spam']
df_spam.shape

(747, 2)

In [225]:
df_balanced = pd.concat([df_spam, df_ham.sample(n=747, random_state=30)])
df_balanced.sample(10)

Unnamed: 0,Category,Message
4628,spam,Please call our customer service representativ...
3411,ham,Joy's father is John. Then John is the ____ of...
5278,spam,URGENT! Your Mobile number has been awarded wi...
1777,spam,Call FREEPHONE 0800 542 0578 now!
2675,ham,I am 6 ft. We will be a good combination!
5423,ham,"Sorry, I'll call later"
5115,spam,"Get 3 Lions England tone, reply lionm 4 mono o..."
1073,spam,Dear U've been invited to XCHAT. This is our f...
4562,ham,Good afternoon my boytoy. How goes that walkin...
535,ham,I've not called you in a while. This is hoping...


In [226]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1494 entries, 2 to 4715
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  1494 non-null   object
 1   Message   1494 non-null   object
dtypes: object(2)
memory usage: 35.0+ KB


In [227]:
vectorizer2 = CountVectorizer(min_df=2, ngram_range=(1, 2))
X2 = vectorizer2.fit_transform(df_balanced.Message)
y2 = df_balanced.Category

In [228]:
X2

<1494x5956 sparse matrix of type '<class 'numpy.int64'>'
	with 36950 stored elements in Compressed Sparse Row format>

In [229]:
X2.toarray().shape

(1494, 5956)

In [230]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2)

In [231]:
X2_train.shape, X2_test.shape

((1195, 5956), (299, 5956))

In [232]:
classifier2 = LogisticRegression()
classifier2.fit(X2_train, y2_train)

In [233]:
y2_pred = classifier2.predict(X2_test)

In [234]:
accuracy2 = accuracy_score(y2_test, y2_pred)
precision2 = precision_score(y2_test, y2_pred, average='weighted')
recall2 = recall_score(y2_test, y2_pred, average='weighted')
f1_2 = f1_score(y2_test, y2_pred, average='weighted')

print(f'Accuracy: {accuracy2}, \nPrecision: {precision2}, \nRecall: {recall2}, \nF1 Score: {f1_2}')

Accuracy: 0.9565217391304348, 
Precision: 0.9581741474276341, 
Recall: 0.9565217391304348, 
F1 Score: 0.9564779205452202


In [235]:
new_text = ['mobile with a FREE sexy pic of Jorda']
new_text_vector = vectorizer2.transform(new_text)
classifier2.predict(new_text_vector)

array(['spam'], dtype=object)