In [1]:
import pandas as pd

spam = pd.read_csv('C:/Users/rohit/Documents/Kaggle Dataset/spam/spam.csv', encoding='latin-1')
spam.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
spam.shape

(5572, 2)

In [2]:
#Using spacy==2.3.6
import spacy

In [3]:


#create an empty model
nlp = spacy.blank("en")

#Create the textCategorizer with exclusive classes and "bow" architecture
#classes are either ham or spam, hence we set 'exclusive_classes' to True
#architecture is set to 'Bag of words'
#Spacy provides a convolution neural network architecture as well, but it's more complex

textcat = nlp.create_pipe(
              "textcat",
              config={"exclusive_classes": True, "architecture": 'bow'})



In [4]:
print('\n'.join(f'{m.__name__}=={m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))

pandas==1.0.1
spacy==2.3.5


In [5]:
nlp.add_pipe(textcat)

In [6]:
print(textcat)

<spacy.pipeline.pipes.TextCategorizer object at 0x000001B9BF528DC8>


In [7]:
#Add a new label to the pipe. Raises an error if the output dimension is already set, 
#or if the model has already been fully initialized. 
#Note that you don’t have to call this method if you provide a representative data sample to the initialize method. 
#In this case, all labels found in the sample will be automatically added to the model, 
##and the output dimension will be inferred automatically.
textcat.add_label('ham')
textcat.add_label('spam')

1

In [8]:
print(textcat)

<spacy.pipeline.pipes.TextCategorizer object at 0x000001B9BF528DC8>


In [9]:
#convert the labels in the data to the form TextCategorizer requires. 
#For each document, you'll create a dictionary of boolean values for each class.
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} 
                for label in spam['label']]

In [10]:
print(train_texts)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 ... 'Pity, * was in mood for that. So...any other suggestions?'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
 'Rofl. Its true to its name']


In [15]:
print(train_labels)

[{'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': False, 'spam': True}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}}, {'cats': {'ham': True, 'spam': False}},

In [11]:
#combine the texts and labels into a single list
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [12]:
#create an optimizer using nlp.begin_training(). 
#spaCy uses this optimizer to update the model. 
#In general it's more efficient to train models in small batches. 
#spaCy provides the minibatch function that returns a generator yielding minibatches for training. 
#Finally, the minibatches are split into texts and labels, then used with nlp.update to update the model's parameters.
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()
#print(optimizer)

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)

# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) but we need to
    # send separate lists for texts and labels to update().
    # This is a quick way to split a list of tuples into lists
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

In [20]:
# The above code is just one training loop (or epoch) through the data. 
# The model will typically need multiple epochs. Use another loop for more epochs, 
# and optionally re-shuffle the training data at the begining of each loop.

import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 0.43193237351289326}
{'textcat': 0.6475641522355602}
{'textcat': 0.7843301073471087}
{'textcat': 0.8716629028451455}
{'textcat': 0.927983800986226}
{'textcat': 0.9654547703572158}
{'textcat': 0.9938596482410351}
{'textcat': 1.0126842502717375}
{'textcat': 1.0274316938225831}
{'textcat': 1.0376987882580408}


In [21]:
# make predictions with the predict() method. 
# The input text needs to be tokenized with nlp.tokenizer. 
# Then you pass the tokens to the predict method which returns scores. 
# The scores are the probability the input text belongs to the classes.

texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.tokenizer(text) for text in texts]
    
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[9.9994242e-01 5.7520185e-05]
 [1.1534694e-02 9.8846531e-01]]


In [22]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']


In [13]:
#load the large model to get the vectors
import numpy as np
import spacy

# Need to load the large model to get the vectors
nlp = spacy.load('en_core_web_lg')

In [14]:
# Disabling other pipes because we don't need them and it'll speed up this part a bit
text = "These vectors can be used as features for machine learning models."
with nlp.disable_pipes():
    vectors=np.array([token.vector for token in nlp(text)])

In [15]:
print(vectors)

[[-0.1965    -0.13995   -0.52495   ... -0.097467   0.34578   -0.14233  ]
 [-0.25205   -0.16047   -0.6089    ...  0.19218   -0.40028    0.51894  ]
 [-0.23857    0.35457   -0.30219   ... -0.35283    0.41888    0.13168  ]
 ...
 [ 0.047511   0.1404    -0.11736   ...  0.03169   -0.14208    0.42548  ]
 [ 0.0065037  0.2064     0.0089077 ...  0.033444  -0.030121  -0.12998  ]
 [ 0.012001   0.20751   -0.12578   ...  0.13871   -0.36049   -0.035    ]]


In [16]:
vectors.shape

(12, 300)

These are 300-dimensional vectors, with one vector for each word. However, we only have document-level labels and our models won't be able to use the word-level embeddings. So, you need a vector representation for the entire document.

There are many ways to combine all the word vectors into a single document vector we can use for model training. A simple and surprisingly effective approach is simply averaging the vectors for each word in the document. Then, you can use these document vectors for modeling.

spaCy calculates the average document vector which you can get with doc.vector. Here is an example loading the spam data and converting it to document vectors.

In [20]:
# Loading the spam data
# ham is the label for non-spam messages
with nlp.disable_pipes():
    doc_vectors = np.array([nlp(text).vector for text in spam.text])

In [21]:
doc_vectors.shape

(5572, 300)

Now let's train scikit-learn models, xgboost models, or any other standard approach to modeling.

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(doc_vectors, spam.label,
                                                    test_size=0.1, random_state=1)

In [23]:
from sklearn.svm import LinearSVC

# Set dual=False to speed up training, and it's not needed
svc = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc.fit(X_train, y_train)
print(f"Accuracy: {svc.score(X_test, y_test) * 100:.3f}%", )

Accuracy: 97.312%


Documents with similar content generally have similar vectors. So you can find similar documents by measuring the similarity between the vectors. A common metric for this is the cosine similarity which measures the angle between two vectors,  a  and  b .

cosθ=a⋅b∥a∥∥b∥
 
This is the dot product of  a  and  b , divided by the magnitudes of each vector. The cosine similarity can vary between -1 and 1, corresponding complete opposite to perfect similarity, respectively. To calculate it, you can use the metric from scikit-learn or write your own function.



In [24]:
def cosine_similarity(a,b):
    return a.dot(b)/np.sqrt(a.dot(a)*b.dot(b))

In [25]:
a = nlp("REPLY NOW FOR FREE TEA").vector
b = nlp("According to legend, Emperor Shen Nung discovered tea when leaves from a wild tree blew into his pot of boiling water.").vector
cosine_similarity(a, b)

0.7030031