### <p style="background-color:#34bdeb;font-family:newtimeroman;color:#e3f8ff;font-size:150%; font-weight:bold;text-align:center;border-radius:20px 60px;">Spam or Ham Classifier</p>
![spam and ham classification using spacy](https://lionbridge.ai/wp-content/uploads/2020/08/2020-08-20_nlp_spam-detection.jpg)


### **There are a number of ways to build an email classifier, but in this notebook we will focus on how to create a simple email classifier using one of the most powerful NLP libraries, SpaCy.**

#### **Why Spacy ?**
* ##### **SpaCy is an open-source natural language processing library for Python. It is designed particularly for production use, and it can help us to build applications that process massive volumes of text efficiently.**
* ##### **Another advantage of SpaCy is that we can perform various text-manipulation operations with optimized and minimal code**
* ##### **You can also create a customized pipeline with different steps using SpaCy**

#### Want to learn more about SpaCy?
[Click here to check the documentation](https://spacy.io/)

# Import Libraries

In [None]:
import re
import nltk
import random
import pandas as pd
import numpy as np
import seaborn as sns
import spacy
from spacy.util import minibatch
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline    
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn.metrics import precision_score, recall_score, plot_confusion_matrix, classification_report, accuracy_score, f1_score, confusion_matrix
from sklearn import metrics
%matplotlib inline

In [None]:
# Apply seaborn's 'whitegrid' style to the plots drawn below
sns.set_style('whitegrid')

# Data Load
[Click here to download  the dataset](https://www.kaggle.com/uciml/sms-spam-collection-dataset)

In [None]:
# NOTE(review): the Kaggle copy of spam.csv is widely reported not to be valid UTF-8,
# in which case the default decoding raises UnicodeDecodeError. latin-1 maps every
# byte to a character, so the read cannot fail on decoding — confirm against the file.
data = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')

In [None]:
# Drop the unused 'Unnamed' columns and give the remaining two descriptive names.
# errors='ignore' keeps the cell re-runnable: `data` is overwritten here, so on a
# second run the 'Unnamed' columns are already gone and drop() would raise KeyError.
# rename() already ignores keys that are no longer present.
data = (
    data
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={'v1': 'target', 'v2': 'text'})
)

In [None]:
# Preview the first rows of the cleaned frame
data.head()

In [None]:
# Relative frequency of each target class, expressed as a percentage
data['target'].value_counts(normalize=True).mul(100)

# Class distribution

In [None]:
# Bar chart of the class counts, each bar annotated with its share of all rows.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))

# Draw on the axes created above instead of relying on pyplot's "current axes".
sns.countplot(x=data['target'], palette="Blues_d", ax=ax)
ax.set_xlabel('Target Variable')
ax.set_ylabel('Counts of each class')
ax.set_title('Class distribution (%)')

total_rows = data.shape[0]
for bar in ax.patches:
    bar_x, bar_y = bar.get_xy()
    bar_height = bar.get_height()
    # Scale to percent first and let the format spec do the rounding; rounding the
    # fraction before multiplying by 100 loses a digit and can expose float
    # artifacts such as 28.999999999999996.
    share = 100 * bar_height / total_rows
    ax.annotate(
        f'{share:.1f} %',
        (bar_x + bar.get_width() / 2, bar_y + bar_height * 1.01),
        ha='center',
    )

<p style="background-color:#34bdeb;font-family:newtimeroman;color:#e3f8ff;font-size:150%; font-weight:bold;text-align:center;">Create SpaCy text-categorization pipeline and model</p>

In [None]:
# Start from a blank English pipeline
nlp = spacy.blank("en")

# Text categorizer configured for mutually exclusive classes with the "bow" architecture
text_cls = nlp.create_pipe(
    "textcat",
    config={"exclusive_classes": True, "architecture": "bow"},
)

# Register the categorizer in the pipeline. Other steps could be added as well, but
# no lemmatization or stop-word removal components are added here.
nlp.add_pipe(text_cls)

# Declare the custom labels the categorizer should predict
for category in ('ham', 'spam'):
    text_cls.add_label(category)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    data['text'],
    data['target'],
    test_size=0.3,
    random_state=7,
)

In [None]:
# Build one {'cats': {...}} annotation per example: each category maps to True
# when it equals the example's label and False otherwise.
train_lables = [
    {'cats': {category: label == category for category in ('ham', 'spam')}}
    for label in y_train
]
test_lables = [
    {'cats': {category: label == category for category in ('ham', 'spam')}}
    for label in y_test
]

# Pair every text with its annotation to form the (text, annotation) examples
train_data = [(text, annotation) for text, annotation in zip(x_train, train_lables)]
test_data = [(text, annotation) for text, annotation in zip(x_test, test_lables)]

In [None]:
def train_model(model, train_data, optimizer, batch_size, epochs=10):
    """Train ``model`` on ``train_data`` in shuffled mini-batches, printing the loss per epoch.

    Parameters
    ----------
    model
        Object exposing ``update(texts, labels, sgd=..., losses=...)``.
    train_data
        Sequence of ``(text, annotation)`` pairs. It is copied, not modified.
    optimizer
        Forwarded to ``model.update`` as ``sgd``.
    batch_size
        Forwarded to ``minibatch`` as ``size``.
    epochs
        Number of passes over ``train_data``.

    Returns
    -------
    The value recorded under ``'textcat'`` in the losses dict during the final
    epoch, or ``0.0`` if no epoch ran or no loss was recorded.
    """
    # Private generator with the same seed as before: the shuffle order stays
    # reproducible without reseeding the global `random` state.
    rng = random.Random(1)

    # Shuffle a copy so the caller's list keeps its original order.
    examples = list(train_data)
    epoch_loss = 0.0

    for _epoch in range(epochs):
        rng.shuffle(examples)

        # Fresh dict per epoch: the same dict reused across epochs would make the
        # printed number a running total instead of this epoch's loss.
        losses = {}
        for batch in minibatch(examples, size=batch_size):
            # Split batch into texts and labels
            texts, labels = zip(*batch)
            model.update(texts, labels, sgd=optimizer, losses=losses)

        # .get avoids a KeyError when no batch was processed (empty train_data).
        epoch_loss = losses.get('textcat', 0.0)
        print("Loss: {}".format(epoch_loss))

    return epoch_loss

In [None]:
# Training hyper-parameters
batch_size = 5
epochs = 20

# Optimizer handed to every model update
optimizer = nlp.begin_training()

# Fit the pipeline on the training examples
train_model(
    model=nlp,
    train_data=train_data,
    optimizer=optimizer,
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
def get_predictions(model, texts):
    """Return the highest-scoring 'textcat' label for every text in ``texts``.

    ``model`` must provide ``tokenizer(text)`` and ``get_pipe('textcat')``; the
    returned component's ``predict`` is expected to yield a ``(scores, extra)``
    pair and its ``labels`` to be indexable by column position.
    """
    # Tokenize only; the categorizer is invoked directly on the resulting docs
    tokenized = list(map(model.tokenizer, texts))

    categorizer = model.get_pipe('textcat')
    class_scores, _unused = categorizer.predict(tokenized)

    # Column index of the best score per row, translated to its label name
    winners = []
    for best_column in class_scores.argmax(axis=1):
        winners.append(categorizer.labels[best_column])
    return winners

In [None]:
# Predict and score the training split
train_predictions = get_predictions(nlp, x_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_predictions)

# Predict and score the held-out split
test_predictions = get_predictions(nlp, x_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)

print("Train accuracy: {}".format(train_accuracy))
print("Test accuracy: {}".format(test_accuracy))

In [None]:
# Confusion matrices for both splits. The label order is pinned explicitly so the
# heatmap tick labels are guaranteed to match the matrix rows and columns.
class_names = ['ham', 'spam']
cf_train_matrix = confusion_matrix(y_train, train_predictions, labels=class_names)
cf_test_matrix = confusion_matrix(y_test, test_predictions, labels=class_names)

# One labelled heatmap per split, so each figure can be read on its own:
# rows are the true classes, columns the predicted ones.
for split_name, matrix in (('Train', cf_train_matrix), ('Test', cf_test_matrix)):
    plt.figure(figsize=(10, 8))
    heat_ax = sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    heat_ax.set_title(f'{split_name} confusion matrix')
    heat_ax.set_xlabel('Predicted label')
    heat_ax.set_ylabel('True label')

<p style="color:#42c5f5;font-size:150%; font-weight:bold; text-align:left;">If you found this notebook useful, please do upvote.</p>
<p style="color:#42c5f5;font-size:150%; font-weight:bold; text-align:left;">If you have any suggestions or questions, feel free to comment!</p>
<p style="color:#42c5f5;font-size:150%; font-weight:bold; text-align:left;">Thanks, and happy learning!</p>