Let us look at how we can perform text classification with spaCy.
The dataset is from the Tweet Sentiment Extraction challenge on Kaggle (https://www.kaggle.com/c/tweet-sentiment-extraction/overview).
We will perform text classification using spaCy on tweet data to classify tweets as "positive", "negative" or "neutral".



In [1]:
#Import all required libraries
import spacy
import random
import time
import numpy as np
import pandas as pd
import re
import string

from spacy.util import minibatch, compounding
import sys
from spacy import displacy
from itertools import chain

from sklearn.metrics import classification_report

Let us define methods to pre-process the tweets

In [2]:

def remove_emoji(text):
    """Return text with every character in the emoji code-point ranges removed."""
    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
    # covers non-emoji scripts such as CJK; kept unchanged to preserve behavior.
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
    )
    matcher = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return matcher.sub("", text)

def remove_url(text):
    """Return text with every http:// or https:// URL removed."""
    # Raw string literal: the pattern contains regex escapes such as "\(" that
    # are invalid escape sequences in an ordinary string literal.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_pattern.sub(r'', text)



def clean_text(text):
    """Strip punctuation, drop short or purely numeric words, and lower-case.

    Every character in string.punctuation is deleted, the result is split on
    whitespace, and only words longer than three characters that are not made
    up entirely of digits are kept. The kept words are joined with single
    spaces and returned in lower case.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    kept_words = []
    for word in text.translate(strip_punctuation).split():
        if word.isdigit() or len(word) <= 3:
            continue
        kept_words.append(word)
    return ' '.join(kept_words).lower()



We need to convert our training data to spaCy format to train the categorization model. 
<br>
The train data format looks like:
<br>
[
('some text', {'cats': {'Class_Label1': 1, 'Class_Label2': 1, ... 'Class_Labeln': 0}}),
('some text', {'cats': {'Class_Label1': 0, 'Class_Label2': 1, ... 'Class_Labeln': 0}}),
( ... )]
<br>


In [3]:
def load_data_spacy(file_path):
    """Load a tweet-sentiment CSV and convert it to spaCy textcat training format.

    Rows with any missing value and tweets of two words or fewer are dropped,
    the per-class counts are printed, and each remaining tweet is passed
    through remove_emoji, remove_url and clean_text.

    file_path : path of a CSV file with 'text' and 'sentiment' columns

    Returns a tuple (training_data, train_texts, train_cats):
    training_data : list of (text, {"cats": {...}}) pairs, where the inner dict
                    holds 0/1 flags for 'positive', 'negative' and 'neutral'
    train_texts : list of the cleaned tweet texts
    train_cats : list of the original sentiment labels
    """
    train_data = pd.read_csv(file_path).dropna(axis=0, how='any')
    word_counts = train_data['text'].apply(lambda x: len(str(x).split()))
    # .copy() yields an independent frame, so the column assignment below does
    # not write into a slice of the original (SettingWithCopyWarning).
    train_data = train_data[word_counts > 2].copy()
    print(train_data['sentiment'].value_counts())

    train_data['text'] = (
        train_data['text'].apply(remove_emoji).apply(remove_url).apply(clean_text)
    )

    train_texts = train_data['text'].tolist()
    train_cats = train_data['sentiment'].tolist()

    labels = ('positive', 'negative', 'neutral')
    final_train_cats = []
    for cat in train_cats:
        # Any label other than positive/negative is treated as neutral.
        active = cat if cat in ('positive', 'negative') else 'neutral'
        final_train_cats.append({label: int(label == active) for label in labels})

    training_data = [
        (text, {"cats": cats}) for text, cats in zip(train_texts, final_train_cats)
    ]
    return training_data, train_texts, train_cats

Let us convert our train data and test data to spaCy format

In [4]:
# Build the spaCy-format training set; load_data_spacy also returns the cleaned
# texts and the raw sentiment labels.
training_data,train_texts,train_cats   = load_data_spacy("C:\\TweetSenitment\\train.csv")
# Peek at the first ten (text, {"cats": ...}) pairs and the number of examples kept.
print(training_data[:10])
print(len(training_data))
# Same conversion for the test file; test_texts and test_cats are later passed
# to train_spacy for evaluation.
test_data,test_texts,test_cats   = load_data_spacy("C:\\TweetSenitment\\test.csv")
print(len(test_data))



neutral     10704
positive     8375
negative     7673
Name: sentiment, dtype: int64
[('have responded were going', {'cats': {'positive': 0, 'negative': 0, 'neutral': 1}}), ('sooo will miss here diego', {'cats': {'positive': 0, 'negative': 1, 'neutral': 0}}), ('boss bullying', {'cats': {'positive': 0, 'negative': 1, 'neutral': 0}}), ('what interview leave alone', {'cats': {'positive': 0, 'negative': 1, 'neutral': 0}}), ('sons couldnt they them releases already bought', {'cats': {'positive': 0, 'negative': 1, 'neutral': 0}}), ('some shameless plugging best rangers forum earth', {'cats': {'positive': 0, 'negative': 0, 'neutral': 1}}), ('feedings baby when smiles coos', {'cats': {'positive': 1, 'negative': 0, 'neutral': 0}}), ('both', {'cats': {'positive': 0, 'negative': 0, 'neutral': 1}}), ('journey just became cooler hehe that possible', {'cats': {'positive': 1, 'negative': 0, 'neutral': 0}}), ('much love hopeful reckon chances minimal never gonna cake stuff', {'cats': {'positive': 0, 'n

Let us define a method to evaluate our text categorization model. I use classification_report from scikit-learn to get the evaluation metrics.

In [5]:
def Sort(sub_li):
    """Return the items of sub_li as a new list, ordered by each item's second
    element from largest to smallest."""
    ranked = list(sub_li)
    # reverse=True gives descending order; the key picks the second element.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# Predict a category for every text in the evaluation set and print the metrics.
def evaluate(tokenizer, textcat, test_texts, test_cats):
    """Print a classification report for textcat on the given test set.

    tokenizer : callable turning a text into a doc
    textcat : text categorizer whose pipe() fills doc.cats with category scores
    test_texts : texts to classify
    test_cats : gold labels, in the same order as test_texts
    """
    tokenized = (tokenizer(tweet) for tweet in test_texts)
    predicted = []
    for scored_doc in textcat.pipe(tokenized):
        # Highest-scoring category comes first after sorting in descending order.
        ranked = Sort(scored_doc.cats.items())
        predicted.append(ranked[0][0])

    print(
        classification_report(
            test_cats, predicted, labels=['positive', 'negative', 'neutral']
        )
    )

    

***

Now let us train a text classification/categorization model in spaCy for classifying the sentiment of tweets.
We use an existing model, "en_core_web_md" (the English medium-sized model).
By default, this model has POS tagger, dependency parser and named entity recognition functionalities.

***
***

We only re-train the text categorization  part of the model.
<br>
We have the following model architecture  available for training:

<br>
* "bow"	An ngram “bag-of-words” model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. The features extracted can be controlled using the keyword arguments ngram_size and attr. For instance, ngram_size=3 and attr="lower" would give lower-cased unigram, trigram and bigram features. 2, 3 or 4 are usually good choices of ngram size.
<br>
* "simple_cnn"	A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster.

<br>
* "ensemble"	Default: Stacked ensemble of a bag-of-words model and a neural network model. The neural network uses a CNN with mean pooling and attention. The “ngram_size” and “attr” arguments can be used to configure the feature extraction for the bag-of-words model.

***
Dropout is a regularization technique for reducing overfitting in neural networks by preventing complex co-adaptations on training data. The term dropout refers to randomly "dropping out", or omitting, units (both hidden and visible) during the training process of a neural network.
In our case, dropout = 0.5 means that each unit has a 50% chance of being dropped (omitted) during the training of our model.

In [7]:
def train_spacy(train_data, iterations, test_texts, test_cats, model_arch,
                dropout=0.3, model=None, init_tok2vec=None):
    '''Train a spaCy text categorizer for tweet sentiment and save it to disk.

    Loads "en_core_web_md", adds (or reuses) a "textcat" pipe with the labels
    "positive", "negative" and "neutral", and updates it with every other pipe
    disabled. After each iteration the classifier is evaluated on the test set
    using the optimizer's averaged parameters and the elapsed time is printed.

    train_data : list of (text, {"cats": {label: 0 or 1, ...}}) examples
    iterations : number of passes over the training data
    test_texts : texts passed to evaluate() after every iteration
    test_cats : gold labels matching test_texts
    model_arch : textcat architecture name ("bow", "simple_cnn" or "ensemble");
                 also used as the prefix of the output directory name
    dropout : dropout rate passed to nlp.update
    model : unused; kept so existing callers keep working
    init_tok2vec : optional object with an open("rb") method (e.g. a
                   pathlib.Path) whose bytes are loaded into the textcat
                   model's tok2vec layer

    Returns the trained nlp pipeline.
    '''
    # Shuffle a private copy each iteration so the caller's list keeps its order.
    train_data = list(train_data)

    nlp = spacy.load("en_core_web_md")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": model_arch}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add labels to the text classifier
    for label in ("positive", "negative", "neutral"):
        textcat.add_label(label)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(16.0, 64.0, 1.5)
        for i in range(iterations):
            print('Iteration: '+str(i))
            # time.clock() was removed in Python 3.8; perf_counter() is the
            # documented replacement for measuring elapsed time.
            start_time = time.perf_counter()
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=dropout, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the test data
                evaluate(nlp.tokenizer, textcat, test_texts, test_cats)
            print ('Elapsed time'+str(time.perf_counter() - start_time)+  "seconds")
        # NOTE(review): the model is written while the other pipes are still
        # disabled; confirm whether the saved pipeline is meant to contain
        # only the text categorizer.
        with nlp.use_params(optimizer.averages):
            modelName = model_arch+"TweetClassification"
            filepath = "C:\\TweetSenitment\\"+modelName+"\\"
            nlp.to_disk(filepath)
    return nlp
    

Let's train the model on our dataset

In [10]:
# Train the text categorization model with the "bow" architecture for 10
# iterations, evaluating on the test set after each one; train_spacy also
# saves the resulting pipeline to disk.
nlp = train_spacy(training_data, 10,test_texts,test_cats,"bow")





Training the model...
LOSS 	  P  	  R  	  F  
Iteration: 0
              precision    recall  f1-score   support

    positive       0.78      0.60      0.68      1075
    negative       0.75      0.39      0.51       983
     neutral       0.54      0.82      0.65      1376

    accuracy                           0.63      3434
   macro avg       0.69      0.60      0.61      3434
weighted avg       0.68      0.63      0.62      3434

Elapsed time11.016886299999896seconds
Iteration: 1
              precision    recall  f1-score   support

    positive       0.78      0.63      0.70      1075
    negative       0.73      0.46      0.56       983
     neutral       0.56      0.79      0.66      1376

    accuracy                           0.65      3434
   macro avg       0.69      0.63      0.64      3434
weighted avg       0.68      0.65      0.64      3434

Elapsed time10.777873699999873seconds
Iteration: 2
              precision    recall  f1-score   support

    positive       0.7

In [30]:
# Train the text categorization model with the "simple_cnn" architecture for
# 10 iterations, evaluating on the test set after each one; train_spacy also
# saves the resulting pipeline to disk.
nlp = train_spacy(training_data, 10,test_texts,test_cats,"simple_cnn")


Training the model...
LOSS 	  P  	  R  	  F  
Iteration: 0
              precision    recall  f1-score   support

    positive       0.77      0.73      0.75      1075
    negative       0.70      0.66      0.68       983
     neutral       0.65      0.71      0.68      1376

    accuracy                           0.70      3434
   macro avg       0.71      0.70      0.70      3434
weighted avg       0.71      0.70      0.70      3434

Elapsed time50.510161600000174seconds
Iteration: 1
              precision    recall  f1-score   support

    positive       0.76      0.75      0.76      1075
    negative       0.71      0.68      0.70       983
     neutral       0.67      0.69      0.68      1376

    accuracy                           0.71      3434
   macro avg       0.71      0.71      0.71      3434
weighted avg       0.71      0.71      0.71      3434

Elapsed time35.800802300000214seconds
Iteration: 2
              precision    recall  f1-score   support

    positive       0.7

In [8]:
# Train the text categorization model with the "ensemble" architecture for 10
# iterations, evaluating on the test set after each one; train_spacy also
# saves the resulting pipeline to disk.
nlp = train_spacy(training_data, 10,test_texts,test_cats,"ensemble")


eneterd here
Training the model...
LOSS 	  P  	  R  	  F  
Iteration: 0
              precision    recall  f1-score   support

    positive       0.75      0.76      0.76      1075
    negative       0.69      0.67      0.68       983
     neutral       0.67      0.68      0.68      1376

    accuracy                           0.70      3434
   macro avg       0.70      0.70      0.70      3434
weighted avg       0.70      0.70      0.70      3434

Elapsed time29.4053493seconds
Iteration: 1
              precision    recall  f1-score   support

    positive       0.75      0.77      0.76      1075
    negative       0.71      0.69      0.70       983
     neutral       0.68      0.68      0.68      1376

    accuracy                           0.71      3434
   macro avg       0.71      0.71      0.71      3434
weighted avg       0.71      0.71      0.71      3434

Elapsed time32.7850301seconds
Iteration: 2
              precision    recall  f1-score   support

    positive       0.76  

Let's test our model on the test data

In [37]:
# Reload the pipeline saved by the "bow" training run and, for two test tweets,
# print the cleaned text, its original label and the predicted category scores.
nlp2 = spacy.load("C:\\TweetSenitment\\bowTweetClassification\\")
doc2 = nlp2(test_texts[100])
print("Text: "+ test_texts[100])
print("Orig Cat:"+ test_cats[100])
print(" Predicted Cats:") 
# doc.cats maps each label to its predicted score.
print(doc2.cats)
print("=======================================")
doc2 = nlp2(test_texts[1000])
print("Text: "+ test_texts[1000])
print(" Orig Cat:"+test_cats[1000])
print(" Predicted Cats:") 
print(doc2.cats)

Text: want david cook
Orig Cat:positive
 Predicted Cats:
{'positive': 0.21973687410354614, 'negative': 0.22979359328746796, 'neutral': 0.5504695177078247}
Text: okaii cool cant wait series begin guna awesome
 Orig Cat:positive
 Predicted Cats:
{'positive': 0.9604972004890442, 'negative': 0.0052943420596420765, 'neutral': 0.03420846536755562}


In [39]:
# Reload the pipeline saved by the "simple_cnn" training run and, for the same
# two test tweets, print the text, its original label and the predicted scores.
nlp2 = spacy.load("C:\\TweetSenitment\\simple_cnnTweetClassification\\")
doc2 = nlp2(test_texts[100])
print("Text: "+ test_texts[100])
print("Orig Cat:"+ test_cats[100])
print(" Predicted Cats:") 
# doc.cats maps each label to its predicted score.
print(doc2.cats)
print("=======================================")
doc2 = nlp2(test_texts[1000])
print("Text: "+ test_texts[1000])
print(" Orig Cat:"+test_cats[1000])
print(" Predicted Cats:") 
print(doc2.cats)

Text: want david cook
Orig Cat:positive
 Predicted Cats:
{'positive': 0.007848287932574749, 'negative': 0.012497424148023129, 'neutral': 0.9796542525291443}
Text: okaii cool cant wait series begin guna awesome
 Orig Cat:positive
 Predicted Cats:
{'positive': 0.9424135684967041, 'negative': 0.002971380716189742, 'neutral': 0.05461500957608223}


In [12]:
# Reload the pipeline saved by the "ensemble" training run and, for the same
# two test tweets, print the text, its original label and the predicted scores.
nlp2 = spacy.load("C:\\TweetSenitment\\ensembleTweetClassification\\")
doc2 = nlp2(test_texts[100])
print("Text: "+ test_texts[100])
print("Orig Cat:"+ test_cats[100])
print(" Predicted Cats:") 
# doc.cats maps each label to its predicted score.
print(doc2.cats)
print("=======================================")
doc2 = nlp2(test_texts[1000])
print("Text: "+ test_texts[1000])
print(" Orig Cat:"+test_cats[1000])
print(" Predicted Cats:") 
print(doc2.cats)

Text: want david cook
Orig Cat:positive
 Predicted Cats:
{'positive': 0.05605944246053696, 'negative': 0.03033527359366417, 'neutral': 0.9136053323745728}
Text: okaii cool cant wait series begin guna awesome
 Orig Cat:positive
 Predicted Cats:
{'positive': 0.9636390209197998, 'negative': 0.002848424483090639, 'neutral': 0.033512573689222336}
