In [3]:
#Import all required libraries
import spacy
import random
import time
import numpy as np
import pandas as pd
import re
import string


import sys
from spacy import displacy

from tqdm.auto import tqdm
from spacy.tokens import DocBin

In [2]:
#TEXT PROCESSING FUNCTION

def clean_text(text ): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
    #print('cleaned:'+text1)
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>3))]) 
    
    return text2.lower()

In [3]:
#MAKE DOCS TO FEED IT TO THE MODEL


def make_docs(file_path):
    """
    this will take a list of texts and labels 
    and transform them in spacy documents
    
    data: list(tuple(text, label))
    
    returns: List(spacy.Doc.doc)
    """
    train_data = pd.read_csv(file_path)
    train_data.dropna(axis = 0, how ='any',inplace=True) 
    train_data['Num_words_text'] = train_data['news'].apply(lambda x:len(str(x).split())) 
    mask = train_data['Num_words_text'] >2
    train_data = train_data[mask]
    print(train_data['label'].value_counts())
    
    
    train_data['news'] = train_data['news'].apply(clean_text)
   
    data = tuple(zip(train_data['news'].tolist(), train_data['label'].tolist())) 
    print(data[1])
    docs = []
    # nlp.pipe([texts]) is way faster than running 
    # nlp(text) for each text
    # as_tuples allows us to pass in a tuple, 
    # the first one is treated as text
    # the second one will get returned as it is.
    nlp = spacy.load("en_core_web_trf")
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        
        # we need to set the (text)cat(egory) for each document
        #print(label)
        if (label=='sports'):
            doc.cats['sports'] = 1
            doc.cats['tech'] = 0
            
            doc.cats['general'] = 0
        
        
        
        elif (label=='general'):
            doc.cats['sports'] = 0
            doc.cats['tech'] = 0
            
            doc.cats['general'] = 1
        
        else:
            doc.cats['sports'] = 0
            doc.cats['tech'] = 1
            
            doc.cats['general'] = 0
        #print(doc.cats)
        
        # put them into a nice list
        docs.append(doc)
    
    return docs,train_data

In [4]:
##LOAD TRAIN/TEST DATA

train_docs,train_data  = make_docs("train.csv") #path to train data
# then we save it in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("textcat_train.spacy")

test_docs,test_data  = make_docs("test.csv") #path to test data
# then we save it in a binary file to disc
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("textcat_valid.spacy")

tech       757
general    689
sports     669
Name: label, dtype: int64
('infrastructures able interrupt botnet activity warns that temporarily once glupteba uses blockchain technology mechanism avoid comp', 'tech')


  0%|          | 0/2115 [00:00<?, ?it/s]



tech       193
sports     176
general    160
Name: label, dtype: int64
('real american leaders start from today until next vodafone invites your customers choose christmas offer that want have access just through vodafone selected', 'tech')


  0%|          | 0/529 [00:00<?, ?it/s]

In [5]:
##WE NEED TO HAVE THE TEXTCAT_BASE_CONFIG FILE  (EITHER STICK WITH MINE OR GET A NEW ONE FROM SPACY WEBSITE)


!python -m spacy init fill-config ./textcat_base_config.cfg ./textcat_config.cfg

[+] Auto-filled config with all values
[+] Saved config
textcat_config.cfg
You can now add your data and train your pipeline:
python -m spacy train textcat_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [6]:
!python -m spacy train textcat_config.cfg --verbose --output ./textcat_output --paths.train textcat_train.spacy --paths.dev textcat_valid.spacy

[+] Created output directory: textcat_output
[i] Saving to output directory: textcat_output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['transformer', 'textcat']
[i] Initial learn rate: 0.0
E    #       LOSS TRANS...  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  -------------  ------------  ----------  ------
  0       0           0.00          0.01        0.00    0.00
  8     200           0.01         21.72       84.82    0.85
 17     400           0.07          1.37       87.12    0.87
 26     600           0.00          0.02       85.90    0.86
 35     800           0.00          0.00       86.15    0.86
 43    1000           0.00          0.00       85.18    0.85
 52    1200           0.00          0.00       87.35    0.87
 61    1400           0.00          0.00       87.21    0.87
 69    1600           0.00          0.00       86.07    0.86
 78    1800           0.00          0.00       86.49    0.86
 87    2000           0.00          0.00       87.46    0.

[2021-12-13 21:43:13,714] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2021-12-13 21:43:14,222] [INFO] Set up nlp object from config
[2021-12-13 21:43:14,230] [DEBUG] Loading corpus from path: textcat_valid.spacy
[2021-12-13 21:43:14,231] [DEBUG] Loading corpus from path: textcat_train.spacy
[2021-12-13 21:43:14,231] [INFO] Pipeline: ['transformer', 'textcat']
[2021-12-13 21:43:14,235] [INFO] Created vocabulary
[2021-12-13 21:43:14,235] [INFO] Finished initializing nlp object
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected i

In [5]:
#LOAD AND TEST THE MODEL


nlp_textcat = spacy.load("textcat_output/model-best")
test_texts = test_data['news'].tolist()
test_cats = test_data['label'].tolist()
doc2 = nlp_textcat(test_texts[1])
print("Text: "+ test_texts[1])
print("Orig Cat:"+ test_cats[1])
print(" Predicted Cats:") 
print(doc2.cats)
print("=======================================")
doc2 = nlp_textcat(test_texts[200])
print("Text: "+ test_texts[200])
print(" Orig Cat:"+test_cats[200])
print(" Predicted Cats:") 
print(doc2.cats)

NameError: name 'test_data' is not defined