# This jupyter notebook contains two different libraries for Spacy and NLTK and #tensorflow.keras.preprocessing.text and #torchtext

## First lets recap regex library and some of the important functions

In [18]:
import re

"""
Major RE functions

re.findall - Module is used to search for “all” occurrences that match a given pattern.


re.sub - Substitute the matched RE patter with given text


re.match - The match function is used to match the RE pattern to string with optional flags


re.search - This method takes a regular expression pattern and a string and searches for that pattern with the string.

"""

In [47]:
text="""Original Article: In computer science, lexical analysis, lexing or 
    tokenization is the process of converting a sequence of characters 
    (such as in a computer program or web page) into a sequence of tokens 
    (strings with an assigned and thus identified meaning). 
    A program that performs lexical analysis may be termed a lexer, 
    tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer.
    A lexer is generally combined with a parser, which together analyze the syntax of programming languages,
    web pages, and so forth."""


In [21]:
re.findall(r'([:?\.]+)',text)

[':', '.', '.', '.']

# 1) SpaCy

In [62]:
import spacy
import en_core_web_md
nlp= en_core_web_md.load()

## 1.1) Stop words in Spacy

In [63]:
stop_words= nlp.Defaults.stop_words
len(stop_words)

326

## 1.2) Sentence and Word Tokenize with spacy

In [30]:
doc= nlp(text)
sent_generator= doc.sents
bagOfwords=[]
for i,sent in enumerate(sent_generator):
    # sent now is a span
    print('sentence : {}'.format(i))
    print(sent.text)
    for word in sent:
        bagOfwords.append(word.text)
        # word now is a token
        # word.text create a string
#         print("the token is : {}".format(word))
#         print("the string of token is : {}".format(word.text))
#         print("the lemma of token is : {} ".format(word.lemma_))
#         print("the position of token is : {} ".format(word.pos_))
        
        
        
    

sentence : 0
Original Article:
sentence : 1
In computer science, lexical analysis, lexing or 
    tokenization is the process of converting a sequence of characters 
    (such as in a computer program or web page) into a sequence of tokens 
    
sentence : 2
(strings with an assigned and thus identified meaning). 
    
sentence : 3
A program that performs lexical analysis may be termed a lexer, 
    tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer.
    
sentence : 4
A lexer is generally combined with a parser, which together analyze the syntax of programming languages,
    web pages, and so forth.


## 1.3) Lemmatization and Stemming in SpaCy

In [79]:
### Note spaCy do not have stemming. Due to the reason that Lemmatization is seen as more informative than stemming.
test="have has had"
doc= nlp(test)
print([token.lemma_ for token in doc])
# vs how we nltk
print([nltk.stem.WordNetLemmatizer().lemmatize(token,pos='v') for token in nltk.tokenize.word_tokenize(test)])
print([nltk.stem.PorterStemmer().stem(token) for token in nltk.tokenize.word_tokenize(test)])

['have', 'have', 'have']
['have', 'have', 'have']
['have', 'ha', 'had']


# 2) NLTK

In [72]:
import nltk

## 2.1) Stop words in nltk

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words= stopwords.words('english')
len(stop_words)

## 2.2) Tokenize with nltk

In [48]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

## or we can also directly call sent_tokenize and word_tokenize as below

# from nltk import sent_tokenize
# from nltk import word_tokenize


In [49]:
sentences= sent_tokenize(text)
bagOfwords=[]
for i,sent in enumerate(sentences):
    print('sentence : {}'.format(i))
    print(sent)
    for word in word_tokenize(sent):
        bagOfwords.append(word)

sentence : 0
Original Article: In computer science, lexical analysis, lexing or 
    tokenization is the process of converting a sequence of characters 
    (such as in a computer program or web page) into a sequence of tokens 
    (strings with an assigned and thus identified meaning).
sentence : 1
A program that performs lexical analysis may be termed a lexer, 
    tokenizer,[1] or scanner, though scanner is also a term for the first stage of a lexer.
sentence : 2
A lexer is generally combined with a parser, which together analyze the syntax of programming languages,
    web pages, and so forth.


## 2.3) Lemmatization & Stemming in nltk

In [60]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saeed.ahmadian\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [74]:
# Stemming
######################
text="List listed lists listing listings"
text="have had has"
words=text.lower().split(' ')
# or
words= word_tokenize(text)

stemmer= PorterStemmer()
## Stem words
stemmed_words=[stemmer.stem(word) for word in words]
# or
[PorterStemmer().stem(word) for word in words]

['have', 'had', 'ha']

In [77]:
####### Lemmatize
lemma= WordNetLemmatizer()
[lemma.lemmatize(word,pos='v') for word in words]

['have', 'have', 'have']

# 3) Torchtext

If you've ever worked on a project for deep learning for NLP, you'll know how painful and tedious all the preprocessing is. Before you start training your model, you have to:

    1) Read the data from disk

    2) Tokenize the text

    3) Create a mapping from word to a unique integer

    4) Convert the text into lists of integers

    5) Load the data in whatever format your deep learning framework requires

    6) Pad the text so that all the sequences are the same length, so you can process them in batch

### 1) Read the data from disk

In [199]:
import pandas as pd
df_train=pd.read_csv('train/train.csv',header='infer')
dd=df_train.head(100)
dd.to_csv('dd.csv')

### 2) Tokenize the text

In [129]:
df_train['comment_text'][0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [139]:
sent="Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted??## They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"
sent= re.sub(r'[^a-zA-Z0-9!?,\.\']',' ',sent)
sent=re.sub(r'([\?!,\.]+)',lambda m: m.group(1)[0],sent)
sent=re.sub(r'[ ]+',' ',sent)
sent= re.sub(r'n\'t',r' not',sent)
# sent= re.sub('\'',r' ddd',sent)
list(nlp(sent).sents)
# stop_words=nltk.corpus.stopwords.words('english')
# stop_words.remove('not')
# doc= nlp(sent)
# [token.lemma_ for token in nltk.tokenize.word_tokenize()]

[Explanation Why the edits made under my username Hardcore Metallica Fan were reverted?,
 They were not vandalisms, just closure on some GAs after I voted at New York Dolls FAC.,
 And please do not remove the template from the talk page since I'm retired now.89.205.38.27]

In [223]:
"Use any of the spacy or nltk methods in any level"
"here I use nltk"

stop_words=nltk.corpus.stopwords.words('english')
stop_words.remove('not')
def CustomTokenizerSpacy(sentence):
#     sentence= re.sub('n\'t',r' not',sentence)
#     sentence=re.sub(r'[^a-zA-Z0-9!?,\.]',' ',sentence)
#     sentence= re.sub(r'([\?!,\.]+)',lambda m: m.group(1)[0],sentence)
#     sentence=re.sub(r'[ ]+',' ',sentence)
    return [token.lemma_ for token in nlp.tokenizer(sentence)]# if token.lemma_ not in nlp.Defaults.stop_words

def CustomTokenizerNltk(sentence):
    sentence=  re.sub(r'[^a-zA-Z0-9!?,\.\']',' ',sentence)
    sentence= re.sub(r'([\?!,\.]+)',lambda m: m.group(1)[0],sentence)
    sentence=re.sub(r'[ ]+',' ',sentence)
    sentence= re.sub('n\'t',r' not',sentence)
#     stemmer=nltk.stem.WordNetLemmatizer()
    stemmer= nltk.stem.PorterStemmer()
    return [stemmer.stem(token) for token in nltk.tokenize.word_tokenize(sentence) if token not in stop_words]



In [229]:
type(df_train['comment_text'][1])

str

In [230]:
tokens_nltk=CustomTokenizerNltk(df_train['comment_text'][1])
tokens_spacy=CustomTokenizerSpacy(df_train['comment_text'][1])
mismatch=[]
# for i in range(len(tokens_nltk)):
#     if tokens_nltk[i]!=tokens_spacy[i]:
#         mismatch.append((tokens_nltk[i],tokens_spacy[i]))
# print(mismatch)
type(tokens_spacy[0])

str

### 3) Create a mapping from word to a unique integer

In [225]:
from torchtext.data import Field
comment_text= Field(sequential=True,use_vocab=True,
                    lower=True,batch_first=True
                   ,tokenize=CustomTokenizerSpacy
                   ) # we don use use_vocab=True because we are using our own toeknizer
toxic= Field(sequential=False,use_vocab=False)
severe_toxic= Field(sequential=False,use_vocab=False)
obscene= Field(sequential=False,use_vocab=False)
threat= Field(sequential=False,use_vocab=False)
insult= Field(sequential=False,use_vocab=False)
identity_hate= Field(sequential=False,use_vocab=False)


In [226]:
import torchtext
torchtext.__version__

'0.7.0'

In [228]:
from torchtext.data import TabularDataset
train_datafields=[('id',None),('comment_text',comment_text),('toxic',toxic),('severe_toxic',severe_toxic),
                 ('obscene',obscene),('threat',threat),('insult',insult),('identity_hate',identity_hate)]
train_set= TabularDataset(path='train/train.csv',format='csv',fields=train_datafields,skip_header=False)

In [231]:
comment_text.build_vocab(train_set)
len(comment_text.vocab)

210590

In [232]:
comment_text.vocab.stoi['went']

16012

In [222]:
comment_text.vocab.itos[664]

'went'

### Using W2vec also we can have:

In [None]:
comment_text.build_vocab(train_set,)