In [1]:
# Text data
import pandas as pd

In [2]:
data = [('we are selling books',1),
          ('we sell books',1),
          ('we are store for books',1),
          ('we are building apps',0),
          ('apps store',0)]
data

[('we are selling books', 1),
 ('we sell books', 1),
 ('we are store for books', 1),
 ('we are building apps', 0),
 ('apps store', 0)]

In [3]:
df = pd.DataFrame(data, columns=['text', 'label'])
df

Unnamed: 0,text,label
0,we are selling books,1
1,we sell books,1
2,we are store for books,1
3,we are building apps,0
4,apps store,0


In [4]:
# Very naive tokenizer that assigns a number to word
token = {}
i=0
for row in df['text'].str.split():
  for w in row:
    print("Token" + str(i) + ":" + w)
    token[i]=w
    i = i+1

Token0:we
Token1:are
Token2:selling
Token3:books
Token4:we
Token5:sell
Token6:books
Token7:we
Token8:are
Token9:store
Token10:for
Token11:books
Token12:we
Token13:are
Token14:building
Token15:apps
Token16:apps
Token17:store


In [5]:
token

{0: 'we',
 1: 'are',
 2: 'selling',
 3: 'books',
 4: 'we',
 5: 'sell',
 6: 'books',
 7: 'we',
 8: 'are',
 9: 'store',
 10: 'for',
 11: 'books',
 12: 'we',
 13: 'are',
 14: 'building',
 15: 'apps',
 16: 'apps',
 17: 'store'}

In [6]:
#[we, are, selling, books] -> [0,1,2,3]

In [7]:
# Build a simple rule that identifies Fraud

In [26]:
data

[('we are selling books', 1),
 ('we sell books', 1),
 ('we are store for books', 1),
 ('we are building apps', 0),
 ('apps store', 0)]

In [8]:
# Smarter way is to use Bag of Words approach
from collections import Counter
C_F = Counter()
C_NF = Counter()
for review,label in data:
  if label == 1:
    C_F.update(review.split())
  else:
    C_NF.update(review.split())

In [9]:
C_F, C_NF

(Counter({'we': 3,
          'are': 2,
          'selling': 1,
          'books': 3,
          'sell': 1,
          'store': 1,
          'for': 1}),
 Counter({'we': 1, 'are': 1, 'building': 1, 'apps': 2, 'store': 1}))

In [27]:
test = 'we sell awesome books'
test.split()

['we', 'sell', 'awesome', 'books']

In [10]:
test = 'we sell awesome books' # Simple Fraud Model -> 3+1+0+3=7, 1+0+0+0=1 => No of Fraud Words > No of Non Fraud Words = Fraud, No of Non Fraud Words >  No of Fraud Words => Non Fraud
F, NF = 0, 0
for i in test.split():
  F  = F + C_F[i]
  NF = NF + C_NF[i]
F, NF

(7, 1)

In [28]:
if F > NF:
  print("This document is fraud")
else:
  print("This document is non fraud")

This document is fraud


In [11]:
# Build a simple rule that identifies Fraud -> Fraud > Non Fraud

# NLTK vs Spacy

In [2]:
text = "We are learning Natural Language Processing today"

In [3]:
# nltk implementation
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

#spaCy Code Initialization:
import spacy
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [4]:
## Tokenize Sentences

In [5]:
#NLTK
nltk_tokenList = word_tokenize(text)
nltk_tokenList

['We', 'are', 'learning', 'Natural', 'Language', 'Processing', 'today']

In [16]:
text_spacy = nlp(text)

In [17]:
#spaCy
token_text = []
for token in text_spacy:
    token_text.append(token.text)
print(token_text)

['We', 'are', 'learning', 'Natural', 'Language', 'Processing', 'today']


In [18]:
# Stemming - Stemming reduces a word to its root or base form by chopping off suffixes

In [19]:
text = "We are studying Natural Language Processing today."

from nltk.stem.snowball import SnowballStemmer
Stemmer = SnowballStemmer(language='english')
for token in text.split(" "):
  print(token,'->',Stemmer.stem(token))

We -> we
are -> are
studying -> studi
Natural -> natur
Language -> languag
Processing -> process
today. -> today.


In [20]:
#spaCy -  Lemmatization reduces a word to its dictionary or base form (lemma) based on its meaning and part of speech
print('Tokenized Text')
print(token_text)
lemma_text = []
for token in text_spacy:
    lemma_text.append(token.lemma_)
print("Tokenize+Lemmatize:")
print(lemma_text)

Tokenized Text
['We', 'are', 'learning', 'Natural', 'Language', 'Processing', 'today']
Tokenize+Lemmatize:
['we', 'be', 'learn', 'Natural', 'Language', 'Processing', 'today']


In [24]:
# Remove Stopwords
#NLTK
text = ['We', 'are', 'learning', 'Natural', 'Language', 'Processing', 'today']
print(text)
filtered_sentence = []
nltk_stop_words = set(stopwords.words("english"))
for w in text:
    if w not in nltk_stop_words:
        filtered_sentence.append(w)

print(filtered_sentence)


['We', 'are', 'learning', 'Natural', 'Language', 'Processing', 'today']
['We', 'learning', 'Natural', 'Language', 'Processing', 'today']


In [25]:
#Spacey
text = ['We', 'are', 'learning', 'Natural', 'Language', 'Processing', 'today']
print(text)
filtered_sentence_spacy =[]
for word in text:
    lexeme = nlp.vocab[word]
    if lexeme.is_stop == False:
        filtered_sentence_spacy.append(word)
print(filtered_sentence_spacy)

['We', 'are', 'learning', 'Natural', 'Language', 'Processing', 'today']
['learning', 'Natural', 'Language', 'Processing', 'today']


In [22]:
# Remove Punctuation - Same in NLTP & Spacy
text = 'We study Natural Language Processing today !!'
print(text)
punctuations=r'[?:!.,;]'
text_clean = ''.join(re.sub(punctuations,'',text))
text_clean.strip()

We study Natural Language Processing today !!


'We study Natural Language Processing today'