In [2]:
import nltk
import re
from nltk.corpus import stopwords

In [3]:
import spacy
from spacy.tokenizer import Tokenizer

In [1]:
import pandas as pd

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Download every NLTK resource this notebook uses, not only 'punkt':
# stopwords.words('english') needs the 'stopwords' corpus and
# WordNetLemmatizer needs 'wordnet'. Without them a fresh environment
# fails with LookupError in the cleaning cells.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\avant\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

## Text Data Cleaning

In [4]:
# Sample sentence for the cleaning steps: mixed case, an email address and a hashtag.
line = 'Reaching out for HELP. Please meet me in LONDON at 6 a.m xyz@abc.com #urgent'

In [5]:
# Extra words to drop on top of NLTK's English stopword list
extra_list = ['let', 'may', 'might', 'must', 'need', 'apologies', 'meet']
# Combined list: NLTK's English stopwords followed by the extras above
stopword = [*stopwords.words('english'), *extra_list]

In [6]:
# Lowercase BEFORE filtering: the stopword entries are lowercase, so comparing
# tokens in their original casing would let capitalised stopwords
# (e.g. "The", "For") slip through the filter.
line = ' '.join(word for word in line.lower().split() if word not in stopword)

In [7]:
def lemmatize_text(text):
    """Split ``text`` on whitespace and return the WordNet lemma of each token.

    Tokens are handed to the lemmatizer exactly as split, so punctuation
    attached to a word (e.g. ``'help.'``) remains part of that token.

    Returns a list with one lemma per whitespace-separated token.
    """
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
    wordnet = nltk.stem.WordNetLemmatizer()
    return list(map(wordnet.lemmatize, tokens))

In [8]:
# Lemmatize the stopword-filtered, lowercased sample sentence.
lemmatize_text(line)

['reaching', 'help.', 'please', 'london', '6', 'a.m', 'xyz@abc.com', '#urgent']

In [9]:
def stem_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token.

    Tokens are stemmed exactly as split, so punctuation attached to a word
    (e.g. ``'help.'``) remains part of that token.

    Returns a list with one stem per whitespace-separated token.
    """
    stemmer = nltk.PorterStemmer()
    stems = []
    for token in nltk.tokenize.WhitespaceTokenizer().tokenize(text):
        stems.append(stemmer.stem(token))
    return stems

In [10]:
# Stem the same sentence for comparison with the lemmatizer output.
stem_porter(line)

['reach', 'help.', 'pleas', 'london', '6', 'a.m', 'xyz@abc.com', '#urgent']

In [11]:
# Patterns are raw strings: '\S' and '\s' inside a normal string literal are
# invalid escape sequences, which Python flags with a warning and plans to
# turn into a syntax error.
line = re.sub(r'\S*@\S*\s?', " ", line)  # remove email addresses
line = re.sub(r'\s+', " ", line)  # collapse whitespace runs (incl. newlines) into one space
line = re.sub(r"'", " ", line)  # remove single quotes
line = re.sub(r'_', " ", line)  # remove underscores
line = re.sub(r'http\S*\s?', " ", line)  # remove links
line = ' '.join(tok for tok in line.split() if '#' not in tok)  # remove hashtags
# Keep only tokens made entirely of word characters (\w); a token with any
# punctuation in it (e.g. 'help.' or 'a.m') is dropped as a whole.
# fullmatch tests each token directly instead of re-scanning the entire
# string with re.findall once per token.
line = ' '.join(tok for tok in line.split() if re.fullmatch(r'\w+', tok))

In [12]:
# Inspect the sentence after the regex clean-up.
line

'reaching please london 6'

In [14]:
# Load spaCy's small English pipeline and run it over the cleaned sentence.
nlp = spacy.load('en_core_web_sm')
tokens_spacy = nlp(line)
# For each token print its text, part-of-speech tag and spaCy's stopword flag.
for token in tokens_spacy:
    print(token.text, ': ', token.pos_, ': ', token.is_stop)

reaching :  VERB :  False
please :  INTJ :  True
london :  PROPN :  False
6 :  NUM :  False


In [15]:
# Print each named entity spaCy found in the parsed sentence with its label.
for ent in tokens_spacy.ents:
    print(ent.text, ': ', ent.label_)

london :  GPE
6 :  CARDINAL


## Tokenization

In [17]:
# Sample sentence with punctuation attached to words, used to compare tokenizers.
str1 = "I am eating pizza and, coke."

In [18]:
# using split(): splits on whitespace only, so punctuation stays attached to words
str1.split()

['I', 'am', 'eating', 'pizza', 'and,', 'coke.']

In [20]:
# using regex: every run of word characters becomes a token, so punctuation is dropped.
# The pattern is a raw string because '\w' in a normal literal is an invalid escape sequence.
re.findall(r'[\w]+', str1)

['I', 'am', 'eating', 'pizza', 'and', 'coke']

In [24]:
# using NLTK

In [23]:
# NLTK - word_tokenize: punctuation marks come back as separate tokens
from nltk import word_tokenize
word_tokenize(str1)

['I', 'am', 'eating', 'pizza', 'and', ',', 'coke', '.']

In [26]:
# NLTK - whitespace tokenizer: splits on whitespace only, so punctuation stays attached
space_tokenizer = nltk.tokenize.WhitespaceTokenizer()
space_tokenizer.tokenize(str1)

['I', 'am', 'eating', 'pizza', 'and,', 'coke.']

In [28]:
# NLTK - regex tokenizer: the pattern keeps only runs of ASCII letters,
# so digits and punctuation never appear in a token
reg_tokenizer = nltk.tokenize.RegexpTokenizer('[A-Za-z]+')
reg_tokenizer.tokenize(str1)

['I', 'am', 'eating', 'pizza', 'and', 'coke']

## Vectorization/Word Embedding

In [22]:
# Toy corpus for the vectorizers below: a list of three short documents
# (despite the name, this is a list of strings, not a single string).
str2 = ['I am going to test Covid',
        'It seems ABC hospital is doing the Covid test',
        'Covaxin is still in WIP phase']

In [24]:
# CountVec - count appearances of each word / n-gram (1 to 3 words) -> Bag of Words
count_vec = CountVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english')
# Fit and transform in one pass instead of fitting once and transforming twice.
smatrix = count_vec.fit_transform(str2)
count = smatrix  # same matrix, kept under its original name as well
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. .tolist() yields plain Python strings.
vectors = count_vec.get_feature_names_out().tolist()
dense = smatrix.todense()
dense_list = dense.tolist()
# One row per document, one column per extracted word / n-gram.
df_countvec = pd.DataFrame(dense_list, columns=vectors)



In [25]:
# Bag-of-words counts: one row per document, one column per n-gram.
df_countvec

Unnamed: 0,abc,abc hospital,abc hospital doing,covaxin,covaxin wip,covaxin wip phase,covid,covid test,doing,doing covid,...,going test,going test covid,hospital,hospital doing,hospital doing covid,phase,test,test covid,wip,wip phase
0,0,0,0,0,0,0,1,0,0,0,...,1,1,0,0,0,0,1,1,0,0
1,1,1,1,0,0,0,1,1,1,1,...,0,0,1,1,1,0,1,0,0,0
2,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1


In [29]:
# TF-IDF over single words and 2-word n-grams, English stopwords removed
tfidf_vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')
# Fit and transform in one pass instead of fitting once and transforming twice.
smatrix = tfidf_vec.fit_transform(str2)
tfidf = smatrix  # same matrix, kept under its original name as well
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. .tolist() yields plain Python strings.
vectors = tfidf_vec.get_feature_names_out().tolist()
dense = smatrix.todense()
dense_list = dense.tolist()
# One row per document, one column per extracted word / n-gram.
df_tfidf = pd.DataFrame(dense_list, columns=vectors)



In [30]:
# `vectors` is reused by both vectorizer cells; here it holds whatever was
# assigned last (the TF-IDF feature names when the cells are run in order).
print(vectors)

['abc', 'abc hospital', 'covaxin', 'covaxin wip', 'covid', 'covid test', 'doing', 'doing covid', 'going', 'going test', 'hospital', 'hospital doing', 'phase', 'test', 'test covid', 'wip', 'wip phase']


In [31]:
# TF-IDF weights: one row per document, one column per n-gram.
df_tfidf

Unnamed: 0,abc,abc hospital,covaxin,covaxin wip,covid,covid test,doing,doing covid,going,going test,hospital,hospital doing,phase,test,test covid,wip,wip phase
0,0.0,0.0,0.0,0.0,0.373022,0.0,0.0,0.0,0.490479,0.490479,0.0,0.0,0.0,0.373022,0.490479,0.0,0.0
1,0.350139,0.350139,0.0,0.0,0.26629,0.350139,0.350139,0.350139,0.0,0.0,0.350139,0.350139,0.0,0.26629,0.0,0.0,0.0
2,0.0,0.0,0.447214,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.447214,0.447214
