In [25]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
# Sample paragraph used as the raw input for every preprocessing step below.
# Written as adjacent string literals (joined by the parser into one string)
# so each sentence can be read without horizontal scrolling.
p = (
    "The development of a vaccine to fight against SARS-CoV-2 is exciting news, "
    "but few of us understand what this means, beyond understanding it’s a "
    "possible barrier against COVID-19. "
    "How are vaccines developed and how do they work? "
    "Will all of the COVID vaccines be the same? "
    "How will we know if the vaccine is safe and effective? "
    "How often will we need to be vaccinated? "
    "Will there be enough vaccine for everyone? "
    "Dr. Deborah Fuller, vaccinologist and professor of microbiology at the "
    "University of Washington, will discuss the answers to these questions and more. "
    "Dr. Fuller’s lab has been the site of development of a vaccine candidate "
    "since January 2020."
)

In [27]:
# Display the raw paragraph as this cell's output.
p

'The development of a vaccine to fight against SARS-CoV-2 is exciting news, but few of us understand what this means, beyond understanding it’s a possible barrier against COVID-19. How are vaccines developed and how do they work? Will all of the COVID vaccines be the same? How will we know if the vaccine is safe and effective? How often will we need to be vaccinated? Will there be enough vaccine for everyone? Dr. Deborah Fuller, vaccinologist and professor of microbiology at the University of Washington, will discuss the answers to these questions and more. Dr. Fuller’s lab has been the site of development of a vaccine candidate since January 2020.'

# PREPROCESSING

In [28]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

#### Step 1 - Tokenization: paragraph --> sentences --> words

In [29]:
# Fetch the NLTK data packages this notebook relies on for tokenizing,
# stop-word filtering and lemmatizing.
for resource in ("punkt", "stopwords", "wordnet"):
  nltk.download(resource)
# Kept as the cell's final expression so its return value is still echoed.
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [30]:
# Split the paragraph into a list of sentence strings with NLTK's sentence tokenizer.
s = nltk.sent_tokenize(p)

In [31]:
# Show the sentence list to check where the tokenizer placed the boundaries.
print(s)

['The development of a vaccine to fight against SARS-CoV-2 is exciting news, but few of us understand what this means, beyond understanding it’s a possible barrier against COVID-19.', 'How are vaccines developed and how do they work?', 'Will all of the COVID vaccines be the same?', 'How will we know if the vaccine is safe and effective?', 'How often will we need to be vaccinated?', 'Will there be enough vaccine for everyone?', 'Dr. Deborah Fuller, vaccinologist and professor of microbiology at the University of Washington, will discuss the answers to these questions and more.', 'Dr. Fuller’s lab has been the site of development of a vaccine candidate since January 2020.']


In [32]:
print(len(s)) # 8 sentences in the paragraph p

8


In [33]:
# One shared instance of each normalizer, reused by the cells below.
stemmer = PorterStemmer()
lematizer = WordNetLemmatizer()

**Cleaning The Paragraph using Regex**

In [34]:
## Most Important Code

import re

# Build the stop-word set once. The original rebuilt it from the corpus reader
# for every single word, which is loop-invariant work.
english_stopwords = set(stopwords.words("english"))

# corpus ends up with one cleaned string per sentence in s:
# letters only, lower-cased, stop words removed, remaining words lemmatized.
# NOTE(review): the recorded output contains 'u' (from "us") and 'discus'
# (from "discuss") -- presumably because lemmatize() is called without a
# part-of-speech hint. Left unchanged so downstream outputs stay valid.
corpus = []
for sentence in s:
  # Replace every character that is not an ASCII letter (digits, punctuation,
  # the curly apostrophe, hyphens) with a space.
  letters_only = re.sub('[^a-zA-Z]', ' ', sentence)
  tokens = letters_only.lower().split()
  # Stop words are filtered on the raw token, before lemmatization.
  lemmas = [lematizer.lemmatize(token) for token in tokens if token not in english_stopwords]
  corpus.append(" ".join(lemmas))
    

In [35]:
# Inspect the cleaned sentences: one space-joined string of lemmas per input sentence.
corpus

['development vaccine fight sars cov exciting news u understand mean beyond understanding possible barrier covid',
 'vaccine developed work',
 'covid vaccine',
 'know vaccine safe effective',
 'often need vaccinated',
 'enough vaccine everyone',
 'dr deborah fuller vaccinologist professor microbiology university washington discus answer question',
 'dr fuller lab site development vaccine candidate since january']

In [36]:
# Shallow copy of the cleaned sentence list; the stemming and lemmatization cells iterate over it.
corpus_copy = corpus.copy()

#### Step 2 - Stemming and Lemmatization

In [37]:

# Run one word through both normalizers to contrast their results.
sample_word = "history"
stemmed_form = stemmer.stem(sample_word)
lemma_form = lematizer.lemmatize(sample_word)
print("Stemming: ", stemmed_form)
print("Lemmatizer: ", lemma_form)

Stemming:  histori
Lemmatizer:  history


**Applying Stemming and Lemmatization on corpus**

In [38]:
# Print each cleaned sentence on its own line.
for cleaned_sentence in corpus:
  print(cleaned_sentence)

development vaccine fight sars cov exciting news u understand mean beyond understanding possible barrier covid
vaccine developed work
covid vaccine
know vaccine safe effective
often need vaccinated
enough vaccine everyone
dr deborah fuller vaccinologist professor microbiology university washington discus answer question
dr fuller lab site development vaccine candidate since january


In [39]:
# List NLTK's English stop words, i.e. the tokens filtered out during cleaning.
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

**Applying Stemming**

In [40]:
# Build the stop-word set once instead of re-reading it for every word.
english_stopwords = set(stopwords.words('english'))

# Print the Porter stem of every non-stop-word token in the cleaned sentences.
# The loop variable is deliberately not named `s`: that name holds the sentence
# list from the tokenization step, and overwriting it with a string would break
# the cleaning cell if it were re-run afterwards.
for sentence in corpus_copy:
  for word in nltk.word_tokenize(sentence):
    if word not in english_stopwords:
      print(stemmer.stem(word))

develop
vaccin
fight
sar
cov
excit
news
u
understand
mean
beyond
understand
possibl
barrier
covid
vaccin
develop
work
covid
vaccin
know
vaccin
safe
effect
often
need
vaccin
enough
vaccin
everyon
dr
deborah
fuller
vaccinologist
professor
microbiolog
univers
washington
discu
answer
question
dr
fuller
lab
site
develop
vaccin
candid
sinc
januari


**Applying Lemmatization**

In [41]:
# Build the stop-word set once instead of re-reading it for every word.
english_stopwords = set(stopwords.words("english"))

# Collect (and print) the lemma of every non-stop-word token.
# The original appended the token unchanged, so this "Applying Lemmatization"
# cell never called the lemmatizer, unlike the parallel stemming cell.
# corpus_copy was already lemmatized when corpus was built, so the lemmas are
# expected to match the tokens here -- TODO confirm on a fresh run.
# The loop variable is not named `s`, to avoid overwriting the sentence list.
c = []
for sentence in corpus_copy:
  for word in nltk.word_tokenize(sentence):
    if word not in english_stopwords:
      lemma = lematizer.lemmatize(word)
      c.append(lemma)
      print(lemma)

development
vaccine
fight
sars
cov
exciting
news
u
understand
mean
beyond
understanding
possible
barrier
covid
vaccine
developed
work
covid
vaccine
know
vaccine
safe
effective
often
need
vaccinated
enough
vaccine
everyone
dr
deborah
fuller
vaccinologist
professor
microbiology
university
washington
discus
answer
question
dr
fuller
lab
site
development
vaccine
candidate
since
january


In [42]:
# Spot-check the first token collected in c.
c[0]

'development'

## Step 3: Bag of Words

In [43]:
from sklearn.feature_extraction.text  import CountVectorizer
# binary=True makes every feature a 0/1 presence flag rather than an occurrence count.
countVect = CountVectorizer(binary=True)

In [44]:
# Learn the vocabulary from the cleaned sentences and encode each sentence as one row of x.
x = countVect.fit_transform(corpus)

In [45]:
# Mapping from each learned term to its column index in x.
countVect.vocabulary_

{'answer': 0,
 'barrier': 1,
 'beyond': 2,
 'candidate': 3,
 'cov': 4,
 'covid': 5,
 'deborah': 6,
 'developed': 7,
 'development': 8,
 'discus': 9,
 'dr': 10,
 'effective': 11,
 'enough': 12,
 'everyone': 13,
 'exciting': 14,
 'fight': 15,
 'fuller': 16,
 'january': 17,
 'know': 18,
 'lab': 19,
 'mean': 20,
 'microbiology': 21,
 'need': 22,
 'news': 23,
 'often': 24,
 'possible': 25,
 'professor': 26,
 'question': 27,
 'safe': 28,
 'sars': 29,
 'since': 30,
 'site': 31,
 'understand': 32,
 'understanding': 33,
 'university': 34,
 'vaccinated': 35,
 'vaccine': 36,
 'vaccinologist': 37,
 'washington': 38,
 'work': 39}

In [46]:
# First cleaned sentence, shown for comparison with its encoded row x[0].
corpus[0]

'development vaccine fight sars cov exciting news u understand mean beyond understanding possible barrier covid'

In [47]:
# Convert the first row of x to a dense array to view the 0/1 vector for corpus[0].
x[0].toarray()

array([[0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0]])

## Ngrams with Bag of Words 

In [51]:
# ngram_range=(2,3) extracts both bigrams and trigrams (no single words),
# despite the "_3grams" suffix in the variable names.
countVect_3grams = CountVectorizer( binary=True, ngram_range=(2,3) )
x_3grams = countVect_3grams.fit_transform(corpus)
# Dense view of the first sentence's n-gram presence vector.
x_3grams[0].toarray()

array([[0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
        1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0]])

In [52]:
# Mapping from each learned bigram/trigram to its column index in x_3grams.
countVect_3grams.vocabulary_

{'answer question': 0,
 'barrier covid': 1,
 'beyond understanding': 2,
 'beyond understanding possible': 3,
 'candidate since': 4,
 'candidate since january': 5,
 'cov exciting': 6,
 'cov exciting news': 7,
 'covid vaccine': 8,
 'deborah fuller': 9,
 'deborah fuller vaccinologist': 10,
 'developed work': 11,
 'development vaccine': 12,
 'development vaccine candidate': 13,
 'development vaccine fight': 14,
 'discus answer': 15,
 'discus answer question': 16,
 'dr deborah': 17,
 'dr deborah fuller': 18,
 'dr fuller': 19,
 'dr fuller lab': 20,
 'enough vaccine': 21,
 'enough vaccine everyone': 22,
 'exciting news': 23,
 'exciting news understand': 24,
 'fight sars': 25,
 'fight sars cov': 26,
 'fuller lab': 27,
 'fuller lab site': 28,
 'fuller vaccinologist': 29,
 'fuller vaccinologist professor': 30,
 'know vaccine': 31,
 'know vaccine safe': 32,
 'lab site': 33,
 'lab site development': 34,
 'mean beyond': 35,
 'mean beyond understanding': 36,
 'microbiology university': 37,
 'microbi