# COVID-19 CoronaWhy NLP N-grams (Bigrams & Trigrams)

Memory is a concern for this task so you'll see a few instances of some memory clean ups.

### Let's import all the tools we will need

In [1]:
import pandas as pd
import nltk, re, string, collections
from nltk.util import ngrams # function for making ngrams
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Now we will load the data from one of our CoronaWhy datasets.

In [2]:
df = pd.read_csv('/kaggle/input/coronawhy/dataset_v6.csv')

### Setting this column as text column to make the data easier to process.

In [3]:
df['text'] = df['text'].astype(str)
df['text'] = df['text'].str.lower()

### Filter the data by keywords.  This is recommended because there is a LOT of data to parse through.  In this section we will filter the data by anything that contains the word 'age'.

In [4]:
filter_keywords = ['age']
df = df[df['text'].str.contains('|'.join(filter_keywords))]

### Combining the text to search so we can process it for later.

In [5]:
text_to_search = ' '.join(df["text"])

### Now that we have our text loaded, let's delete the data frame to save some memory

In [6]:
del df

### Removing punctuation since we don't need that for N-grams.

In [7]:
# get rid of punctuation
punctuationNoPeriod = "[" + re.sub("\.","",string.punctuation) + "]"
text_to_search = re.sub(punctuationNoPeriod, "", text_to_search)

### Removing stop words.  We'll use the English stop words from NLTK plus some customized stop words we've been using for COVID-19

In [8]:
# let's remove stop words
# we will use the stop words provided by the NLTK
# we will also add in some customized stop words used in other places for COVID-19

customized_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'fig', 'fig.', 'al.', 'q', 'license',
    'di', 'la', 'il', 'del', 'le', 'della', 'dei', 'delle', 'una', 'da',  'dell',  'non', 'si', 'holder',
    'p', 'h'
]

stop_words = list(stopwords.words('english')) + customized_stop_words
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Let's tokenize the text and remove the stop words (this takes a while depending on the size of the data)

In [9]:
# let's tokenize the words
text_tokens = word_tokenize(text_to_search)
text_to_search = [word for word in text_tokens if not word in stop_words]

### Now we'll start with Bigrams.

In [10]:
# and get a list of all the bigrams
esBigrams = ngrams(text_to_search, 2)

# get the frequency of each bigram in our corpus
esBigramFreq = collections.Counter(esBigrams)

# what are the ten most popular bigrams
esBigramFreq.most_common(25)

[(('public', 'health'), 21300),
 (('table', '1'), 18098),
 (('influenza', 'virus'), 15174),
 (('immune', 'response'), 13972),
 (('amino', 'acid'), 13442),
 (('respiratory', 'tract'), 13011),
 (('1', '2'), 12991),
 (('infectious', 'diseases'), 12057),
 (('acute', 'respiratory'), 11539),
 (('e', 'coli'), 11220),
 (('2', '3'), 10674),
 (('epithelial', 'cells'), 10194),
 (('immune', 'responses'), 10138),
 (('table', '2'), 10041),
 (('respiratory', 'syndrome'), 9953),
 (('viral', 'rna'), 9947),
 (('united', 'states'), 9864),
 (('clinical', 'signs'), 9671),
 (('may', 'also'), 9527),
 (('health', 'care'), 9134),
 (('gene', 'expression'), 9094),
 (('viral', 'infection'), 8639),
 (('virus', 'infection'), 8603),
 (('infected', 'cells'), 8601),
 (('3', '4'), 8452)]

### Now let's look at Trigrams.

In [11]:
# and get a list of all the trigrams
esTrigrams = ngrams(text_to_search, 3)

# get the frequency of each trigram in our corpus
esTrigramFreq = collections.Counter(esTrigrams)

# what are the ten most popular trigrams
esTrigramFreq.most_common(25)

[(('severe', 'acute', 'respiratory'), 7199),
 (('acute', 'respiratory', 'syndrome'), 6499),
 (('respiratory', 'syncytial', 'virus'), 4272),
 (('1', '2', '3'), 3745),
 (('lower', 'respiratory', 'tract'), 3599),
 (('according', 'manufacturers', 'instructions'), 3577),
 (('respiratory', 'syndrome', 'sars'), 3366),
 (('polymerase', 'chain', 'reaction'), 3107),
 (('2', '3', '4'), 3027),
 (('three', 'independent', 'experiments'), 2918),
 (('respiratory', 'tract', 'infections'), 2864),
 (('world', 'health', 'organization'), 2816),
 (('central', 'nervous', 'system'), 2764),
 (('upper', 'respiratory', 'tract'), 2747),
 (('3', '4', '5'), 2654),
 (('western', 'blot', 'analysis'), 2630),
 (('5', '6', '7'), 2560),
 (('6', '7', '8'), 2553),
 (('4', '5', '6'), 2534),
 (('respiratory', 'syndrome', 'coronavirus'), 2478),
 (('7', '8', '9'), 2437),
 (('8', '9', '10'), 2349),
 (('9', '10', '11'), 2332),
 (('11', '12', '13'), 2311),
 (('10', '11', '12'), 2293)]

### Cleaning up some RAM here since we don't have unlimited memory with Kaggle

In [12]:
del esBigrams
del esBigramFreq
del esTrigrams
del esTrigramFreq

### Now we will look for Bigrams and Trigrams with specific words.

In [13]:
search_for_word = 'age' # Text we want the Bi/Trigrams to contain

# reset the Bigrams
esBigrams = ngrams(text_to_search, 2)
esBigramFreq = collections.Counter(esBigrams)

### Now let's show the Bigrams containing our search word

In [14]:
for gram, freq in esBigramFreq.most_common():
    if gram[0] == search_for_word or gram[1] == search_for_word:
        print(gram, freq)

('years', 'age') 8425
('age', 'group') 4675
('age', 'groups') 4662
('months', 'age') 4036
('weeks', 'age') 3656
('mean', 'age') 3132
('median', 'age') 2898
('age', 'sex') 2446
('days', 'age') 2267
('age', 'gender') 1500
('year', 'age') 1246
('children', 'age') 1087
('older', 'age') 1021
('age', 'distribution') 957
('gestational', 'age') 946
('sex', 'age') 927
('different', 'age') 906
('average', 'age') 843
('gender', 'age') 812
('patients', 'age') 762
('age', 'range') 749
('age', 'patients') 747
('age', '5') 705
('younger', 'age') 691
('age', '2') 669
('old', 'age') 627
('age', '6') 563
('age', '3') 559
('young', 'age') 557
('increasing', 'age') 552
('age', 'older') 546
('age', '1') 512
('according', 'age') 503
('patient', 'age') 443
('age', 'onset') 427
('age', '65') 413
('advanced', 'age') 408
('age', 'children') 396
('including', 'age') 395
('age', 'time') 374
('month', 'age') 365
('age', '18') 358
('adjusted', 'age') 356
('age', 'first') 353
('increased', 'age') 344
('age', '4') 31

### Clean up memory again

In [15]:
del esBigrams
del esBigramFreq

### Now let's show the Trigrams containing our search word

In [16]:
# reset the Trigrams
esTrigrams = ngrams(text_to_search, 3)
esTrigramFreq = collections.Counter(esTrigrams)

In [17]:
for gram, freq in esTrigramFreq.most_common():
    if gram[0] == search_for_word or gram[1] == search_for_word or gram[2] == search_for_word:
        print(gram, freq)

('5', 'years', 'age') 1381
('2', 'years', 'age') 994
('6', 'months', 'age') 834
('1', 'year', 'age') 812
('different', 'age', 'groups') 705
('3', 'years', 'age') 490
('18', 'years', 'age') 395
('3', 'weeks', 'age') 393
('3', 'months', 'age') 390
('12', 'months', 'age') 387
('65', 'years', 'age') 364
('6', 'weeks', 'age') 355
('age', '5', 'years') 324
('2', 'weeks', 'age') 304
('4', 'weeks', 'age') 283
('8', 'weeks', 'age') 278
('2', 'months', 'age') 273
('years', 'age', 'older') 260
('6', 'years', 'age') 260
('15', 'years', 'age') 260
('10', 'years', 'age') 253
('age', '2', 'years') 246
('12', 'weeks', 'age') 246
('4', 'years', 'age') 245
('across', 'age', 'groups') 225
('12', 'years', 'age') 224
('24', 'months', 'age') 214
('60', 'years', 'age') 212
('4', 'months', 'age') 208
('children', 'age', '5') 197
('age', '65', 'years') 194
('1', 'month', 'age') 190
('patients', 'mean', 'age') 183
('older', 'age', 'groups') 183
('16', 'weeks', 'age') 182
('median', 'age', 'patients') 181
('10',