# Finding Most Common Words

In [1]:
import spacy
from collections import Counter
# Load spaCy's 'en_core_web_sm' pipeline once; later cells call `nlp` to parse the text.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Read the article text and parse it with the spaCy pipeline.
# The `with` block closes the file handle as soon as the text has been read,
# instead of leaving it open until garbage collection.
# NOTE(review): the file is decoded with the platform default encoding; the text
# contains curly quotes, so consider pinning `encoding=` once the file's actual
# encoding is confirmed.
with open('covid_research.txt') as covid_file:
    doc_covid = nlp(covid_file.read())
doc_covid

Dr Sonya Babu-Narayan, Associate Medical Director at the British Heart Foundation and Honorary Consultant Cardiologist, said: 

“Every day we learn more about Covid-19. Information to date suggests that people with heart disease, or are at risk of heart disease due to factors such as high blood pressure, diabetes or being severely overweight with a body mass index higher than 40, are at an increased risk of complications caused by the virus.

“If you have one of these conditions you should be taking all precautions possible to reduce your chance of catching the virus.

“Viruses can cause significant inflammation which can injure the heart and can worsen a person’s existing heart condition even if the virus does not enter the heart directly.

“Evidence shows that people with higher levels of a protein used to measure heart injury in their blood are more likely to die after contracting Covid-19. 

“However this kind of observational evidence can’t tell us why some people suffer heart dam

### Removing Punctuation and Stop Words

In [5]:
# Collect the surface text of every token tagged NOUN, skipping stop words and
# punctuation. Repeated words are kept so they can be counted later.
# NOTE(review): `token.text` is case-sensitive, so e.g. 'Covid-19' and 'COVID-19'
# end up as distinct entries.
nouns = [
    tok.text
    for tok in doc_covid
    if tok.pos_ == 'NOUN' and not (tok.is_stop or tok.is_punct)
]
nouns

['day',
 'Covid-19',
 'Information',
 'date',
 'people',
 'heart',
 'disease',
 'risk',
 'heart',
 'disease',
 'factors',
 'blood',
 'pressure',
 'diabetes',
 'body',
 'mass',
 'index',
 'risk',
 'complications',
 'virus',
 'conditions',
 'precautions',
 'chance',
 'virus',
 'Viruses',
 'inflammation',
 'heart',
 'person',
 'heart',
 'condition',
 'virus',
 'heart',
 'people',
 'levels',
 'protein',
 'heart',
 'injury',
 'blood',
 'Covid-19',
 'kind',
 'evidence',
 'people',
 'heart',
 'damage',
 'virus',
 'outcomes',
 'coronavirus',
 'heart',
 'lives',
 'severity',
 'speed',
 'impact',
 'quality',
 'evidence',
 'case',
 'series',
 'patients',
 'COVID-19',
 'conclusions',
 'impact',
 'treatments',
 'pathophysiology',
 'disease',
 'mortality',
 'association',
 'injury',
 'infection',
 'mortality',
 'nature',
 'data',
 'link',
 'factors',
 'addition',
 'injury',
 'death',
 'individual',
 'data',
 'sample',
 'patients',
 'hospital',
 'factors',
 'hospitalisation',
 'countries',
 'regions'

In [6]:
# Number of noun tokens kept by the filter above (repeated words are counted each time).
len(nouns)

281

### Finding Most Common Nouns

In [15]:
# Tally how many times each noun string appears in `nouns`.
noun_freq = Counter(nouns)

In [16]:
# Take the 20 highest-count nouns as (word, count) pairs and display them.
common_nouns = noun_freq.most_common(20)
common_nouns

[('heart', 29),
 ('disease', 12),
 ('people', 9),
 ('virus', 8),
 ('studies', 7),
 ('patients', 6),
 ('effects', 6),
 ('risk', 5),
 ('factors', 5),
 ('evidence', 5),
 ('injury', 4),
 ('damage', 4),
 ('infection', 4),
 ('study', 4),
 ('blood', 3),
 ('complications', 3),
 ('inflammation', 3),
 ('coronavirus', 3),
 ('case', 3),
 ('mortality', 3)]

### Most Common Verbs

In [11]:
# Some stop words can be verbs, so the stop-word filter below drops those as well.
# Collect the surface text of every remaining token tagged VERB (punctuation excluded).
# NOTE(review): `token.text` is case-sensitive, so 'Given' and 'given' are kept as
# separate strings and will be counted separately.
verbs = [
    tok.text
    for tok in doc_covid
    if tok.pos_ == 'VERB' and not (tok.is_stop or tok.is_punct)
]
verbs


['said',
 'learn',
 'suggests',
 'increased',
 'caused',
 'taking',
 'reduce',
 'catching',
 'cause',
 'injure',
 'worsen',
 'existing',
 'enter',
 'shows',
 'measure',
 'die',
 'contracting',
 'tell',
 'suffer',
 'caused',
 'lead',
 'continue',
 'research',
 'affects',
 'help',
 'save',
 'improve',
 'said',
 'Given',
 'draw',
 'follows',
 'infected',
 'try',
 'draw',
 'report',
 'Given',
 'identify',
 'identified',
 'depend',
 'associated',
 'depend',
 'reports',
 'involved',
 'search',
 'included',
 'Given',
 'known',
 'found',
 'concerning',
 'helping',
 'direct',
 'affected',
 'said',
 'appears',
 'based',
 'confirm',
 'suggests',
 'makes',
 'focus',
 'given',
 'supporting',
 'supporting',
 'said',
 'comment',
 'tell',
 'indicate',
 'happen',
 'says',
 'happen',
 'throws',
 'examined',
 'hospitalised',
 'hospitalised',
 'tell',
 'including',
 'existing',
 'understood',
 'increase',
 'tie',
 'underlying',
 'hospitalised',
 'said',
 'provide',
 'affect',
 'examine',
 'needed',
 'cons

In [12]:
# Tally how many times each verb string appears in `verbs`.
verb_freq = Counter(verbs)

In [14]:
# Pick the 20 highest-count verbs and print them as a single {verb: count} mapping.
common_verbs = verb_freq.most_common(20)
print({verb: count for verb, count in common_verbs})

{'said': 6, 'existing': 4, 'tell': 3, 'Given': 3, 'happen': 3, 'hospitalised': 3, 'suggests': 2, 'caused': 2, 'help': 2, 'draw': 2, 'depend': 2, 'associated': 2, 'appears': 2, 'given': 2, 'supporting': 2, 'including': 2, 'needed': 2, 'published': 2, 'dealing': 2, 'learn': 1}
