In [14]:
import spacy
import json
import pandas as pd

In [12]:
! python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
nlp = spacy.load("en_core_web_sm")

In [16]:
with open("data.json","r") as data_json:
    data = json.load(data_json)

In [20]:
text=data[0]['text'].lower()

In [24]:
text

'a pandemic (from greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. a widespread endemic disease with a stable number of infected people is not a pandemic. widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nthroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. the most fatal pandemic in recorded history was the black death (also known as the plague), which killed an estimated 75–200 million people in the 14th century. the term was not used yet but was for later pandemics including the 1918 influenza pandemic (spanish flu). current pandemics include covid-19 (sars-cov-2) and hiv/aids.'

In [21]:
lemmas = [t.lemma_ for t in nlp(text)]

In [25]:
lemmes_text=' '.join(lemmas)

In [26]:
lemmes_text

'a pandemic ( from greek πᾶν , pan , " all " and δῆμος , demos , " people " ) be an epidemic of an infectious disease that have spread across a large region , for instance multiple continent or worldwide , affect a substantial number of people . a widespread endemic disease with a stable number of infected people be not a pandemic . widespread endemic disease with a stable number of infected people such as recurrence of seasonal influenza be generally exclude as they occur simultaneously in large region of the globe rather than be spread worldwide . \n throughout human history , there have be a number of pandemic of disease such as smallpox and tuberculosis . the most fatal pandemic in recorded history be the black death ( also know as the plague ) , which kill an estimate 75–200 million people in the 14th century . the term be not use yet but be for later pandemic include the 1918 influenza pandemic ( spanish flu ) . current pandemic include covid-19 ( sars - cov-2 ) and hiv / aids .'

In [27]:
text_without_Stop_Words = [t.text for t in nlp(lemmes_text) if not t.is_stop]

In [31]:
cleaned_text = ' '.join(text_without_Stop_Words)

In [32]:
cleaned_text

'pandemic ( greek πᾶν , pan , " " δῆμος , demos , " people " ) epidemic infectious disease spread large region , instance multiple continent worldwide , affect substantial number people . widespread endemic disease stable number infected people pandemic . widespread endemic disease stable number infected people recurrence seasonal influenza generally exclude occur simultaneously large region globe spread worldwide . \n  human history , number pandemic disease smallpox tuberculosis . fatal pandemic recorded history black death ( know plague ) , kill estimate 75–200 million people 14th century . term use later pandemic include 1918 influenza pandemic ( spanish flu ) . current pandemic include covid-19 ( sars - cov-2 ) hiv / aids .'

In [33]:
tokens_without_punct_spacy = [t.text for t in nlp(cleaned_text) if t.pos_ != 'PUNCT']

In [35]:
f"Spacy based removal: {tokens_without_punct_spacy}"

"Spacy based removal: ['pandemic', 'greek', 'πᾶν', 'pan', 'δῆμος', 'demos', 'people', 'epidemic', 'infectious', 'disease', 'spread', 'large', 'region', 'instance', 'multiple', 'continent', 'worldwide', 'affect', 'substantial', 'number', 'people', 'widespread', 'endemic', 'disease', 'stable', 'number', 'infected', 'people', 'pandemic', 'widespread', 'endemic', 'disease', 'stable', 'number', 'infected', 'people', 'recurrence', 'seasonal', 'influenza', 'generally', 'exclude', 'occur', 'simultaneously', 'large', 'region', 'globe', 'spread', 'worldwide', '\\n  ', 'human', 'history', 'number', 'pandemic', 'disease', 'smallpox', 'tuberculosis', 'fatal', 'pandemic', 'recorded', 'history', 'black', 'death', 'know', 'plague', 'kill', 'estimate', '75–200', 'million', 'people', '14th', 'century', 'term', 'use', 'later', 'pandemic', 'include', '1918', 'influenza', 'pandemic', 'spanish', 'flu', 'current', 'pandemic', 'include', 'covid-19', 'sars', 'cov-2', 'hiv', '/', 'aids']"

In [36]:
!pip install normalise

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting normalise
  Downloading normalise-0.1.8-py3-none-any.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting roman
  Downloading roman-3.3-py2.py3-none-any.whl (3.9 kB)
Installing collected packages: roman, normalise
Successfully installed normalise-0.1.8 roman-3.3
[0m

In [62]:
import string
def remove_punct( doc):
    return [t for t in doc if t.text not in string.punctuation]

def remove_stop_words(doc):
        return [t for t in doc if not t.is_stop]
    
def lemmatize(doc):
        return ' '.join([t.lemma_ for t in doc])
    
def preprocess_text(text):
        doc = nlp(text.lower())
        removed_punct = remove_punct(doc)
        removed_stop_words = remove_stop_words(removed_punct)
        return lemmatize(removed_stop_words)

In [63]:
preprocess_text(data[0]['text'])

'pandemic greek πᾶν pan δῆμος demos people epidemic infectious disease spread large region instance multiple continent worldwide affect substantial number people widespread endemic disease stable number infected people pandemic widespread endemic disease stable number infected people recurrence seasonal influenza generally exclude occur simultaneously large region globe spread worldwide \n human history number pandemic disease smallpox tuberculosis fatal pandemic recorded history black death know plague kill estimate 75–200 million people 14th century term later pandemic include 1918 influenza pandemic spanish flu current pandemic include covid-19 sars cov-2 hiv aids'

In [64]:
import pandas as pd

In [68]:
df=pd.read_json('data.json')

In [69]:
df.head()

Unnamed: 0,title,text,url
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality


In [70]:
df['tokenized_text'] = df['text'].map(preprocess_text)

In [71]:
df.head()

Unnamed: 0,title,text,url,tokenized_text
0,Pandemic,"A pandemic (from Greek πᾶν, pan, ""all"" and δῆμ...",https://en.wikipedia.org/wiki/Pandemic,pandemic greek πᾶν pan δῆμος demos people epid...
1,Epidemiology of HIV/AIDS,"HIV/AIDS, or Human Immunodeficiency Virus, is ...",https://en.wikipedia.org/wiki/Epidemiology_of_...,hiv aids human immunodeficiency virus consider...
2,Antonine Plague,"The Antonine Plague of 165 to 180 AD, also kno...",https://en.wikipedia.org/wiki/Antonine_Plague,antonine plague 165 180 ad know plague galen g...
3,Basic reproduction number,"In epidemiology, the basic reproduction number...",https://en.wikipedia.org/wiki/Basic_reproducti...,epidemiology basic reproduction number basic r...
4,Bills of mortality,Bills of mortality were the weekly mortality s...,https://en.wikipedia.org/wiki/Bills_of_mortality,bill mortality weekly mortality statistic lond...


In [72]:
df.to_json('tokenized.json')