In [None]:
!pip install py2neo
!pip install wikipedia
!pip install spacy==3.0.3

In [1]:
import wikipedia as wk

In [2]:
wk.search("climate change")

['Climate change',
 'Climate change denial',
 'United Nations Framework Convention on Climate Change',
 'Effects of climate change',
 'Intergovernmental Panel on Climate Change',
 'Climate change mitigation',
 'Climate variability and change',
 'Climate change and agriculture',
 '2021 in climate change',
 'Paris Agreement']

In [None]:
wk.page('Climate change').summary

In [None]:
!pip install Wikipedia-API

In [4]:
import wikipediaapi as wkapi

In [5]:
wiki_climate = wkapi.Wikipedia(
        language='en',
        extract_format=wkapi.ExtractFormat.WIKI
)

In [None]:
# wiki_climate.page('Climate change').text

In [33]:
# Build a {page title: page summary} mapping for every title returned by the
# 'climate change' search (up to 500 results are requested).
my_file = dict()
found_titles = wk.search('climate change', results = 500)
for title in found_titles:
  my_file[title] = wiki_climate.page(title).summary

In [6]:
import json

In [42]:
# Persist the collected title -> summary mapping as JSON in the working directory.
with open("sample.json", mode="w") as sink:
    json.dump(my_file, fp=sink)

In [121]:
"""Opening the json file"""
# Read the summaries back from the same relative path the notebook writes them
# to ("sample.json"), instead of a hard-coded absolute path that only exists
# on one particular machine layout.
with open('sample.json') as json_file:
    my_file = json.load(json_file)
# print(my_file)

# **Spacy Library**

In [127]:
"""Spacy Library"""
import spacy
# `spacy.load` below needs the 'en_core_web_md' model to be installed already;
# uncomment and run the first download line once if it is missing.
# !python -m spacy download en_core_web_md
# !python -m spacy download en
nlp = spacy.load('en_core_web_md')
# Add the 'merge_noun_chunks' component at the end of the pipeline; with it,
# multi-word noun chunks come out as single tokens (e.g. 'this article',
# 'notable event' in the lemmatization cell further down).
nlp.add_pipe('merge_noun_chunks')
# Last expression of the cell: display the names of the pipeline components.
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'ner',
 'attribute_ruler',
 'lemmatizer',
 'merge_noun_chunks']

In [101]:
"""Trying Spacy functions on one sentence"""
sentence = nlp("'This article documents notable events, research findings, effects, and responses related to global warming and climate change during the year 2021")

In [47]:
"""Lemmatization"""
[word.lemma_ for word in sentence]

["'",
 'this article',
 'document',
 'notable event',
 ',',
 'research finding',
 ',',
 'effect',
 ',',
 'and',
 'response',
 'relate',
 'to',
 'global warming',
 'and',
 'climate change',
 'during',
 'the year',
 '2021']

In [48]:
"""Part of Speech (pos)
Tag each word as a noun, pronoun, verb, adjective, etc."""
[word.pos_ for word in sentence]

['PUNCT',
 'NOUN',
 'VERB',
 'NOUN',
 'PUNCT',
 'NOUN',
 'PUNCT',
 'NOUN',
 'PUNCT',
 'CCONJ',
 'NOUN',
 'VERB',
 'ADP',
 'NOUN',
 'CCONJ',
 'NOUN',
 'ADP',
 'NOUN',
 'NUM']

In [49]:
"""Extracting Only Adjectives"""
[word for word in sentence if word.pos_ == 'VERB']

[documents, related]

In [19]:
"""Extracting Verbs"""
# Collect the tokens tagged as verbs once, then report them.
verb_tokens = [token for token in sentence if token.pos_ == 'VERB']
print("Verbs in our sentence %s" % verb_tokens)

# Lemmatized version of the same verbs.
verb_lemmas = [token.lemma_ for token in verb_tokens]
print("Lemmatized Version of these Verbs %s" % verb_lemmas)

Verbs in our sentence [documents, related]
Lemmatized Version of these Verbs ['document', 'relate']


In [21]:
"""Taking care of 's """
def text_cleaner(sentence):
  """Remove possessive ``'s`` suffixes from ``sentence``.

  Only an ``'s`` that directly follows a word character and ends the word is
  removed. A blanket ``str.replace("'s", "")`` would also eat an opening
  quote followed by a word starting with "s" (``'solar'`` -> ``olar'``).

  Args:
    sentence: the text to clean (a ``str``).

  Returns:
    A copy of ``sentence`` with the possessive suffixes stripped.
  """
  import re  # local import: no other cell of the notebook imports `re`

  # (?<=\w) -> the apostrophe must be attached to the end of a word
  # \b      -> the "s" must be the last character of that word
  return re.sub(r"(?<=\w)'s\b", "", sentence)

text_cleaner('conserve Canada\'s renewable resources; conserve and protect Canada\'s water resources')  

'conserve Canada renewable resources; conserve and protect Canada water resources'

In [22]:
"""Importing Stopwords from spacy library"""
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
import string
punctuations = string.punctuation

In [23]:
"""Tokenizer"""
def tokenizer(text):
  """Turn ``text`` into a list of normalized lemmas.

  The text is run through the notebook-level ``nlp`` pipeline. Each token
  contributes its lower-cased, stripped lemma, except tokens whose lemma is
  the "-PRON-" placeholder, which contribute their lower-cased surface form.
  Entries found in ``stopwords`` or in ``punctuations`` are dropped.
  """
  doc = nlp(text)

  lemmas = []
  for token in doc:
    if token.lemma_ == "-PRON-":
      lemmas.append(token.lower_)
    else:
      lemmas.append(token.lemma_.lower().strip())

  kept = []
  for lemma in lemmas:
    # NOTE(review): `punctuations` is assigned from string.punctuation in this
    # notebook, so `in` is a substring test here - confirm that is intended.
    if lemma in stopwords or lemma in punctuations:
      continue
    kept.append(lemma)
  return kept

In [24]:
tokenizer(text_cleaner('conserve Canada\'s renewable resources; conserve and protect Canada\'s water resources')  )

['conserve',
 'canada renewable resource',
 'conserve',
 'protect',
 'canada water resource']

In [50]:
"""Applying cleaning Function on the data we collected"""
cleaned_text = " ".join([text_cleaner(text) for text in my_file.values()])

In [51]:
"""Tokenizing the cleaned data"""
tokenized_text = tokenizer(cleaned_text)

In [52]:
"""Combining Everything"""
def file_clean_tokenize(file_dict):
  """Clean and tokenize every text stored in ``file_dict``.

  Each value is passed through ``text_cleaner``; the cleaned texts are joined
  with single spaces and handed to ``tokenizer`` in one call.

  Args:
    file_dict: mapping whose values are the raw text strings (keys are ignored).

  Returns:
    Whatever ``tokenizer`` returns for the joined, cleaned text.
  """
  # Read from the argument rather than the notebook-level `my_file`, so the
  # function honours whatever mapping the caller passes in.
  cleaned_text = " ".join(text_cleaner(text) for text in file_dict.values())
  return tokenizer(cleaned_text)

In [53]:
def word_map(file_dict):
  """Count how often each token occurs in the texts of ``file_dict``.

  Args:
    file_dict: mapping passed on to ``file_clean_tokenize`` to obtain the
      token list.

  Returns:
    A dict mapping token -> number of occurrences, ordered from most to least
    frequent (ties keep first-seen order). Empty when there are no tokens.
  """
  counts = {}
  for a_word in file_clean_tokenize(file_dict):
    counts[a_word] = counts.get(a_word, 0) + 1

  # Sort once, after counting: sorting inside the loop repeated the work for
  # every token and left the result undefined for an empty token list.
  return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

sorted_word_map = word_map(my_file) 

In [54]:
npl_string = nlp(" ".join(tokenized_text))

# SUBJECTS = [word for word in sentence if word.pos_ == 'ADJ']
# tokenized_text

In [92]:
# [word.dep_ for word in nlp(" ".join(tokenized_text))]
npl_string[0:4]

climate change include both global warming drive human-induce emission greenhouse gas the result large-scale shift weather pattern

In [97]:
from spacy import displacy
displacy.render(npl_string[3:6],style = 'dep', jupyter = True)

In [100]:
print(spacy.explain('appos'))

appositional modifier
None


In [None]:
from spacy import displacy
displacy.render(npl_string[0:200],style = 'ent', jupyter = True)

In [114]:
options = {'compact': True, "bg": 'cornflowerblue', 'color': '#fff',
           'font':'Sans Serif'}
displacy.render(npl_string[0:6],style = 'dep', jupyter = True, options = options)           