## Extracting the class dataset

In [1]:
# Extract the dataset archive into the working directory.
# -o overwrites existing files without prompting, so re-running the cell
# does not stall on unzip's interactive "replace?" question.
!unzip -o /content/drive/MyDrive/Colab\ Notebooks/datasets/ClimateFeverDataset.zip

Archive:  /content/drive/MyDrive/Colab Notebooks/datasets/ClimateFeverDataset.zip
  inflating: climate-fever.csv       
  inflating: climate-fever.json      
  inflating: climate-fever.jsonl     


## Import needed libraries

In [23]:
import pandas as pd
import spacy
from spacy import displacy
from pathlib import Path
from spacy import displacy
import os
# Load spaCy's small English pipeline once; the cells below reuse it as `nlp`
nlp = spacy.load('en_core_web_sm')

## Importing data

In [3]:
# Read the extracted CSV into a DataFrame and preview the first rows
df = pd.read_csv('climate-fever.csv')
df.head()

Unnamed: 0,claim_id,claim,claim_label,evidences/0/evidence_id,evidences/0/evidence_label,evidences/0/article,evidences/0/evidence,evidences/0/entropy,evidences/0/votes/0,evidences/0/votes/1,...,evidences/4/evidence_id,evidences/4/evidence_label,evidences/4/article,evidences/4/evidence,evidences/4/entropy,evidences/4/votes/0,evidences/4/votes/1,evidences/4/votes/2,evidences/4/votes/3,evidences/4/votes/4
0,0,Global warming is driving polar bears toward e...,SUPPORTS,Extinction risk from global warming:170,NOT_ENOUGH_INFO,Extinction risk from global warming,"""Recent Research Shows Human Activity Driving ...",0.693147,SUPPORTS,NOT_ENOUGH_INFO,...,Polar bear:1328,NOT_ENOUGH_INFO,Polar bear,"""Bear hunting caught in global warming debate"".",0.693147,SUPPORTS,NOT_ENOUGH_INFO,,,
1,5,The sun has gone into ‘lockdown’ which could c...,SUPPORTS,Famine:386,SUPPORTS,Famine,The current consensus of the scientific commun...,0.0,SUPPORTS,SUPPORTS,...,Winter:5,NOT_ENOUGH_INFO,Winter,"In many regions, winter is associated with sno...",0.693147,REFUTES,NOT_ENOUGH_INFO,,,
2,6,The polar bear population has been growing.,REFUTES,Polar bear:1332,NOT_ENOUGH_INFO,Polar bear,"""Ask the experts: Are polar bear populations i...",0.693147,NOT_ENOUGH_INFO,REFUTES,...,Polar bear:61,REFUTES,Polar bear,Of the 19 recognized polar bear subpopulations...,0.0,REFUTES,REFUTES,,,
3,9,Ironic' study finds more CO2 has slightly cool...,REFUTES,Atmosphere of Mars:131,NOT_ENOUGH_INFO,Atmosphere of Mars,CO2 in the mesosphere acts as a cooling agent ...,0.693147,NOT_ENOUGH_INFO,SUPPORTS,...,Carbon dioxide:191,NOT_ENOUGH_INFO,Carbon dioxide,"Less energy reaches the upper atmosphere, whic...",0.0,NOT_ENOUGH_INFO,NOT_ENOUGH_INFO,,,
4,10,Human additions of CO2 are in the margin of er...,REFUTES,Carbon dioxide in Earth's atmosphere:140,NOT_ENOUGH_INFO,Carbon dioxide in Earth's atmosphere,While CO 2 absorption and release is always ha...,0.693147,NOT_ENOUGH_INFO,REFUTES,...,Sea:226,REFUTES,Sea,"More recently, anthropogenic activities have s...",0.0,REFUTES,REFUTES,,,


## Preprocessing data

In [62]:
# Tasks I'm performing for preprocessing
def process_claims(claims):
  """Run the spaCy pipeline over each claim and collect per-claim features.

  Args:
    claims: iterable of claim strings (e.g. ``df["claim"].values``).

  Returns:
    A tuple of eight parallel lists with one entry per claim, in this order:
    claim_tokens, claim_tokens_count, claim_wo_stop_words,
    claim_wo_stop_words_count, claim_pos_tags, claim_pos_tags_count,
    claim_entities, claim_entities_count.
    Every token is stored as plain text (``str``), never as a spaCy object.
  """
  claim_tokens = []
  claim_tokens_count = []
  claim_wo_stop_words = []
  claim_wo_stop_words_count = []
  claim_pos_tags = []
  claim_pos_tags_count = []
  claim_entities = []
  claim_entities_count = []

  # nlp.pipe processes the texts as a batched stream instead of one call per claim
  for doc in nlp.pipe(claims):
    # Named Entity Recognition (NER): (entity text, entity label) pairs
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    claim_entities.append(entities)
    claim_entities_count.append(len(entities))

    # Part-of-Speech (POS) tagging: (token text, coarse POS tag) pairs
    pos_tags = [(token.text, token.pos_) for token in doc]
    claim_pos_tags.append(pos_tags)
    claim_pos_tags_count.append(len(pos_tags))

    # Tokenization
    tokens = [token.text for token in doc]
    claim_tokens.append(tokens)
    claim_tokens_count.append(len(tokens))

    # Removing stop words. Keep the text rather than the Token object:
    # Tokens hold a reference to their Doc (keeping it alive) and cannot be
    # pickled, and plain strings match what claim_tokens stores.
    token_wo_stop = [token.text for token in doc if not token.is_stop]
    claim_wo_stop_words.append(token_wo_stop)
    claim_wo_stop_words_count.append(len(token_wo_stop))

  return claim_tokens, claim_tokens_count, claim_wo_stop_words, claim_wo_stop_words_count, claim_pos_tags, claim_pos_tags_count, claim_entities, claim_entities_count

# Preprocess the data set and add new columns to the data set
def process_df(df):
  """Return a copy of ``df`` with claim-derived feature columns added.

  The eight columns produced by ``process_claims`` are inserted directly
  after the "claim" column, in the order listed in ``new_names`` below.
  If ``df`` has no "claim" column, an unmodified copy is returned.
  The input frame itself is never mutated.

  Args:
    df: DataFrame expected to hold the claim text in a "claim" column.

  Returns:
    A new DataFrame containing the original columns plus the new ones.
  """
  tmp_df = df.copy()
  if "claim" not in df.columns:
    return tmp_df

  # Same order as the tuple returned by process_claims
  new_names = (
    "claim_tokens",
    "claim_tokens_count",
    "claim_wo_stop_words",
    "claim_wo_stop_words_count",
    "claim_pos_tags",
    "claim_pos_tags_count",
    "claim_entities",
    "claim_entities_count",
  )
  features = process_claims(df["claim"].values)

  # Insert relative to wherever "claim" sits instead of assuming it is the
  # second column; fixed positions raise IndexError on narrower frames.
  first_loc = list(tmp_df.columns).index("claim") + 1
  for offset, (name, values) in enumerate(zip(new_names, features)):
    tmp_df.insert(first_loc + offset, name, values, allow_duplicates=True)

  return tmp_df


## Process data and show resulting dataframe

In [63]:
# Build the enriched copy of `df` with process_df and preview its last rows
tmp_df = process_df(df)
tmp_df.tail()

Unnamed: 0,claim_id,claim,claim_tokens,claim_tokens_count,claim_wo_stop_words,claim_wo_stop_words_count,claim_pos_tags,claim_pos_tags_count,claim_entities,claim_entities_count,...,evidences/4/evidence_id,evidences/4/evidence_label,evidences/4/article,evidences/4/evidence,evidences/4/entropy,evidences/4/votes/0,evidences/4/votes/1,evidences/4/votes/2,evidences/4/votes/3,evidences/4/votes/4
1530,3125,About 60% of the warming observed from 1970 to...,"[About, 60, %, of, the, warming, observed, fro...",28,"[60, %, warming, observed, 1970, 2000, likely,...",17,"[(About, ADV), (60, NUM), (%, NOUN), (of, ADP)...",28,"[(About 60%, PERCENT), (1970, DATE), (2000, DA...",4,...,Paleocene–Eocene Thermal Maximum:26,NOT_ENOUGH_INFO,Paleocene–Eocene Thermal Maximum,These can be defined as geologically brief (<2...,0.0,NOT_ENOUGH_INFO,NOT_ENOUGH_INFO,NOT_ENOUGH_INFO,,NOT_ENOUGH_INFO
1531,3127,"""Skeptics hope that Postma’s alternative therm...","["", Skeptics, hope, that, Postma, ’s, alternat...",35,"["", Skeptics, hope, Postma, alternative, therm...",23,"[("", PUNCT), (Skeptics, NOUN), (hope, VERB), (...",35,"[(Postma, PERSON)]",1,...,Theoretical physics:22,NOT_ENOUGH_INFO,Theoretical physics,Theoretical advances may consist in setting as...,0.693147,NOT_ENOUGH_INFO,SUPPORTS,,,
1532,3130,"""There are other possible causes for climate c...","["", There, are, other, possible, causes, for, ...",38,"["", possible, causes, climate, change, associa...",19,"[("", PUNCT), (There, PRON), (are, VERB), (othe...",38,"[(Earth, LOC), (about 5,400 degrees Celsius, Q...",2,...,Earth:111,NOT_ENOUGH_INFO,Earth,"At the center, the temperature may be up to 6,...",0.636514,NOT_ENOUGH_INFO,SUPPORTS,NOT_ENOUGH_INFO,,
1533,3131,We don't need a high heat flow - just a high t...,"[We, do, n't, need, a, high, heat, flow, -, ju...",22,"[need, high, heat, flow, -, high, temperature,...",12,"[(We, PRON), (do, AUX), (n't, PART), (need, VE...",22,[],0,...,Volcano:114,NOT_ENOUGH_INFO,Volcano,"Usually, only mafic flows will erupt as pāhoeh...",0.0,,NOT_ENOUGH_INFO,NOT_ENOUGH_INFO,,
1534,3134,"Over the last decade, heatwaves are five times...","[Over, the, last, decade, ,, heatwaves, are, f...",20,"[decade, ,, heatwaves, times, likely, global, ...",8,"[(Over, ADP), (the, DET), (last, ADJ), (decade...",20,"[(the last decade, DATE), (five, CARDINAL)]",2,...,Heat wave:151,SUPPORTS,Heat wave,The effects of climate change have been projec...,0.0,,SUPPORTS,SUPPORTS,,


### Visualization

In [51]:
# ! rm -rf visualResults/ent/
# Setting up output folders.
# -p creates missing parents and does not fail when a folder already exists,
# so this cell can be re-run safely.
! mkdir -p visualResults/ent visualResults/dep

### Create and output visualization (SVGs)

In [None]:
# Render every claim (with stop words removed) with displaCy and save the
# markup to disk: one entity view and one dependency view per claim.
options = {"compact": True, "bg": "#09a3d5","color": "white", "font": "Source Sans Pro"}

ent_dir = Path("visualResults/ent")
dep_dir = Path("visualResults/dep")
# Create the folders here so the cell does not depend on an earlier shell cell
for out_dir in (ent_dir, dep_dir):
  out_dir.mkdir(parents=True, exist_ok=True)

for ind, words in tmp_df["claim_wo_stop_words"].items():
  # Re-join the stop-word-free tokens into text and parse that text again
  listToStr = ' '.join(str(elem) for elem in words)
  doc = nlp(listToStr)

  # produce spacy ent results
  # The entity visualizer emits HTML markup (not SVG), so save it as .html
  html = displacy.render(doc, style='ent', minify=True, jupyter=False)
  output_path = ent_dir / f"claim-{ind}-ent.html"
  # write_text opens, writes and closes the file in one call
  output_path.write_text(html, encoding="utf-8")

  # produce spacy dep results
  svg = displacy.render(doc, style="dep", minify=True, jupyter=False, options=options)
  output_path = dep_dir / f"claim-{ind}-dep.svg"
  output_path.write_text(svg, encoding="utf-8")

In [59]:
# Example: named entity recognition for a single claim
words = tmp_df.loc[1534, "claim"]
doc = nlp(words)

# The entity visualizer returns HTML markup, so the file is saved as .html
# and named "-ent" (the old "-dep.svg" name was a copy-paste slip).
html = displacy.render(doc, style='ent', jupyter=False)
output_path = Path("visualResults/ent/aclaim-solo-test-ent.html")
output_path.parent.mkdir(parents=True, exist_ok=True)
# write_text opens, writes and closes the file in one call
output_path.write_text(html, encoding="utf-8")

# Show the same rendering inline in the notebook
displacy.render(doc, style='ent', jupyter=True)


In [50]:
# Example: dependency parse (with POS tags) for a single claim
options = {"compact": True, "bg": "#09a3d5","color": "white", "font": "Source Sans Pro"}

# Parse the claim in this cell so it does not rely on a `doc` variable
# left behind by whichever cell happened to run last.
doc = nlp(tmp_df.loc[1534, "claim"])
print(doc)

svg = displacy.render(doc, style="dep", minify=True, jupyter=False, options=options)
output_path = Path("visualResults/dep/aclaim-solo-test-dep.svg")
output_path.parent.mkdir(parents=True, exist_ok=True)
# write_text opens, writes and closes the file in one call
output_path.write_text(svg, encoding="utf-8")

# Show the parse inline in the notebook (default styling, no custom options)
displacy.render(doc, style='dep', jupyter=True)

Over the last decade, heatwaves are five times more likely than if there had been no global warming.


In [1]:
# Convert the assignment notebook to PDF with nbconvert.
# NOTE(review): the absolute path below is specific to one machine; adjust it before running elsewhere.
!jupyter nbconvert --to pdf /Users/Steve/dev/aiMasters/NLP/mod2/NLP_mod2_assignment.ipynb

[NbConvertApp] Converting notebook /Users/Steve/dev/aiMasters/NLP/mod2/NLP_mod2_assignment.ipynb to pdf
Your version must be at least (1.12.1) but less than (3.0.0).
Refer to https://pandoc.org/installing.html.
Continuing with doubts...
  check_pandoc_version()
[NbConvertApp] Writing 55724 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 57654 bytes to /Users/Steve/dev/aiMasters/NLP/mod2/NLP_mod2_assignment.pdf
