In [1]:
import re
import pandas as pd

In [2]:
# Determines whether two words appear within n words of each other
def near(string, word1, word2, dist):
  """Report whether two terms occur close together in ``string``, in either order.

  Matching is case-insensitive and anchored at word boundaries, so a term
  such as ``plan`` does not match inside "planet" and ``Paris`` does not
  match inside "comparison". Use ``\\w*`` in a term to allow suffixes.

  Args:
    string: Text to search.
    word1: Regular-expression fragment for one term. It may be a multi-word
      phrase or an alternation; it is wrapped in a group here, so the caller
      does not have to parenthesise it.
    word2: Regular-expression fragment for the other term (same rules).
    dist: Window size in words, counting both terms: 2 means the terms are
      adjacent, 3 allows one word in between, and so on. Values below 2 are
      treated as 2.

  Returns:
    True if the terms are found within the window in either order,
    otherwise False.
  """
  # Clamp the gap: dist < 2 would otherwise produce a malformed "{0,-1}"
  # quantifier, which `re` reads as literal text and therefore never matches.
  max_gap_words = max(dist - 2, 0)

  # Words may be separated by any run of whitespace, commas or periods. The
  # same separator is used after the first term and after every gap word, so
  # "climate  change" (double space) and "climate, policy" are both found.
  separator = r'[\s,.]+'
  gap = rf'{separator}(?:\w+{separator}){{0,{max_gap_words}}}'

  for first, second in ((word1, word2), (word2, word1)):
    # (?:...) keeps a bare alternation like "fund|funds" from splitting the
    # whole pattern; \b stops terms from matching inside longer words.
    pattern = rf'\b(?:{first})\b{gap}\b(?:{second})\b'
    if re.search(pattern, string, re.IGNORECASE) is not None:
      return True
  return False

In [3]:
# Step 1 terms: any one of these phrases selects a text on its own.
_SINGLE_TERMS = (
  r"climate action\w*", r"climate effect\w*", r"climate model\w*", r"climate variability",
  r"climate variation\w*", r"climate-driven", r"climatology", r"eco-innovation\w*",
  r"environmental change\w*", r"environmental impact", r"global climate", r"global warming",
  r"greenhouse effect\w*", r"green-house effect\w*", r"greenhouse gas\w*", r"green-house gas\w*",
  r"UNFCC", r"United Nations Framework Convention for Climate Change",
)

# Step 2 terms: "climat*" must appear near one of the companion terms.
_CLIMATE_TERM = r"climat\w*"
_CLIMATE_COMPANION_TERMS = (
  r"(polic\w*|strateg\w*|plan|plans|planning)",
  r"(information|awareness|educat\w*|teach\w*|learn\w*)",
  r"(fund|funds|funding|money|dollar\w*|commitment|capitali\w*)",
  r"anthropogenic\w*", r"action\w*", r"adapt\w*", r"biodiversity",
  r"carbon\w*", r"change\w*", r"crisis", r"deforestati\w*", r"desertificati\w*",
  r"ecolog\w*", r"environment\w*", r"GHG", r"global change", r"greenhouse gas\w*",
  r"hazard\w*", r"reforestati\w*", r"variabilit\w*", r"warming", r"water stress",
)


def _contains(pattern, text):
  """Return True if ``pattern`` occurs anywhere in ``text``, ignoring case."""
  return re.search(pattern, text, re.IGNORECASE) is not None


def _matches_climate_queries(text):
  """Run the five query steps against ``text``; non-string input never matches."""
  # Missing values read by pandas arrive as float NaN (or None); `re` would
  # raise TypeError on them, so treat anything that is not text as "no match".
  if not isinstance(text, str):
    return False

  # Step 1: single terms
  if any(_contains(term, text) for term in _SINGLE_TERMS):
    return True

  # Step 2: "climate" + nearby terms
  if any(near(text, _CLIMATE_TERM, term, 3) for term in _CLIMATE_COMPANION_TERMS):
    return True

  # Step 3: nearby terms (other than "climate")
  if near(text, r"reduc\w*", r"disaster\w*", 3) and near(text, r"disaster\w*", r"risk\w*", 3):
    return True
  if near(text, r"resilien\w*", r"(climat\w*|natural disaster\w*)", 3):
    return True

  # Step 4: AND statements
  # The cheap substring test goes first so `near` only runs when it can matter.
  mentions_climate = _contains("climate", text)
  if mentions_climate and near(text, "Paris", "(agreement|COP21)", 3):
    return True
  if mentions_climate and near(text, "Kyoto", "protocol", 3):
    return True

  if _contains(r"sea level\w*", text) and (_contains(r"chang\w*", text) or _contains("rising", text)):
    return True

  # Step 5: More complex statements
  if near(text, r"disaster\w*", r"(missing person\w*|human loss\w*)", 3):
    return True
  if near(text, r"disaster\w*", "number", 3) and near(text, "number", r"(death\w*|people)", 3):
    return True

  if mentions_climate and near(
      text,
      r"(assist\w*|support\w*|aid|program\w*|development\w*|capacity\w*)",
      r"(develop\w* countr\w*|least developed countr\w*|small island\w*)",
      3):
    return True

  return False


def apply_queries(str):
  """Decide whether a text matches the climate-related search queries.

  The text is selected as soon as any of the five query steps matches:
  stand-alone phrases, "climat*" near a companion term, disaster/resilience
  proximity pairs, AND-combinations (Paris agreement, Kyoto protocol, sea
  level), and the more complex disaster and development-aid combinations.
  All matching is case-insensitive.

  Args:
    str: The text to test. Values that are not strings (for example the NaN
      that pandas uses for a missing cell, or None) are treated as not
      matching instead of raising.

  Returns:
    True if any query matches, otherwise False.
  """
  # The parameter name shadows the builtin `str`; it is kept so existing
  # callers are unaffected, and the work is delegated to a helper where the
  # builtin is still available for the type check.
  return _matches_climate_queries(str)



In [6]:
# Read in dataset (every column is loaded as text)
df = pd.read_csv('uk_gtr_2015_2019.csv', dtype=str)

# Shuffle dataset. The seed is fixed so that re-running the notebook writes
# the rows in the same order every time.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Apply search queries once and reuse the result for both outputs.
# Missing abstracts are NaN, which the regex functions cannot search, so they
# are tested as empty text (and therefore land in the not-selected file).
# The 'abstract' column itself is left unchanged in the written files.
is_selected = df['abstract'].fillna('').apply(apply_queries).astype(bool)

selected = df[is_selected]
selected.to_csv('selected_projects.csv')

not_selected = df[~is_selected]
not_selected.to_csv('not_selected_projects.csv')
