In [None]:
# !pip3 install -r requirements.txt

In [2]:
from ngram_matcher import NgramMatcher

import numpy as np
import json

## Loading the Field of Study Data


> fos_ids
```
array(['75678561', '2910500712', '2778057007', ..., '2781123569',
       '3017735110', '134605974'], dtype=object)
```

> fos_names
```
array(['0 10 v lighting control', '0 degree elevation', '0 finance', ...,
       'ω logic', 'ϵ caprolactone', '−2 3 7 pretzel knot'], dtype=object)
```

With `fos_names`, we instantiate an `NgramMatcher` with a minimum n-gram window size of 1 and a maximum of 4. It is used later to find the field-of-study terms that occur in a given text.

In [3]:
# fos_ids and fos_names are indexed together: position i in one array belongs to
# position i in the other.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can run
# arbitrary code -- only load .npy files that come from a trusted source.
fos_ids, fos_names = (
    np.load(npy_path, allow_pickle=True)
    for npy_path in ('fos_ids.npy', 'fos_names.npy')
)

# Matcher built over the field-of-study names, configured with lowercase=True,
# a word-character token pattern and n-gram sizes from 1 to 4.
ngram_matcher = NgramMatcher(
    fos_names,
    ngram_size=(1, 4),
    token_pattern=r'(?u)\b\w+\b',
    lowercase=True,
)

## Extracting Field of study from text

For a given text, the `NgramMatcher` matches all of the text's n-grams against the fields of study it was loaded with and reports how often each field of study occurs. These frequencies are later used to calculate the relevance score.

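The matches are then post-processed manually by comparing every matched field of study (FOS) with the others: a name that is contained in a longer matched name has the longer name's frequency subtracted, and whatever frequency it has left is merged into the longer name.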

>Example :
```
{'162144332': 1,
 '166052673': 1,
 '3018052625': 1,
 '44648626': 1,
 '2987947686': 1,
 '139496715': 7,
 '2991745707': 2,
 '124952713': 1,
 '37037264': 1,
 '107107730': 1,
 '199310239': 1,
 '131390527': 1,
 '192979151': 6}
 ```

In [None]:
def extract_fos(text):
    """Return the fields of study matched in ``text`` with adjusted frequencies.

    The module-level ``ngram_matcher`` supplies the indices of the matched
    fields of study (into ``fos_ids`` / ``fos_names``) and a frequency for
    each. The matches are ordered longest name first and then adjusted in
    two passes:

    1. Descoring: each name has the frequency of every earlier (longer or
       equally long) name that contains it subtracted. Names whose
       frequency is no longer positive are discarded.
    2. Submerging: each surviving name absorbs the descored frequency of
       every later name it contains; absorbed names are removed from the
       result.

    Containment is a plain substring test (``in``) on the names.

    Args:
        text: The text to scan for fields of study.

    Returns:
        dict mapping field-of-study id to its adjusted frequency.
    """
    match_idxs, match_counts = ngram_matcher.match([text])[0]

    # Longest names first. list.sort is stable, so names of equal length keep
    # the order in which the matcher reported them.
    candidates = list(zip(fos_ids[match_idxs], fos_names[match_idxs], match_counts))
    candidates.sort(key=lambda entry: len(entry[1]), reverse=True)

    # Pass 1 -- descoring.
    survivors = []
    for position, (fos_id, name, count) in enumerate(candidates):
        remaining = count
        for earlier in candidates[:position]:
            if name in earlier[1]:
                remaining -= earlier[2]

        if remaining > 0:
            survivors.append([fos_id, name, remaining])

    # Pass 2 -- submerging. The amount added is always the later entry's
    # descored frequency, never a total it accumulated itself.
    absorbed_ids = set()
    merged = []
    for position, (fos_id, name, count) in enumerate(survivors):
        total = count
        for later_id, later_name, later_count in survivors[position + 1:]:
            if later_name in name:
                total += later_count
                absorbed_ids.add(later_id)

        merged.append((fos_id, total))

    return {fos_id: total for fos_id, total in merged if fos_id not in absorbed_ids}

## Loading the SDG Mapping Data

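Given the FOS ids and their frequencies, we have to assign them to the relevant Sustainable Development Goals (SDGs). For this we use OSDG's data: `OSDG-mapping.json` lists the FOS ids that belong to each SDG, and `OSDG-fosmap.json` is used to look up the label of each FOS id.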

In [None]:
# SDG -> set of field-of-study ids, kept as a list of (sdg, ids) pairs in file order.
# The files are read explicitly as UTF-8 (the encoding JSON is exchanged in) so that
# loading does not depend on the platform's default text encoding.
with open('OSDG-mapping.json', 'r', encoding='utf-8') as file_:
    mapping = [(sdg, set(sdg_fos_ids)) for sdg, sdg_fos_ids in json.load(file_).items()]

# Lookup table indexed by field-of-study id; tag_sdg uses it to fill the 'fos' list.
with open('OSDG-fosmap.json', 'r', encoding='utf-8') as file_:
    fosmap = json.load(file_)

## Mapping the Labels

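Now that we have the FOS ids and their frequencies found in the text, we iterate over every SDG and score it. With `use_frequency = True`, the score of an SDG is the sum of the frequencies of the matched FOS that belong to it; otherwise it is the number of such FOS. An SDG is reported only if at least `n_min_relevant_fos` of its FOS were matched. The higher the score, the more relevant the text is to that SDG.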

In [4]:
# Scoring options, read by tag_sdg each time it is called:
#   use_frequency      -- True: score is the sum of the matched frequencies;
#                         False: score is the number of matched fields of study.
#   n_min_relevant_fos -- minimum number of matched fields of study an SDG needs
#                         before it is reported.
use_frequency = True
n_min_relevant_fos = 1


def tag_sdg(fos):
    """Score every SDG in ``mapping`` against the matched fields of study.

    Args:
        fos: Mapping of field-of-study id to frequency (as returned by
            ``extract_fos``).

    Returns:
        A list of dicts with the keys ``'sdg'``, ``'relevance'`` (float) and
        ``'fos'`` (the ``fosmap`` entries of the matched ids), sorted by
        descending relevance. SDGs with no matched field of study, or with
        fewer than ``n_min_relevant_fos`` of them, are left out.
    """
    matched_ids = fos.keys()
    tagged = []

    for sdg, sdg_fos_ids in mapping:
        hits = sdg_fos_ids.intersection(matched_ids)

        # Skip SDGs without any hit or with fewer hits than the configured minimum.
        if not hits or len(hits) < n_min_relevant_fos:
            continue

        score = sum(fos.get(fos_id) for fos_id in hits) if use_frequency else len(hits)
        tagged.append({
            'sdg': sdg,
            'relevance': float(score),
            'fos': [fosmap[fos_id] for fos_id in hits],
        })

    # Stable sort: SDGs with equal relevance keep their order from `mapping`.
    tagged.sort(key=lambda entry: entry['relevance'], reverse=True)
    return tagged

## Extracting text from PDF

The PyPDF2 package is used to retrieve the text from the PDF. Every page within the given page range (1-based, inclusive) is converted to text, and the results are concatenated.

In [5]:
import PyPDF2

# Page range to extract, 1-based and inclusive.
start = 52
end = 67
text = ''

# The context manager closes the file handle as soon as the text has been extracted.
# pdf_reader reads from that handle, so it must not be used after this block.
# NOTE(review): PdfFileReader / getPage / extractText are the legacy PyPDF2 names;
# PyPDF2 3.x replaced them with PdfReader / .pages[i] / .extract_text(). Confirm the
# version pinned in requirements.txt before upgrading.
with open('tata_report.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)

    # getPage is 0-based, hence start - 1; the page numbered `end` is still included.
    for i in range(start - 1, end):
        page = pdf_reader.getPage(i)
        text += page.extractText()

In [6]:
# Match fields of study in the text extracted from the PDF, then score each SDG
# against those matches.
fos = extract_fos(text)
sdgs = tag_sdg(fos)

In [7]:
# Display the tagged SDGs; tag_sdg returns them sorted by descending relevance.
sdgs

[{'sdg': 'SDG_12',
  'relevance': 47.0,
  'fos': ['Resource consumption',
   'Waste generation',
   'Circular economy',
   'Waste management',
   'Hazardous waste',
   'Reuse']},
 {'sdg': 'SDG_11',
  'relevance': 34.0,
  'fos': ['Resource efficiency',
   'Water conservation',
   'Capacity building',
   'Water withdrawal',
   'Indigenous',
   'Truck',
   'Yarn',
   'Footprint',
   'Water supply',
   'Communication']},
 {'sdg': 'SDG_3',
  'relevance': 32.0,
  'fos': ['Pandemic',
   'Globe',
   'Public health',
   'Vaccination',
   'Confined space',
   'Virus',
   'Hygiene',
   'Mental health',
   'Community health',
   'Survival rate',
   'Traumatic stress',
   'Malnutrition in children',
   'Helpline']},
 {'sdg': 'SDG_6',
  'relevance': 22.0,
  'fos': ['Rainwater harvesting',
   'Water conservation',
   'Water withdrawal',
   'Hygiene',
   'Effluent',
   'Water supply']},
 {'sdg': 'SDG_7',
  'relevance': 12.0,
  'fos': ['Electricity',
   'Zero carbon',
   'Renewable energy',
   'Energy 