In [66]:
import pandas as pd

In [67]:
from bs4 import BeautifulSoup

In [68]:
import requests
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
import nltk
from nltk.stem import PorterStemmer

## method

1. Lemmatize the texts.
2. Run TF-IDF on the targets to get the word weights w.
3. Keep only the words with w > threshold.
4. Convert the words of the texts to vectors (word to vect).
5. For every snap word, compute sum_i (sin(vect_snap, vect_target_i) * w_i) to get its match score.
6. Keep only the match scores > threshold.
7. For each target, get probabilities from the sum of the match scores of a snap.
8. Display the matching scores.

In [69]:
# Scraping configuration: URL template of a goal page and the goal numbers
# that are of interest here.
url = "https://sdgs.un.org/goals/goal{}"
goals = [12, 15, 16]
# Sanity check: show the address built for the first goal.
print(url.format(goals[0]))

https://sdgs.un.org/goals/goal12


In [70]:
# Download the page of every goal once and keep a copy on disk; the cells
# further down read these local files instead of the website.
for goal in goals:
    path = Path(r"E:\users\ppx2\perso\contests\sustainable-targets\data\raw\\" +
                f"goal{goal}.html")
    # `exists` is a method: without the call parentheses the bound method is
    # always truthy, so the page was never written to disk.
    if path.exists():
        continue  # already cached, no need to query the site again
    r = requests.get(url.format(goal))
    if r.ok:
        # Make sure the target folder is there before writing the raw bytes.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(r.content)
    else:
        # Report the failure instead of skipping the goal silently.
        print(f"goal {goal}: download failed with HTTP status {r.status_code}")

In [71]:
# Load the locally stored page of goal 12 into a BeautifulSoup tree.
with Path(r"E:\users\ppx2\perso\contests\sustainable-targets\data\raw\\" +
          f"goal{12}.html").open("rb") as fp:
    soup = BeautifulSoup(fp, 'html.parser')



In [72]:
def parse_html(soup: BeautifulSoup) -> dict[str, list]:
    """Collect the targets listed on one goal page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a goal page. It must contain a ``div`` whose id is
        ``targets_and_indicators``.

    Returns
    -------
    dict[str, list]
        Three parallel lists with one entry per card found on the page:

        * ``'title'``: ``.string`` of the first ``div`` with class ``field``;
        * ``'description'``: ``.string`` of the first ``p`` of the card;
        * ``'indicators'``: a list with the ``.string`` of the
          ``field--name-description`` div of every indicator of the card.
    """
    goals_data = {'title': [], 'description': [], 'indicators': []}
    targets = soup.find('div', {'id': "targets_and_indicators"})
    # The element holding the cards is reached by stepping from the first
    # child of the anchor div through five following siblings.
    cards = targets.next_element.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling

    for card in cards:
        if card == '\n':
            continue  # bare newline between two cards, nothing to extract
        goals_data['title'].append(card.find('div', {'class': 'field'}).string)
        goals_data['description'].append(card.find('p').string)

        card_indicators = []
        for indicator in card.find('div', {'aria-labelledby': 'headingOne'}):
            if indicator == '\n':
                continue  # bare newline between two indicators
            card_indicators.append(
                indicator.find('div', {'class': 'field--name-description'}).string)
        goals_data['indicators'].append(card_indicators)
    return goals_data


# Show what the parser extracts from the page currently held in `soup`.
parse_html(soup)


{'title': ['12.1',
  '12.2',
  '12.3',
  '12.4',
  '12.5',
  '12.6',
  '12.7',
  '12.8',
  '12.a',
  '12.b',
  '12.c'],
 'description': ['Implement the 10-year framework of programmes on sustainable consumption and production, all countries taking action, with developed countries taking the lead, taking into account the development and capabilities of developing countries',
  'By 2030, achieve the sustainable management and efficient use of natural resources',
  'By 2030, halve per capita global food waste at the retail and consumer levels and reduce food losses along production and supply chains, including post-harvest losses',
  'By 2020, achieve the environmentally sound management of chemicals and all wastes throughout their life cycle, in accordance with agreed international frameworks, and significantly reduce their release to air, water and soil in order to minimize their adverse impacts on human health and the environment',
  'By 2030, substantially reduce waste generation thro

In [73]:
# Parse the locally stored page of every goal and stack the targets of all
# goals in a single table.
frames = []
for goal in goals:
    with open(str(Path(r"E:\users\ppx2\perso\contests\sustainable-targets\data\raw\\" +
                       f"goal{goal}.html")), "rb") as fp:
        soup = BeautifulSoup(fp, 'html.parser')
    frames.append(pd.DataFrame(parse_html(soup)))
# Every per-goal frame is numbered from 0, so a plain concat repeats the same
# row labels for each goal; ignore_index gives each target a unique label.
goals_df = pd.concat(frames, ignore_index=True)
goals_df

Unnamed: 0,title,description,indicators
0,12.1,Implement the 10-year framework of programmes ...,"[Number of countries developing, adopting or i..."
1,12.2,"By 2030, achieve the sustainable management an...","[Material footprint, material footprint per ca..."
2,12.3,"By 2030, halve per capita global food waste at...",[(a) Food loss index and (b) food waste index]
3,12.4,"By 2020, achieve the environmentally sound man...",[Number of parties to international multilater...
4,12.5,"By 2030, substantially reduce waste generation...","[National recycling rate, tons of material rec..."
5,12.6,"Encourage companies, especially large and tran...",[Number of companies publishing sustainability...
6,12.7,Promote public procurement practices that are ...,[Degree of sustainable public procurement poli...
7,12.8,"By 2030, ensure that people everywhere have th...",[Extent to which (i) global citizenship educat...
8,12.a,Support developing countries to strengthen the...,[Installed renewable energy-generating capacit...
9,12.b,Develop and implement tools to monitor sustain...,[Implementation of standard accounting tools t...


In [74]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from collections import defaultdict
# WordNet part of speech to use for a tag, keyed by the first letter of the
# tag; any letter that is not listed falls back to noun.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)

In [99]:

# Helper objects shared by the text-normalisation code of this cell.
ps = PorterStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()
# Word retained for each Porter stem; starts empty and is filled in by
# lemmatize_text as texts are processed.
stems_word = {}

def lemmatize_text(text: str) -> list[str]:
    """Tokenize ``text`` and return one normalised lemma per token.

    Every token is lemmatized with WordNet, using the part of speech that
    ``tag_map`` associates with the first letter of the token's tag. Lemmas
    that share the same Porter stem are then replaced by a single word: the
    first lemma met with that stem, which is remembered in the module-level
    ``stems_word`` dict.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    list[str]
        The normalised lemmas, in token order (punctuation tokens included).

    Notes
    -----
    ``stems_word`` is updated as a side effect, so the word chosen for a stem
    depends on the texts processed before this call.
    """
    updated_lemmas = []
    for token, tag in pos_tag(word_tokenize(text)):
        # Reuse the module-level lemmatizer rather than building one per call.
        lemma = lemmatizer.lemmatize(token, tag_map[tag[0]])
        # setdefault stores the lemma when its stem is new, otherwise it
        # hands back the word already registered for that stem.
        updated_lemmas.append(stems_word.setdefault(ps.stem(lemma), lemma))
    return updated_lemmas


# Store the lemmatized form of every target description in a new column.
goals_df = goals_df.assign(
    description_lemmatized=goals_df['description'].apply(lemmatize_text)
)
goals_df.head()


Unnamed: 0,title,description,indicators,description_lemmatized
0,12.1,Implement the 10-year framework of programmes ...,"[Number of countries developing, adopting or i...","[Implement, the, 10-year, framework, of, progr..."
1,12.2,"By 2030, achieve the sustainable management an...","[Material footprint, material footprint per ca...","[By, 2030, ,, achieve, the, sustainable, manag..."
2,12.3,"By 2030, halve per capita global food waste at...",[(a) Food loss index and (b) food waste index],"[By, 2030, ,, halve, per, caput, global, food,..."
3,12.4,"By 2020, achieve the environmentally sound man...",[Number of parties to international multilater...,"[By, 2020, ,, achieve, the, environmentally, s..."
4,12.5,"By 2030, substantially reduce waste generation...","[National recycling rate, tons of material rec...","[By, 2030, ,, substantially, reduce, waste, ge..."


In [100]:
# Glue the lemmas of each description back into one space-separated string.
sentences = goals_df['description_lemmatized'].str.join(' ')
sentences.head()


0    Implement the 10-year framework of programme o...
1    By 2030 , achieve the sustainable management a...
2    By 2030 , halve per caput global food waste at...
3    By 2020 , achieve the environmentally sound ma...
4    By 2030 , substantially reduce waste generatio...
Name: description_lemmatized, dtype: object

In [101]:
# Quick look at what the Porter stemmer returns for a derived word.
ps.stem('wasteful')

'wast'

In [104]:

# Fit TF-IDF on the lemmatized target descriptions: single words only, at
# most 200 terms, English stop words removed.
# "english" selects scikit-learn's built-in stop-word list; passing the
# ENGLISH_STOP_WORDS frozenset itself is rejected by the parameter validation
# of recent scikit-learn releases.
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=200,
                        stop_words="english").fit(sentences)
tfidf_tweet = tfidf.transform(sentences)

# get_feature_names() is gone from recent scikit-learn releases; use its
# replacement when available and fall back to the old name otherwise.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# One row per description, one column per retained term.
result = pd.DataFrame(tfidf_tweet.toarray(), columns=feature_names)
result.head()

Unnamed: 0,10,2020,2030,abuse,access,accordance,account,achieve,action,address,...,type,urgent,use,utilization,value,violence,waste,water,wetland,wildlife
0,0.269971,0.0,0.0,0.0,0.0,0.0,0.206386,0.0,0.221871,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.279863,0.0,0.0,0.0,0.0,0.374968,0.0,0.0,...,0.0,0.0,0.374968,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.152002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.189443,0.0,0.0,0.0
3,0.0,0.169256,0.0,0.0,0.0,0.190848,0.0,0.205167,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.190848,0.223628,0.0,0.0
4,0.0,0.0,0.26398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.329003,0.0,0.0,0.0


In [103]:
# Number of descriptions held in `sentences`.
len(sentences)

35