<a href="https://colab.research.google.com/github/shreyawalia/ai-powered-file-management/blob/main/Model%20Creation/ml_sdg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import metrics

# Fetch the NLTK 'stopwords' corpus so nltk.corpus.stopwords can be read later in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Project locations. NOTE(review): the root looks like a Colab Google Drive mount
# point - confirm, and adjust base_dir when running elsewhere.
base_dir = "/content/drive/My Drive/sdg/"
# CSV file holding the labelled examples.
labelled_dataset = f"{base_dir}dataset/sdg_tag.csv"
# Folder with one sub-folder per cross-validation fold.
CROSS_FOLDS = f"{base_dir}dataset/cross_validation/"

In [None]:

# Read the labelled dataset from the CSV configured above.
labelled = pd.read_csv(labelled_dataset)

# Binarizer with a fixed class order: goal_1, goal_2, ..., goal_17.
mlb = MultiLabelBinarizer(classes=tuple(f"goal_{number}" for number in range(1, 18)))

# Boolean mask that is True for rows whose 'Tag' value is not NaN.
mask = labelled['Tag'].notnull()


In [None]:
import nltk
from nltk.stem.snowball import SnowballStemmer
import re

import sys
import warnings

# Silence every warning, but only when no explicit warning options were
# supplied to the interpreter (sys.warnoptions is empty in that case).
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")

In [None]:
def cleanHtml(sentence):
    """Replace every ``<...>`` tag in ``sentence`` with a single space.

    The argument is converted with ``str()`` first, so non-string values are
    processed through their string form. Matching is non-greedy and does not
    span line breaks.
    """
    return re.sub('<.*?>', ' ', str(sentence))


def cleanPunc(sentence):
    """Strip punctuation and special characters from ``sentence``.

    Two passes are made:
      1. the characters ? ! ' " # and | are deleted outright (inside a
         character class the pipe is an ordinary character, so it is removed too);
      2. the characters . , ) ( and / are each replaced by one space.

    The result then has surrounding whitespace stripped and any remaining
    newline turned into a space.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced_out = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")


def keepAlpha(sentence):
    """Keep only ASCII letters in each whitespace-separated word of ``sentence``.

    Every run of non-letter characters inside a word is replaced by one space,
    the processed words are joined with single spaces, and the final string is
    stripped. Because a replaced run and the joining space can sit next to each
    other, the result may contain consecutive spaces.
    """
    letters_only = [re.sub('[^a-z A-Z]+', ' ', token) for token in sentence.split()]
    return " ".join(letters_only).strip()

In [None]:
# Normalise the description text in one pass: lower-case it, then apply the
# three cleaning helpers in order (tags -> punctuation -> letters only).
labelled['description'] = (
    labelled['description']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
)

In [None]:
# Preview the first rows of the frame after the description column has been cleaned.
labelled.head()

Unnamed: 0.1,Unnamed: 0,description,goal_1,goal_2,goal_3,goal_4,goal_5,goal_6,goal_7,goal_8,goal_9,goal_10,goal_11,goal_12,goal_13,goal_14,goal_15,goal_16,goal_17,Tag
0,0,description background in order to achieve th...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,"goal_1,goal_2,goal_3,goal_4,goal_5,goal_6,goal..."
1,1,description achievement of initiative desa wis...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,"goal_8,goal_15"
2,2,description the overall objective of this proj...,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,"goal_13,goal_14"
3,3,description the global goals jam is a two day ...,0,0,1,1,0,0,0,0,0,1,1,0,1,1,1,0,1,"goal_3,goal_4,goal_10,goal_11,goal_13,goal_14,..."
4,4,description achievement of initiative newport ...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,goal_4


In [None]:
# Build the model inputs.
#
# Documents and labels are selected with ONE shared mask. Dropping NaNs from
# the 'description' and 'Tag' columns independently (as was done before) can
# remove different rows from each side and silently misalign x and y.
usable_rows = mask & labelled['description'].notnull()

# Cleaned description texts, one per usable row.
data_x = labelled.loc[usable_rows, 'description'].reset_index(drop=True).values
x = np.asarray(data_x.tolist())

# Multi-hot label matrix: 'Tag' holds comma-separated goal names, optionally
# wrapped in square brackets.
y = mlb.fit_transform(labelled.loc[usable_rows, 'Tag'].str.strip('[]').str.split(','))

# Sanity check: every document must have exactly one label row.
assert len(x) == len(y), "documents and labels are misaligned"

# scikit-learn's vectorizers expect stop words as a list (or 'english' / None);
# recent releases reject a set. Sorting de-duplicates nothing but keeps the
# order stable between runs.
stop_words = sorted(stopwords.words('english'))
labels = [str(i) for i in range(1, 18)]

In [None]:
# Sanity check: show the first description as it will be fed to the vectorizer.
data_x.tolist()[0]

'description background  in order to achieve the sustainable development goals sdgs various efforts are underway in countries around the world sdg management is an indispensable perspective for local and regional governments lrgs and companies in japan on the other hand a mechanism of collecting evaluating and visualizing information and data as a basis of sdg management has yet to be established at the local and regional levels with japanese local context thus it is an urgent task for lrgs to set up such a system of monitoring and evaluation in order to show the results of planning and implementation so far to citizens companies and other stakeholders in order to accelerate further efforts towards the sdgs in their respective cities and regions against this backdrop uncrd has launched a project to develop a monitoring and evaluation tool package in collaboration with local governments and private companies that are already committed to the sdgs in particular in chubu japan objectives 

In [None]:
# Load the pre-computed cross-validation folds. Every sub-folder of CROSS_FOLDS
# is expected to contain a train.npy / val.npy pair of row indices.
splits = []
# os.listdir() returns entries in arbitrary order; sort them so the folds are
# always visited in the same order from run to run.
for fold in sorted(os.listdir(CROSS_FOLDS)):
    fold_dir = os.path.join(CROSS_FOLDS, fold)
    # Skip anything that is not a fold folder: plain files and hidden entries
    # (for example notebook checkpoint folders) would make np.load fail.
    if fold.startswith(".") or not os.path.isdir(fold_dir):
        continue
    train_index = np.load(os.path.join(fold_dir, "train.npy"))
    val_index = np.load(os.path.join(fold_dir, "val.npy"))
    splits.append((train_index, val_index))

In [None]:
# Inspect the training indices of one fold. `train_index` is the loop variable
# left over from the fold-loading loop, i.e. the last fold that was loaded.
print(train_index)

[ 754  620 1108 ... 1931 1558 1924]


In [None]:
def grid_search(x, y, parameters, pipeline, splits, n_jobs=5, scoring='f1_micro'):
    '''Grid-search ``pipeline`` over ``parameters``, print a summary and return the search.

    Parameters
    ----------
    x : array-like
        Input samples handed to ``GridSearchCV.fit``.
    y : array-like
        Targets handed to ``GridSearchCV.fit``.
    parameters : dict
        Parameter grid, keyed by ``<step>__<param>`` names of ``pipeline``.
    pipeline : estimator
        The estimator (here a scikit-learn ``Pipeline``) to tune.
    splits : iterable of (train_indices, validation_indices)
        Cross-validation splits, passed through as ``cv``.
    n_jobs : int, default 5
        Number of parallel workers.
    scoring : str, default 'f1_micro'
        Metric used to rank the candidates.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can reach ``best_estimator_``,
        ``best_params_`` and ``cv_results_``. Earlier versions returned nothing,
        which left the fitted model unreachable. When the call is the last
        statement of a cell, assign the result (``gs = grid_search(...)``) to
        keep it and avoid echoing its repr.
    '''
    gs = GridSearchCV(pipeline,
                      parameters,
                      cv=splits,
                      n_jobs=n_jobs,
                      verbose=10,
                      return_train_score=True,
                      scoring=scoring)
    gs.fit(x, y)

    print()
    print("Best parameters set:")
    print(gs.best_estimator_.steps)
    print()
    results = gs.cv_results_
    print(f"Mean train scores: {results['mean_train_score']}")
    print(f"Mean validation scores: {results['mean_test_score']}")
    return gs

In [None]:
# Candidate 1: TF-IDF features fed to Multinomial Naive Bayes wrapped in
# OneVsRestClassifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])

# Grid: document-frequency cut-off, n-gram span and the NB smoothing strength.
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__alpha': (1e-2, 1e-3),
}

grid_search(x, y, parameters, pipeline, splits)

In [None]:
# Candidate 2: TF-IDF features fed to a linear SVM wrapped in OneVsRestClassifier.
# LinearSVC's solver uses random numbers, so the seed is pinned to make repeated
# runs of this search comparable.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(random_state=42))),
])
# Grid: document-frequency cut-off, n-gram span, regularisation strength and
# class weighting.
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}
grid_search(x, y, parameters, pipeline, splits)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:   14.0s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:   44.4s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:  1.7min
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:  2.3min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:  3.4min
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  4.0min
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:  5.5min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:  6.7min
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:  8.1min
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:  9.7min
[Parallel(n_jobs=5)]: Done 103 tasks      | elapsed: 11.5min
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed: 13.2min
[Parallel(n_jobs=5)]: Done 135 tasks      | elapsed: 15.0min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 16.8min
[Parallel(n_jobs=5)]: Done 171 tasks      | elapsed: 18.8min
[Parallel(


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=0.1, class_weight='balanced',
  

In [None]:
# Show only the first few documents; echoing the whole array floods the
# notebook output with the full text of every description.
data_x[:5]

array(['description background  in order to achieve the sustainable development goals sdgs various efforts are underway in countries around the world sdg management is an indispensable perspective for local and regional governments lrgs and companies in japan on the other hand a mechanism of collecting evaluating and visualizing information and data as a basis of sdg management has yet to be established at the local and regional levels with japanese local context thus it is an urgent task for lrgs to set up such a system of monitoring and evaluation in order to show the results of planning and implementation so far to citizens companies and other stakeholders in order to accelerate further efforts towards the sdgs in their respective cities and regions against this backdrop uncrd has launched a project to develop a monitoring and evaluation tool package in collaboration with local governments and private companies that are already committed to the sdgs in particular in chubu japan obje

In [None]:
# Candidate 3: TF-IDF features fed to logistic regression wrapped in
# OneVsRestClassifier. The 'sag' solver shuffles the data, so the seed is
# pinned to make repeated runs of this search comparable.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=42))),
])
# NOTE(review): every OneVsRest sub-problem is binary, so searching over
# `multi_class` doubles the grid for what is probably little gain, and the
# parameter is deprecated in recent scikit-learn releases - consider dropping it.
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
    "clf__estimator__multi_class": ['ovr', 'multinomial']
}

grid_search(x, y, parameters, pipeline, splits)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:   20.1s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:  2.2min
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:  5.8min
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:  7.8min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed: 11.5min
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed: 13.4min
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed: 19.2min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed: 24.1min
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed: 31.1min
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed: 38.2min
[Parallel(n_jobs=5)]: Done 103 tasks      | elapsed: 43.7min
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed: 49.2min
[Parallel(n_jobs=5)]: Done 135 tasks      | elapsed: 54.9min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 62.1min
[Parallel(n_jobs=5)]: Done 171 tasks      | elapsed: 71.3min
[Parallel(


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight='balanc

In [None]:
# Unseen sample paragraph used to try out the trained model. The apostrophe in
# "women’s" was previously stored as mis-decoded bytes (mojibake).
xp = 'Rangita de Silva, Associate Dean of the University of Pennsylvania, focused on SDG 5, and particularly target 5.1., which aims at ending all forms gender discrimination. She argued for dismantling de jure gender discrimination in the Law. She recalled that every single country has at least 4 laws that discriminate, directly or indirectly, against women and remembered that gender discrimination, has a very high cost, 28 trillion dollars per year, according to McKinsey. For example, she reminded that when Ethiopia eliminated the requisite of men to authorize their wives to look for employment, women’s participation in the job market increased substantially and GDP grew by 17%. This is particularly relevant as achieving SDGs has been estimated may need an investment of trillions of dollars per year.'

In [None]:
# Pull the fitted vectorizer and classifier steps out of the best pipeline of a grid search.
# NOTE(review): `gs` is not assigned at notebook scope in any cell visible here
# (inside grid_search() it is only a local variable), so on a fresh kernel this
# cell fails with NameError unless `gs` is defined elsewhere - confirm where it
# is supposed to come from.
tfidf = gs.best_estimator_.named_steps['tfidf']
clf = gs.best_estimator_.named_steps['clf']

In [None]:
# Run the sample through the same cleaning steps as the training text
# (lower-case -> tags -> punctuation -> letters only), vectorise it and predict.
xp = keepAlpha(cleanPunc(cleanHtml(xp.lower())))
xt = tfidf.transform([xp])
clf.predict(xt)

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

In [None]:
# Candidate 4: TF-IDF features fed straight to a decision tree (no
# OneVsRestClassifier wrapper here). Tree construction permutes features
# randomly, so the seed is pinned to make repeated runs of this search comparable.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', DecisionTreeClassifier(random_state=42)),
])
# Grid: only the vectorizer settings are tuned for this model.
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]
}
grid_search(x, y, parameters, pipeline, splits)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:   26.9s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:  2.6min
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:  7.6min
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:  9.8min
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed: 14.6min
[Parallel(n_jobs=5)]: Done  41 out of  45 | elapsed: 21.1min remaining:  2.1min
[Parallel(n_jobs=5)]: Done  45 out of  45 | elapsed: 21.3min finished



Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.75, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)), ('clf', DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini'