Import Libraries

In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
import string
string.punctuation

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

Preprocessing

In [2]:
# read data

data = pd.read_excel('isco.xlsx')
data = data[data["ISCO 08 Code"].apply(lambda x: len(str(x)) > 3)]
data = data[["Title EN", "Tasks include"]]
data.head()

Unnamed: 0,Title EN,Tasks include
3,Legislators,Tasks include -\n(a) presiding over or partic...
4,Senior Government Officials,"Tasks include -\n(a) advising national, state..."
5,Traditional Chiefs and Heads of Villages,Tasks include -\n(a) allocating the use of co...
6,Senior Officials of Special-interest Organizat...,Tasks include -\n(a) determining and formulat...
8,Managing Directors and Chief Executives,"Tasks include -\n(a) planning, directing and ..."


In [3]:
# remove punctuation

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

data['clean_msg'] = data['Tasks include'].apply(lambda x:remove_punctuation(str(x)))
data['clean_msg'].head()

3    Tasks include \na  presiding over or participa...
4    Tasks include \na  advising national state reg...
5    Tasks include \na  allocating the use of commu...
6    Tasks include \na  determining and formulating...
8    Tasks include \na  planning directing and coor...
Name: clean_msg, dtype: object

In [4]:
# lowering text

data['msg_lower']= data['clean_msg'].apply(lambda x: x.lower())
data['msg_lower'].head()

3    tasks include \na  presiding over or participa...
4    tasks include \na  advising national state reg...
5    tasks include \na  allocating the use of commu...
6    tasks include \na  determining and formulating...
8    tasks include \na  planning directing and coor...
Name: msg_lower, dtype: object

In [5]:
# tokenization

data['msg_tokenized'] = data['msg_lower'].apply(nltk.word_tokenize)
data['msg_tokenized'].head

<bound method NDFrame.head of 3      [tasks, include, a, presiding, over, or, parti...
4      [tasks, include, a, advising, national, state,...
5      [tasks, include, a, allocating, the, use, of, ...
6      [tasks, include, a, determining, and, formulat...
8      [tasks, include, a, planning, directing, and, ...
                             ...                        
604    [tasks, include, a, delivering, messages, pack...
605    [tasks, include, a, repairing, broken, windows...
606    [tasks, include, a, filling, storage, areas, o...
607    [tasks, include, a, cutting, and, collecting, ...
608    [in, such, cases, tasks, would, include, a, se...
Name: msg_tokenized, Length: 433, dtype: object>

In [6]:
# remove stopwords

stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

data['no_stopwords']= data['msg_tokenized'].apply(lambda x:remove_stopwords(x))
data['no_stopwords'].head

<bound method NDFrame.head of 3      [tasks, include, presiding, participating, pro...
4      [tasks, include, advising, national, state, re...
5      [tasks, include, allocating, use, communal, la...
6      [tasks, include, determining, formulating, pol...
8      [tasks, include, planning, directing, coordina...
                             ...                        
604    [tasks, include, delivering, messages, package...
605    [tasks, include, repairing, broken, windows, s...
606    [tasks, include, filling, storage, areas, vend...
607    [tasks, include, cutting, collecting, wood, fo...
608    [cases, tasks, would, include, selling, admiss...
Name: no_stopwords, Length: 433, dtype: object>

POS Tag

In [7]:
data['tagged'] = data['no_stopwords'].apply(nltk.pos_tag)
data['tagged'].head()

3    [(tasks, NNS), (include, VBP), (presiding, VBG...
4    [(tasks, NNS), (include, VBP), (advising, VBG)...
5    [(tasks, NNS), (include, VBP), (allocating, VB...
6    [(tasks, NNS), (include, VBP), (determining, V...
8    [(tasks, NNS), (include, VBP), (planning, VBG)...
Name: tagged, dtype: object

POS Chunk

In [8]:
# lemmatization

lemmatizer = WordNetLemmatizer()

def lemmatize_word(word, tag):
    if tag.startswith('V'):
        return lemmatizer.lemmatize(word, 'v')  # Verb
    elif tag.startswith('N'):
        return lemmatizer.lemmatize(word, 'n')  # Noun
    else:
        return word

In [9]:
# pos chunk

def filter_verb_noun_pairs(tagged_sentence):
    verb_noun_pairs = []
    for i in range(len(tagged_sentence) - 1):
        word, tag = tagged_sentence[i]
        next_word, next_tag = tagged_sentence[i + 1]
        if tag.startswith('VB') and next_tag.startswith('NN'):
            verb = lemmatize_word(word, tag)
            noun = lemmatize_word(next_word, next_tag)
            verb_noun_pairs.append((verb, noun))
    return verb_noun_pairs

data['verb_noun_pairs'] = data['tagged'].apply(filter_verb_noun_pairs)
data['verb_noun_pairs'].head()

3    [(participate, proceeding), (administrative, c...
4    [(advise, preparation), (include, amendment), ...
5    [(allocate, use), (settle, dispute), (discipli...
6    [(formulate, policy), (function, organization)...
8    [(function, enterprise), (review, operation), ...
Name: verb_noun_pairs, dtype: object

In [12]:
# hasile

isco_task = data[["Title EN", "Tasks include", "verb_noun_pairs"]]
isco_task.to_excel('isco_task.xlsx', index=False)
isco_task

Unnamed: 0,Title EN,Tasks include,verb_noun_pairs
3,Legislators,Tasks include -\n(a) presiding over or partic...,"[(participate, proceeding), (administrative, c..."
4,Senior Government Officials,"Tasks include -\n(a) advising national, state...","[(advise, preparation), (include, amendment), ..."
5,Traditional Chiefs and Heads of Villages,Tasks include -\n(a) allocating the use of co...,"[(allocate, use), (settle, dispute), (discipli..."
6,Senior Officials of Special-interest Organizat...,Tasks include -\n(a) determining and formulat...,"[(formulate, policy), (function, organization)..."
8,Managing Directors and Chief Executives,"Tasks include -\n(a) planning, directing and ...","[(function, enterprise), (review, operation), ..."
...,...,...,...
604,"Messengers, Package Deliverers and Luggage Por...","Tasks include -\n(a) delivering messages, pac...","[(deliver, message), (shop, household), (mark,..."
605,Odd Job Persons,Tasks include -\n(a) repairing broken windows...,"[(light, bulb), (wall, ceiling), (adjust, door..."
606,Meter Readers and Vending-machine Collectors,Tasks include -\n(a) filling storage areas of...,"[(fill, storage), (vend, machine), (collect, m..."
607,Water and Firewood Collectors,Tasks include -\n(a) cutting and collecting w...,"[(collect, wood), (visit, forest), (dry, wood)..."
