### 6. Extract syntactic features for the English dataset

In [5]:
# one-time setup: uncomment to download the WordNet corpora (needs `import nltk` first)
#nltk.download('wordnet')
#nltk.download('omw')

In [7]:
# libraries
import pandas as pd
import nltk
from nltk import pos_tag, word_tokenize
import numpy as np
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.tag import NgramTagger, SequentialBackoffTagger
from nltk.corpus import wordnet, names
from nltk.probability import FreqDist

In [8]:
# read the source CSV (first row is the header) and show its dimensions
df = pd.read_csv(
    '../datasets/english1970_data.csv',
    encoding="utf-8",
    header=0,
)
df.shape

(1970, 2)

In [9]:
# give the two columns explicit names
df.columns = ['comment', 'sarc_majority']

In [10]:
# split every comment string into a list of Treebank-style tokens
from nltk.tokenize import TreebankWordTokenizer
tokens = TreebankWordTokenizer()
df['comment'] = list(map(tokens.tokenize, df['comment']))

In [11]:
# inspect the tokenized comments
df['comment']

0                 [more, people, will, bear, it, appears]
1       [have, you, greed, of, money, people, want, to...
2       [radio, television, lid, do, not, hear, the, p...
3       [yes, you, 're, whipping, people, is, a, great...
4       [ooi, government, national, front, you, people...
5       [national, debt, increased, again, taken, by, ...
6       [since, independence, am, the, worst, is, if, ...
7       [home, near, the, small, meniaga, who, ha, a, ...
8                                        [home, is, very]
9       [if, you, want, a, good, return, is, not, grea...
10                                         [you, believe]
11      [I, greet, the, people, of, the, same, or, kah...
12                               [return, to, pocket, RM]
13         [haha, to, take, the, rights, of, the, people]
14                          [congratulations, thank, God]
15      [Najib, barua, you, want, people, calling, you...
16                                                [winch]
17      [I, be

In [12]:
# wordnet tagger
class WordNetTagger(SequentialBackoffTagger):
    '''
    Part-of-speech tagger that looks each word up in WordNet.

    Every synset WordNet returns for a word casts one vote for the tag its
    part of speech maps to (see ``wordnet_tag_map``); the tag with the most
    votes is chosen.  Words for which WordNet returns no synsets are tagged
    ``None``.

    >>> wt = WordNetTagger()
    >>> wt.tag(['food', 'is', 'great'])
    [('food', 'NN'), ('is', 'VB'), ('great', 'JJ')]
    '''
    def __init__(self, *args, **kwargs):
        '''Pass all arguments through to SequentialBackoffTagger and set up
        the WordNet-POS -> tag lookup table.'''
        SequentialBackoffTagger.__init__(self, *args, **kwargs)

        # WordNet part-of-speech letter -> tag emitted by this tagger.
        # Both adjective kinds ('a' and 's') collapse to the same 'JJ' tag.
        self.wordnet_tag_map = {
            'n': 'NN', #Noun
            's': 'JJ', #Sat_Adjective
            'a': 'JJ', #Adjective
            'r': 'RB', #Adverb
            'v': 'VB'  #Verb
        }

    def choose_tag(self, tokens, index, history):
        '''
        Return the tag for ``tokens[index]``, or ``None`` if WordNet has no
        synsets for it.

        ``history`` is accepted for interface compatibility and is not used.
        '''
        word = tokens[index]
        votes = FreqDist()

        for synset in wordnet.synsets(word):
            # Count votes per *output tag*, not per raw WordNet POS, so that
            # adjective ('a') and satellite-adjective ('s') senses are pooled
            # into one 'JJ' total instead of competing with each other.
            tag = self.wordnet_tag_map.get(synset.pos())
            if tag is not None:
                votes[tag] += 1

        if not votes:
            return None
        return votes.max()

In [13]:
# create the WordNet-based tagger defined above (no constructor arguments are passed)
wn_tagger = WordNetTagger()

In [14]:
# run the WordNet tagger over each comment's token list,
# producing a list of (token, tag) pairs per comment
df['comment'] = list(map(wn_tagger.tag, df['comment']))

In [15]:
# inspect the (token, tag) pairs
df['comment']

0       [(more, JJ), (people, NN), (will, NN), (bear, ...
1       [(have, VB), (you, None), (greed, NN), (of, No...
2       [(radio, NN), (television, NN), (lid, NN), (do...
3       [(yes, NN), (you, None), ('re, None), (whippin...
4       [(ooi, None), (government, NN), (national, JJ)...
5       [(national, JJ), (debt, NN), (increased, VB), ...
6       [(since, None), (independence, NN), (am, VB), ...
7       [(home, NN), (near, JJ), (the, None), (small, ...
8                      [(home, NN), (is, VB), (very, JJ)]
9       [(if, None), (you, None), (want, VB), (a, NN),...
10                           [(you, None), (believe, VB)]
11      [(I, NN), (greet, VB), (the, None), (people, N...
12      [(return, VB), (to, None), (pocket, NN), (RM, ...
13      [(haha, None), (to, None), (take, VB), (the, N...
14        [(congratulations, NN), (thank, VB), (God, NN)]
15      [(Najib, None), (barua, None), (you, None), (w...
16                                          [(winch, NN)]
17      [(I, N

In [16]:
# keep only tokens tagged NN / JJ / RB / VB (drops tokens tagged None);
# each surviving token is wrapped in a 1-tuple
df.comment = df.comment.apply(
    lambda pairs: [(word,) for word, tag in pairs if tag in ('NN', 'JJ', 'RB', 'VB')]
)
# comments left with no tokens get a NaN placeholder
# NOTE(review): how pandas broadcasts the nested list here is not shown by this notebook - verify
df.loc[df.comment.apply(len) == 0, 'comment'] = [[np.nan]]
print(df)

                                                comment sarc_majority
0     [(more,), (people,), (will,), (bear,), (it,), ...          sarc
1     [(have,), (greed,), (money,), (people,), (want...      non-sarc
2     [(radio,), (television,), (lid,), (do,), (not,...          sarc
3     [(yes,), (whipping,), (people,), (is,), (a,), ...      non-sarc
4     [(government,), (national,), (front,), (people...      non-sarc
5     [(national,), (debt,), (increased,), (again,),...          sarc
6     [(independence,), (am,), (worst,), (is,), (wan...      non-sarc
7     [(home,), (near,), (small,), (who,), (ha,), (a...          sarc
8                             [(home,), (is,), (very,)]          sarc
9     [(want,), (a,), (good,), (return,), (is,), (no...          sarc
10                                         [(believe,)]          sarc
11    [(I,), (greet,), (people,), (same,), (or,), (o...          sarc
12                               [(return,), (pocket,)]          sarc
13                  

In [17]:
# Strip everything except ASCII letters and rebuild plain word lists.
# astype(str) turns each row into its repr (e.g. "[('more',), ('people',)]");
# every run of non-letter characters in that repr is replaced by a space and
# the result is split on whitespace.
# regex=True is required: without it newer pandas treats the pattern as a
# literal string and nothing is removed.
# NOTE(review): the NaN placeholder for empty comments is stringified too and
# ends up as the token 'nan' - confirm that is intended.
df.comment = (
    df.comment.astype(str)
    .str.replace(r"[^a-zA-Z ]+", " ", regex=True)
    .str.split(expand=False)
)

In [18]:
# show the cleaned frame
print(df)

                                                comment sarc_majority
0               [more, people, will, bear, it, appears]          sarc
1     [have, greed, money, people, want, do, big, sp...      non-sarc
2     [radio, television, lid, do, not, hear, promis...          sarc
3     [yes, whipping, people, is, a, great, budget, ...      non-sarc
4     [government, national, front, people, aware, n...      non-sarc
5     [national, debt, increased, again, taken, by, ...          sarc
6     [independence, am, worst, is, want, compare, p...      non-sarc
7     [home, near, small, who, ha, a, lot, money, ou...          sarc
8                                      [home, is, very]          sarc
9     [want, a, good, return, is, not, great, quotes...          sarc
10                                            [believe]          sarc
11           [I, greet, people, same, or, own, account]          sarc
12                                     [return, pocket]          sarc
13                  

In [19]:
# save the processed frame to CSV, without the row index
df.to_csv(
    '../datasets/english1970_syntc.csv',
    index=False,
)