### 3. Extract punctuation marks for the Malay bilingual dataset

In [106]:
# libraries
import nltk
import pandas as pd
import numpy as np
import re
import csv

In [107]:
# limit how wide a column is rendered, so long token lists are cut off with "..."
pd.options.display.max_colwidth = 80

In [108]:
# read the raw comment/label CSV (first row is the header) and show its dimensions
raw_csv = '../datasets/mly1970_sarc(raw).csv'
df = pd.read_csv(raw_csv, header=0, encoding="utf-8")
df.shape

(1970, 2)

In [109]:
# give the two loaded columns fixed names: the comment text and its label
column_labels = ['comment', 'sarc_majority']
df.columns = column_labels

In [110]:
# tweet-aware tokenizer shared by the cells below
from nltk.tokenize import TweetTokenizer

# Per the NLTK documentation: strip_handles removes @username handles and
# reduce_len shortens runs of three or more repeated characters to three.
tokenizer_options = {'strip_handles': True, 'reduce_len': True}
tknzr = TweetTokenizer(**tokenizer_options)

In [111]:
# apply tokenize
def _tokenize_comment(text):
    """Tokenize one raw comment with the shared tweet tokenizer.

    Strings are passed to ``tknzr.tokenize``. Non-string cells never reach the
    tokenizer: a list is assumed to be a comment that was already tokenized
    (i.e. this cell was re-run) and is returned unchanged; any other value,
    such as the float NaN pandas produces for an empty CSV field, yields an
    empty token list.
    """
    if isinstance(text, str):
        return tknzr.tokenize(text)
    if isinstance(text, list):
        return text
    return []


# Overwrites the raw text with its token list; the guard above keeps a
# second run of this cell from failing on the already-tokenized column.
df['comment'] = df['comment'].apply(_tokenize_comment)
df.head(20)

Unnamed: 0,comment,sarc_majority
0,"[Rakyat, akan, tanggung, lebih, banyak, la, nampaknya]",sarc
1,"[Dah, ko, metedarah, duit, Rakyat, ,, nak, buat, belanja, besar, la, Lagi, ,...",non-sarc
2,"[Tutup, tv, n, radio, ..., Jgn, dngar, janji, bajet, 2016, Dr, tangan, ghaib...",sarc
3,"[Ya, lah, Ko, dah, dera, Rakyat, .., Bajet, besar, untuk, Rakyat, .., Bosan,...",non-sarc
4,"[Oiii, ..., kerjaan, bn, .., ko, org, sdr, x, ., ?, Dh, lh, kerajaan, xde, d...",non-sarc
5,"[Bertambah, lg, hutang, Negara, ..., diambil, oleh, tangan, ghaib, ...]",sarc
6,"[Sejak, negara, merdeka, Pm, inilah, yg, paling, teruk, kalau, nak, dibandin...",non-sarc
7,"[Pulang, kt, sape, ha, ?, ?, yg, meniaga, kcil, 2, bnyk, klua, duit, utk, gs...",sarc
8,"[Pulang, la, sangattt]",sarc
9,"[Kalau, nk, pulang, .., baik, x, yah, kutip, jer, dari, dulu, ..., Masa, kut...",sarc


In [112]:
# selected re punctuation only
# Matches a token that starts with one of the tracked marks: ” ? ! '
# NOTE(review): ” is kept here but has no group label in the replacement
# table defined further down, so it passes through unlabelled - confirm intent.
_PUNCT_RE = re.compile(r'([”?!\'])')


def _keep_punctuation(tokens):
    """Return only the tracked punctuation tokens from one token list.

    Non-string entries (for example the NaN placeholder left by a previous
    run of this cell) are skipped instead of being handed to the regex,
    which would raise TypeError. When nothing is kept, the result is the
    placeholder list ``[np.nan]``.
    """
    kept = [tok for tok in tokens if isinstance(tok, str) and _PUNCT_RE.match(tok)]
    return kept or [np.nan]


df.comment = df.comment.apply(_keep_punctuation)

In [113]:
# preview the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,comment,sarc_majority
0,[nan],sarc
1,[nan],non-sarc
2,[nan],sarc
3,[nan],non-sarc
4,[?],non-sarc
5,[nan],sarc
6,[nan],non-sarc
7,"[?, ?]",sarc
8,[nan],sarc
9,[nan],sarc


In [114]:
# lookup table pairing each group label with the punctuation mark it stands for
punct_groups = [
    ('punct1', '?'),
    ('punct2', '!'),
    ('punct3', "'"),
]
replace_df = pd.DataFrame(punct_groups, columns=['replacer', 'punct'])
print(replace_df)

  replacer punct
0   punct1     ?
1   punct2     !
2   punct3     '


In [115]:
# replace index re punctuation with group
# Map each punctuation mark, prefixed with a backslash so the regex engine
# treats it literally, to its group label. The escaped form is NOT written
# back into replace_df: re-running this cell therefore cannot stack a second
# backslash onto the patterns (a pattern like '\\\\?' would match the empty
# string and corrupt every token during replacement).
d = {
    '\\' + mark: label
    for mark, label in zip(replace_df['punct'], replace_df['replacer'])
}
print(d)

{'\\?': 'punct1', '\\!': 'punct2', "\\'": 'punct3'}


In [116]:
# apply punctuation group to label
def _label_tokens(tokens):
    """Replace punctuation marks in one token list with their group labels.

    Every token is converted with ``str()`` first and then each regex in
    ``d`` is substituted in turn, in the dict's order. An empty result falls
    back to the placeholder list ``[np.nan]``.

    NOTE(review): because of the ``str()`` conversion, a float NaN
    placeholder in the input comes out as the string 'nan'. This appears to
    match what the previous ``astype(str)`` did and is kept so the saved CSV
    does not change - confirm whether a real NaN is wanted instead.
    """
    labelled = []
    for token in tokens:
        text = str(token)
        for pattern, label in d.items():
            text = re.sub(pattern, label, text)
        labelled.append(text)
    return labelled or [np.nan]


df.comment = df.comment.apply(_label_tokens)
# preview the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,comment,sarc_majority
0,[nan],sarc
1,[nan],non-sarc
2,[nan],sarc
3,[nan],non-sarc
4,[punct1],non-sarc
5,[nan],sarc
6,[nan],non-sarc
7,"[punct1, punct1]",sarc
8,[nan],sarc
9,[nan],sarc


In [117]:
# write the punctuation-labelled frame to CSV, leaving out the row index
punct_csv = '../datasets/mly1970_punc.csv'
df.to_csv(punct_csv, index=False)