# Feature Extraction

## Import Packages and Dataset

In [39]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_selection import mutual_info_classif

from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
data_train_full = pd.read_csv("../Data/data_train_preprocessed.csv",sep=";")
data_test_full = pd.read_csv("../Data/data_test_preprocessed.csv",sep=";")

data_train_nostopword = pd.read_csv("../Data/data_train_nostopword_preprocessed.csv",sep=";")
data_test_nostopword = pd.read_csv("../Data/data_test_nostopword_preprocessed.csv",sep=";")

data_train_nostemstop = pd.read_csv("../Data/data_train_nostemstop_preprocessed.csv",sep=";")
data_test_nostemstop = pd.read_csv("../Data/data_test_nostemstop_preprocessed.csv",sep=";")

In [3]:
data_train_trans = pd.read_csv("../Data/data_train_trans_preprocessed.csv",sep=";")
data_test_trans = pd.read_csv("../Data/data_test_trans_preprocessed.csv",sep=";")

## Feature Selection (Mutual Information)

In [4]:
X_train_full = data_train_full['Tweet_Parsed']
X_test_full = data_test_full['Tweet_Parsed']
y_train_full = data_train_full.drop(['Tweet','Tweet_Parsed'],axis=1)
y_test_full = data_test_full.drop(['Tweet','Tweet_Parsed'],axis=1)

X_train_nostopword = data_train_nostopword['Tweet_Parsed']
X_test_nostopword = data_test_nostopword['Tweet_Parsed']
y_train_nostopword = data_train_nostopword.drop(['Tweet','Tweet_Parsed'],axis=1)
y_test_nostopword = data_test_nostopword.drop(['Tweet','Tweet_Parsed'],axis=1)

X_train_nostemstop = data_train_nostemstop['Tweet_Parsed']
X_test_nostemstop = data_test_nostemstop['Tweet_Parsed']
y_train_nostemstop = data_train_nostemstop.drop(['Tweet','Tweet_Parsed'],axis=1)
y_test_nostemstop = data_test_nostemstop.drop(['Tweet','Tweet_Parsed'],axis=1)

In [40]:
def feature_selection(X_train_, y_train_, treshold = 0):
    tf = CountVectorizer()
    features_train = tf.fit_transform(X_train_).toarray()
    features_name = tf.get_feature_names()
    target = y_train_.columns
    features = []
    for label in tqdm(target):
        print(label)
        MI = mutual_info_classif(features_train, y_train_[label])
        features_mi = [(i,j) for i,j in zip(features_name,MI)]
        features_taken = [i[0] for i in features_mi if i[1]>treshold]
        features.append(features_taken)
    features = np.concatenate(features)
    features = np.unique(features)
    return features

In [41]:
features_train_full = feature_selection(X_train_full,y_train_full,0.0075)

  0%|                                                                                           | 0/12 [00:00<?, ?it/s]

HS


  8%|██████▋                                                                         | 1/12 [15:55<2:55:15, 955.95s/it]

Abusive


 17%|█████████████▎                                                                  | 2/12 [32:07<2:40:05, 960.56s/it]

HS_Individual


 25%|████████████████████                                                            | 3/12 [48:08<2:24:05, 960.62s/it]

HS_Group


 33%|██████████████████████████                                                    | 4/12 [1:04:24<2:08:43, 965.41s/it]

HS_Religion


 42%|████████████████████████████████▌                                             | 5/12 [1:20:32<1:52:43, 966.23s/it]

HS_Race


 50%|███████████████████████████████████████                                       | 6/12 [1:36:36<1:36:33, 965.50s/it]

HS_Physical


 58%|█████████████████████████████████████████████▌                                | 7/12 [1:52:39<1:20:24, 964.84s/it]

HS_Gender


 67%|████████████████████████████████████████████████████                          | 8/12 [2:08:43<1:04:17, 964.42s/it]

HS_Other


 75%|████████████████████████████████████████████████████████████                    | 9/12 [2:24:32<48:00, 960.01s/it]

HS_Weak


 83%|█████████████████████████████████████████████████████████████████▊             | 10/12 [2:40:23<31:54, 957.05s/it]

HS_Moderate


 92%|████████████████████████████████████████████████████████████████████████▍      | 11/12 [2:56:23<15:57, 957.89s/it]

HS_Strong


100%|███████████████████████████████████████████████████████████████████████████████| 12/12 [3:12:26<00:00, 959.48s/it]


In [43]:
len(features_train_full.tolist())

2152

## TF-IDF

In [52]:
def tfidf(X_train_,X_test_,y_train_,y_test_,fitur,target):
    ngram_range = (1,1)
    min_df = 1
    max_df = 1.
    tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=None,
                        norm='l2')
    features_train = tfidf.fit_transform(X_train_).toarray()
    labels_train = y_train_.values
    features_test = tfidf.transform(X_test_).toarray()
    labels_test = y_test_.values
    features_all = tfidf.get_feature_names()
    data_train_tfidf = pd.DataFrame(data=features_train,columns=features_all)
    data_train_tfidf = data_train_tfidf[fitur]
    data_train_tfidf = data_train_tfidf.join(pd.DataFrame(data=labels_train,columns=target))
    data_test_tfidf = pd.DataFrame(data=features_test,columns=features_all)
    data_test_tfidf = data_test_tfidf[fitur]
    data_test_tfidf = data_test_tfidf.join(pd.DataFrame(data=labels_test,columns=target))
    return data_train_tfidf,data_test_tfidf

In [53]:
target = data_train_full.drop(['Tweet','Tweet_Parsed'],axis = 1).columns

### Data Without Translation

In [55]:
data_train_full_tfidf,data_test_full_tfidf = tfidf(X_train_full,X_test_full,y_train_full,y_test_full,features_train_full,target)

In [59]:
data_train_full_tfidf.shape

(11193, 2164)

In [60]:
data_test_full_tfidf.shape

(1976, 2164)

### Data With Translation

In [145]:
X_train_trans = data_train_trans['Tweet_Parsed']
X_test_trans = data_test_trans['Tweet_Parsed']
y_train_trans = data_train_trans.drop(['Tweet','Tweet_Parsed'],axis=1)
y_test_trans = data_test_trans.drop(['Tweet','Tweet_Parsed'],axis=1)

In [146]:
# Parameter election
ngram_range = (1,1)
min_df = 1
max_df = 1.
max_features = None

In [147]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2')
#                         sublinear_tf=True)

In [148]:
features_train_trans = tfidf.fit_transform(X_train_trans).toarray()
labels_train_trans = y_train_trans
print(features_train_trans.shape)

(11193, 10666)


In [149]:
features_test_trans = tfidf.transform(X_test_trans).toarray()
labels_test_trans = y_test_trans
print(features_test_trans.shape)

(1976, 10666)


In [150]:
fitur_trans = tfidf.get_feature_names()
target_trans = data_train_trans.drop(['Tweet','Tweet_Parsed'],axis = 1).columns

In [151]:
data_train_trans_tfidf = pd.DataFrame(data=features_train_trans,columns=fitur_trans)
data_train_trans_tfidf = data_train_trans_tfidf.join(pd.DataFrame(data=y_train_trans,columns=target_trans))
data_train_trans_tfidf.head()

Unnamed: 0,10,aaa,aaaa,aaaaaaa,aaaaaaaaaah,aaaah,aaaakuuu,aaaamiiiiiiinnnn,aaah,aaamiinn,...,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,1,0,0


In [152]:
data_test_trans_tfidf = pd.DataFrame(data=features_test_trans,columns=fitur_trans)
data_test_trans_tfidf = data_test_trans_tfidf.join(pd.DataFrame(data=y_test_trans,columns=target_trans))
data_test_trans_tfidf.head()

Unnamed: 0,10,aaa,aaaa,aaaaaaa,aaaaaaaaaah,aaaah,aaaakuuu,aaaamiiiiiiinnnn,aaah,aaamiinn,...,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Out of Vocabulary

In [153]:
def generate_ngrams(s):
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.str.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = np.concatenate(tokens)
    return ngrams

### Data With Translation

In [154]:
tokensTest = generate_ngrams(data_test['Tweet_Parsed']).tolist()
# tokensTrain = np.unique(tokensTrain).tolist()
tokensTest

['ngewe',
 'ngewe',
 'hadiah',
 'indonesia',
 'prediksi',
 'negara',
 'ekonomi',
 'dunia',
 'era',
 'milik',
 'generasi',
 'muda',
 'kesiap',
 'amp',
 'bekal',
 'kompetensi',
 'kuat',
 'generasi',
 'muda',
 'pasuk',
 'era',
 'ekonomi',
 'digital',
 'revolusi',
 'industri',
 'ge',
 'berita',
 'ppp',
 'tolak',
 'calon',
 'presiden',
 'jokowi',
 'dukung',
 'partai',
 'demokrasi',
 'indonesia',
 'juang',
 'ppp',
 'tolak',
 'jokowi',
 'presiden',
 'periode',
 'partai',
 'bangun',
 'ppp',
 'kubu',
 'main',
 'disc',
 'jockey',
 'fariz',
 'tolak',
 'jokowi',
 'pimpin',
 'indonesia',
 'periode',
 'residen',
 'country',
 'director',
 'tumbuh',
 'ekonomi',
 'lestari',
 'lingkung',
 'tumbuh',
 'bantu',
 'perintah',
 'indonesia',
 'lestari',
 'sumber',
 'daya',
 'alam',
 'emisi',
 'rumah',
 'kaca',
 'umat',
 'hindu',
 'selamat',
 'ulang',
 'presiden',
 'soeharto',
 '',
 '',
 'uh',
 'ena',
 'susilo',
 'bambang',
 'yudhoyono',
 'korupsi',
 'kelulus',
 'coret',
 'coret',
 'baju',
 'cupu',
 'bijak',
 '

In [155]:
outOfVocab = []
for i in tokensTest:
    if i not in fitur:
        outOfVocab.append(i)
outOfVocab

['director',
 'emisi',
 '',
 '',
 'kelulus',
 'pilihjokowi',
 'bmw',
 'arief',
 'korg',
 'saviour',
 'korg',
 'chapter',
 'sunatullah',
 'khalifatullah',
 'wooseok',
 'fatimah',
 'laman',
 'busung',
 'ode',
 'hanafi',
 'hifzil',
 'kalender',
 'athletics',
 'federations',
 'pejam',
 'eeeeeh',
 'arra',
 'florida',
 'denuklirisasi',
 'bapuk',
 'tawau',
 'panggang',
 'cashflow',
 'karisma',
 'biarpun',
 'kamprettt',
 'chibi',
 'kasak',
 'kusuk',
 'achhh',
 'ik',
 'peno',
 'inrl',
 'fana',
 'benalu',
 'riba',
 'kubus',
 'makamya',
 'sissy',
 'lepeh',
 'limbat',
 'komoditi',
 'yukkk',
 'agro',
 'eduwisata',
 'piknik',
 'animo',
 'persetan',
 'atos',
 'payudara',
 'slawi',
 'dian',
 'nyong',
 'digawe',
 'x',
 'alaaaahhhhh',
 'kelok',
 'junedi',
 'choir',
 'buntut',
 'unicorn',
 'ironis',
 'caca',
 'nuy',
 'acap',
 'acuh',
 'feronikel',
 'jengkel',
 'taha',
 'inter',
 'lazio',
 'kylaaaaaaya',
 'muuuaaak',
 'cuaca',
 'p',
 'fabel',
 'bidadari',
 'sad',
 'siiichhh',
 'nyerem',
 'cy',
 'ngiahahah

In [157]:
print("# out of vocabulary : ", len(outOfVocab))
print("# out of vocabulary (unique words) : ", len(np.unique(outOfVocab)))

# out of vocabulary :  1106
# out of vocabulary (unique words) :  954


### Data With Translation

In [158]:
tokensTestTrans = generate_ngrams(data_test_trans['Tweet_Parsed']).tolist()
# tokensTrain = np.unique(tokensTrain).tolist()
tokensTestTrans

['ngewe',
 'ngewe',
 'hadiah',
 'indonesia',
 'prediksi',
 'gatra',
 'ekonomi',
 'dunia',
 'era',
 'milik',
 'generasi',
 'muda',
 'review',
 'kesiap',
 'amp',
 'bekal',
 'kompetensi',
 'kuat',
 'generasi',
 'muda',
 'pasuk',
 'era',
 'ekonomi',
 'revolusi',
 'digital',
 'industri',
 'ge',
 'nyanyi',
 'berita',
 'ppp',
 'tolak',
 'calon',
 'presiden',
 'jokowi',
 'dukung',
 'partai',
 'demokrasi',
 'indonesia',
 'juang',
 'ppp',
 'tolak',
 'jokowi',
 'presiden',
 'doa',
 'periode',
 'partai',
 'bangun',
 'ppp',
 'kubu',
 'main',
 'disc',
 'jockey',
 'fariz',
 'tolak',
 'jokowi',
 'pimpin',
 'indonesia',
 'review',
 'doa',
 'periode',
 'direktur',
 'negara',
 'residen',
 'tumbuh',
 'ekonomi',
 'lestari',
 'lingkung',
 'tumbuh',
 'nyanyi',
 'bantu',
 'perintah',
 'indonesia',
 'lestari',
 'sumber',
 'daya',
 'alam',
 'emisi',
 'saham',
 'rumah',
 'kaca',
 'umat',
 'hindu',
 'selamat',
 'ulang',
 'presiden',
 'soeharto',
 '',
 '',
 'eh',
 'ena',
 'susilo',
 'bambang',
 'yudhoyono',
 'koru

In [159]:
outOfVocabTrans = []
for i in tokensTestTrans:
    if i not in fitur_trans:
        outOfVocabTrans.append(i)
outOfVocabTrans

['emisi',
 '',
 '',
 'kelulus',
 'pilihjokowi',
 'bmw',
 'arief',
 'korg',
 'korg',
 'sunatullah',
 'khalifatullah',
 'wooseok',
 'fatimah',
 'laman',
 'busung',
 'ode',
 'hanafi',
 'hifzil',
 'kalender',
 'atletik',
 'pejam',
 'eeeeeh',
 'arra',
 'florida',
 'denuklirisasi',
 'bapuk',
 'tawau',
 'panggang',
 'kas',
 'karisma',
 'biarpun',
 'kamprettt',
 'chibi',
 'kasak',
 'kusuk',
 'achhh',
 'ik',
 'scroll',
 'peno',
 'lkai',
 'fana',
 'benalu',
 'riba',
 'kubus',
 'makamya',
 'lepeh',
 'limbat',
 'turun-langsung',
 'turun-',
 'komoditi',
 'yukkk',
 'eduwisata',
 'piknik',
 'pukuljabon',
 'animo',
 'persetan',
 'atos',
 'slawi',
 'dian',
 'nyong',
 'digawe',
 'x',
 'alaaaahhhhh',
 'kelok',
 'junedi',
 'buntut',
 'unicorn',
 'ironis',
 'caca',
 'nuy',
 'acap',
 'acuh',
 'feronikel',
 'jengkel',
 'taha',
 'lazio',
 'kylaaaaaaya',
 'muuuaaak',
 'cuaca',
 'p',
 'fabel',
 'bidadari',
 'siiichhh',
 'nyerem',
 'cy',
 'ngiahahaha',
 'bumbung',
 'kuwum',
 'b',
 'c',
 'drasex',
 'luvian',
 'pi

In [160]:
print("# out of vocabulary : ", len(outOfVocabTrans))
print("# out of vocabulary (unique words) : ", len(np.unique(outOfVocabTrans)))

# out of vocabulary :  1018
# out of vocabulary (unique words) :  874


## Save File

In [61]:
data_train_full_tfidf.to_csv('../Data/data_train_full_tfidf.csv', index = False)
data_test_full_tfidf.to_csv('../Data/data_test_full_tfidf.csv', index = False)

In [162]:
data_train_trans_tfidf.to_csv('../Data/data_train_trans_tfidffull.csv', index = False)
data_test_trans_tfidf.to_csv('../Data/data_test_trans_tfidffull.csv', index = False)