# Import Dependencies and Data

In [3]:
import pandas as pd
import numpy as np
import string
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from IPython.display import clear_output
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tag import CRFTagger
from sklearn_crfsuite import CRF

## Load Word Embedding (Word2Vec)

In [4]:
resource_path = "/content/resources/"
model_path = "idwiki_word2vec_300.model"

modelword2vec = Word2Vec.load(resource_path + model_path)
w2v = dict(zip(modelword2vec.wv.index2word, modelword2vec.wv.syn0))

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(word2vec['dan'])
        
    def tokenize(self, sentences):
        return [str(sentence).lower().split(" ") for sentence in sentences]

    
    def transform(self, X):
        # Ambil kata-katanya lalu rata-rata
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
vectorizer = MeanEmbeddingVectorizer(w2v)

  after removing the cwd from sys.path.


## Load Data

In [6]:
data_path = "/content/data_source/preprocessed_result/"
train_path = "preprocessed_train.csv"
test_path = "preprocessed_test.csv"

train = pd.read_csv(data_path + train_path)
test = pd.read_csv(data_path + test_path)

train = train.drop(['Unnamed: 0'], axis=1)
test = test.drop(['Unnamed: 0'], axis=1)

In [7]:
train.head()

Unnamed: 0,sent_A,sent_B,category,label,preprocessed_A,preprocessed_B
0,Pada awalnya bangsa Israel hanya terdiri dari ...,Pada awalnya bangsa Yahudi hanya terdiri dari ...,menolak perubahan teks terakhir oleh istimewa ...,0,awal bangsa israel diri satu kelompok keluarga...,awal bangsa yahudi diri satu kelompok keluarga...
1,"Salah satu tekniknya adalah periplus , deskrip...",Bangsa Romawi memberi sumbangan pada pemetaan ...,sejarah geografi,0,salah satu teknik periplus deskripsi labuh dar...,bangsa romawi beri sumbang meta mereka banyak ...
2,Etnolinguistik antropologi adalah ilmu yang me...,Etnolinguistik antropologi adalah ilmu yang me...,definisi anthropologi menurut para ahli,0,etnolinguistik antropologi ilmu ajar lukis cir...,etnolinguistik antropologi ilmu ajar sukusuku ...
3,"Sekarang , tidak ada yang tahu pasti kapan sej...",Sejarah dimulai pada awal penciptaan manusia .,1 penggantian istilah quot prasejarah quot den...,0,sekarang ada tahu kapan sejarah mulai,sejarah mulai awal cipta manusia
4,"Soekarno tidak memilih bahasanya sendiri , Jaw...","Dengan memilih Bahasa Melayu , para pejuang ke...",bahasa indonesia,0,soekarno pilih bahasa sendiri jawa benar bahas...,pilih bahasa melayu juang merdeka satu seperti...


In [8]:
test.head()

Unnamed: 0,sent_A,sent_B,category,label,preprocessed_A,preprocessed_B
0,"Pada 1964 , setelah memegang berbagai jabatan ...","Pada 1964 , setelah memegang berbagai jabatan ...",latar belakang,0,1964 pegang bagai jabat perintah mesir pilih p...,1964 pegang bagai jabat perintah mesir pilih p...
1,Bahasa Indonesia adalah bahasa resmi Negara Re...,Bahasa Indonesia adalah bahasa Resmi Indonesia...,merapikan,1,bahasa indonesia bahasa resmi negara republik ...,bahasa indonesia bahasa resmi indonesia bagi k...
2,Biologi menumpukan kepada ciri-ciri fisikal da...,Ilmu biologi berurusan dengan ciri-ciri fisik ...,menerjemahkan artikel dari bahasa malaysia ke ...,1,biologi tumpu ciriciri fisikal tabiat hidup ma...,ilmu biologi urus ciriciri fisik perilaku makh...
3,Seluruh anggota batalion Bali tersebut tewas s...,Seluruh anggota batalion Bali tersebut hidup s...,sejarah,0,seluruh anggota batalion bal sebut tewas semua...,seluruh anggota batalion bal sebut hidup semua...
4,Transportasi udara dilayani oleh Bandara Inter...,Transportasi udara dilayani oleh Bandara Inter...,transportasi,1,transportasi udara layan bandara internasional...,transportasi udara layan bandara internasional...


# Feature Extraction

## Distance Based: WMD

Calculate the distance of the word embedidng using Word Mover's Distance (WMD)

In [9]:
def wmd(a,b):
  return modelword2vec.wmdistance(a.split(),b.split())

In [11]:
train['wmd'] = train.apply(lambda x: wmd(x['preprocessed_A'], x['preprocessed_B']), axis=1)
test['wmd'] = test.apply(lambda x: wmd(x['preprocessed_A'], x['preprocessed_B']), axis=1)

  


## Token Based: raw_similar_tok, similar_tok, raw_diff_tok, diff_tok, num_tok_T, num_tok_H

*   Percentage of token similarity: raw (raw_similar_tok) & preprocessed (similar_tok)
*   The difference in the number of tokens: raw (raw_diff_tok) & preprocessed (diff_tok)
*   num of token in preprocessed text T (num_sentA) --> in paper: num_tok_T
*   num of token in preprocessed hypothesis H (num_sentB) --> in paper: num_tok_H

In [12]:
def check_token_func(df, similar_type):
  if similar_type == 'preprocessed':
    idx_1 = 'preprocessed_A'
    idx_2 = 'preprocessed_B'
  else:
    idx_1 = 'sent_A'
    idx_2 = 'sent_B'

  list_premis = df[idx_1].values
  list_hipotesis = df[idx_2].values
  list_similar = []
  list_diff = []
  num_premis = []
  num_hipotesis = []

  for p, h in zip(list_premis, list_hipotesis):
    # check percentage of similar words
    p_split = p.split()
    h_split = h.split()

    similar = 0
    for word in h_split:
      if word in p_split:
        similar += 1
    similar_percentage = similar / (len(h_split))

    # check difference of number of tokens in premis and hipotesis
    diff = len(p_split) - len(h_split)
    
    list_diff.append(diff)
    list_similar.append(similar_percentage)
    num_premis.append(len(p_split))
    num_hipotesis.append(len(h_split))
    
  return list_similar, list_diff, num_premis, num_hipotesis

In [14]:
similar_train, diff_train, train_num_A, train_num_B = check_token_func(train, 'preprocessed')
raw_similar_train, raw_diff_train, raw_train_num_A, raw_train_num_B = check_token_func(train, 'raw')

similar_test, diff_test, test_num_A, test_num_B = check_token_func(test, 'preprocessed')
raw_similar_test, raw_diff_test, raw_test_num_A, raw_test_num_B = check_token_func(test, 'raw')

In [15]:
train['similar_tok'] = similar_train
train['raw_similar_tok'] = raw_similar_train
train['diff_tok'] = diff_train
train['raw_diff_tok'] = raw_diff_train
train['num_sentA'] = train_num_A
train['num_sentB'] = train_num_B

test['similar_tok'] = similar_test
test['raw_similar_tok'] = raw_similar_test
test['diff_tok'] = diff_test
test['raw_diff_tok'] = raw_diff_test
test['num_sentA'] = test_num_A
test['num_sentB'] = test_num_B

In [17]:
train.head(3)

Unnamed: 0,sent_A,sent_B,category,label,preprocessed_A,preprocessed_B,wmd,similar_tok,raw_similar_tok,diff_tok,raw_diff_tok,num_sentA,num_sentB
0,Pada awalnya bangsa Israel hanya terdiri dari ...,Pada awalnya bangsa Yahudi hanya terdiri dari ...,menolak perubahan teks terakhir oleh istimewa ...,0,awal bangsa israel diri satu kelompok keluarga...,awal bangsa yahudi diri satu kelompok keluarga...,0.98994,0.944444,0.96,0,0,18,18
1,"Salah satu tekniknya adalah periplus , deskrip...",Bangsa Romawi memberi sumbangan pada pemetaan ...,sejarah geografi,0,salah satu teknik periplus deskripsi labuh dar...,bangsa romawi beri sumbang meta mereka banyak ...,21.54255,0.083333,0.25,24,38,36,12
2,Etnolinguistik antropologi adalah ilmu yang me...,Etnolinguistik antropologi adalah ilmu yang me...,definisi anthropologi menurut para ahli,0,etnolinguistik antropologi ilmu ajar lukis cir...,etnolinguistik antropologi ilmu ajar sukusuku ...,9.921097,1.0,1.0,6,9,15,9


## Token Based: same_unigram, same_bigram

*   num of overlap unigrams on raw data of sent_A & sent_B (same_unigram)
*   num of overlap bigrams on raw data of sent_A & sent_B (same_bigram)


In [18]:
from collections import Counter
from itertools import chain

def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))

In [19]:
# generate unigram and bigram
train['sentA_unigram'] = train['sent_A'].map(lambda x: find_ngrams(x.split(" "), 1))
train['sentB_unigram'] = train['sent_B'].map(lambda x: find_ngrams(x.split(" "), 1))
train['sentA_bigram'] = train['sent_A'].map(lambda x: find_ngrams(x.split(" "), 2))
train['sentB_bigram'] = train['sent_B'].map(lambda x: find_ngrams(x.split(" "), 2))

test['sentA_unigram'] = test['sent_A'].map(lambda x: find_ngrams(x.split(" "), 1))
test['sentB_unigram'] = test['sent_B'].map(lambda x: find_ngrams(x.split(" "), 1))
test['sentA_bigram'] = test['sent_A'].map(lambda x: find_ngrams(x.split(" "), 2))
test['sentB_bigram'] = test['sent_B'].map(lambda x: find_ngrams(x.split(" "), 2))

In [20]:
train['same_unigram'] = train.apply(lambda x: len(set(x['sentA_unigram']) & set(x['sentB_unigram'])), axis=1)
train['same_bigram'] = train.apply(lambda x: len(set(x['sentA_bigram']) & set(x['sentB_bigram'])), axis=1)

test['same_unigram'] = test.apply(lambda x: len(set(x['sentA_unigram']) & set(x['sentB_unigram'])), axis=1)
test['same_bigram'] = test.apply(lambda x: len(set(x['sentA_bigram']) & set(x['sentB_bigram'])), axis=1)

## POS Tag: same_postag, fullmatch_postag, fullmatch_postag_pct

*   number of similar POS Tag on sent_A and sent_B (same_postag)
*   Number and percentage of similar pair of POS Tag-token (fullmatch_postag, fullmatch_postag_pct)


In [21]:
# Load Pretrained model
ct = CRFTagger()
pretrained_path = "all_indo_man_tag_corpus_model.crf.tagger"
ct.set_model_file(resource_path + pretrained_path)

In [22]:
def postag_func(sentence):
  s_split = sentence.split()

  postag = ct.tag_sents([s_split])
  return postag

In [23]:
def generate_postag_func(list_sentences):
  list_postag = []
  for s in list_sentences:
    postag = postag_func(s)
    postag = postag[0]
    list_postag.append(postag)
  
  return list_postag

In [24]:
train['sentA_postag'] = generate_postag_func(train['sent_A'])
train['sentB_postag'] = generate_postag_func(train['sent_B'])

test['sentA_postag'] = generate_postag_func(test['sent_A'])
test['sentB_postag'] = generate_postag_func(test['sent_B'])

In [25]:
train['sentA_postag_only'] = train['sentA_postag'].map(lambda l: [b for a, b in l])
train['sentB_postag_only'] = train['sentB_postag'].map(lambda l: [b for a, b in l])

test['sentA_postag_only'] = test['sentA_postag'].map(lambda l: [b for a, b in l])
test['sentB_postag_only'] = test['sentB_postag'].map(lambda l: [b for a, b in l])

Calculate the number of the same POS Tag

In [26]:
train['same_postag'] = [len(list((Counter(a) & Counter(b)).elements())) for a,b in zip(train["sentA_postag_only"], train["sentB_postag_only"])]
test['same_postag'] = [len(list((Counter(a) & Counter(b)).elements())) for a,b in zip(test["sentA_postag_only"], test["sentB_postag_only"])]

Count pair similarity (full match, POS tag and token) between sent A and sent B and its percentage

In [27]:
def count_match_func(premis, hipotesis):
  list_match = []
  for p, h in zip(premis, hipotesis):
    match = 0
    for i in h:
      if i in p:
        match += 1
    list_match.append(match)
  
  return list_match

In [28]:
def similar_postag_func(num_similar, tag):
  similar_percentage = []
  for n, t in zip(num_similar, tag):
    similar = n/len(t)
    similar_percentage.append(similar)
  
  return similar_percentage

In [29]:
train['match_postag'] = count_match_func(train['sentA_postag'].values, train['sentB_postag'].values)
test['match_postag'] = count_match_func(test['sentA_postag'].values, test['sentB_postag'].values)

train['match_postag_pct'] = similar_postag_func(train['match_postag'].values, train['sentB_postag'].values)
test['match_postag_pct'] = similar_postag_func(test['match_postag'].values, test['sentB_postag'].values)

## Negation: num of occurrences in A, num of occurrences in B, difference num of occurrences

The number of occurrences of each negation words in A and B, also the difference number of occurrences. The negation words are: "tidak", "belum", "tak", "bukan", dan "jangan". 

In [30]:
train['count_tidak_sentA'] = train['sent_A'].str.count('tidak')
train['count_belum_sentA'] = train['sent_A'].str.count('belum')
train['count_tak_sentA'] = train['sent_A'].str.count('tak')
train['count_bukan_sentA'] = train['sent_A'].str.count('bukan')
train['count_jangan_sentA'] = train['sent_A'].str.count('jangan')
train['count_tidak_sentB'] = train['sent_B'].str.count('tidak')
train['count_belum_sentB'] = train['sent_B'].str.count('belum')
train['count_tak_sentB'] = train['sent_B'].str.count('tak')
train['count_bukan_sentB'] = train['sent_B'].str.count('bukan')
train['count_jangan_sentB'] = train['sent_B'].str.count('jangan')

test['count_tidak_sentA'] = test['sent_A'].str.count('tidak')
test['count_belum_sentA'] = test['sent_A'].str.count('belum')
test['count_tak_sentA'] = test['sent_A'].str.count('tak')
test['count_bukan_sentA'] = test['sent_A'].str.count('bukan')
test['count_jangan_sentA'] = test['sent_A'].str.count('jangan')
test['count_tidak_sentB'] = test['sent_B'].str.count('tidak')
test['count_belum_sentB'] = test['sent_B'].str.count('belum')
test['count_tak_sentB'] = test['sent_B'].str.count('tak')
test['count_bukan_sentB'] = test['sent_B'].str.count('bukan')
test['count_jangan_sentB'] = test['sent_B'].str.count('jangan')

In [31]:
train['diff_tidak'] = np.abs(train['count_tidak_sentA']-train['count_tidak_sentB'])
train['diff_belum'] = np.abs(train['count_belum_sentA']-train['count_belum_sentB'])
train['diff_tak'] = np.abs(train['count_tak_sentA']-train['count_tak_sentB'])
train['diff_bukan'] = np.abs(train['count_bukan_sentA']-train['count_bukan_sentB'])
train['diff_jangan'] = np.abs(train['count_jangan_sentA']-train['count_jangan_sentB'])

test['diff_tidak'] = np.abs(test['count_tidak_sentA']-test['count_tidak_sentB'])
test['diff_belum'] = np.abs(test['count_belum_sentA']-test['count_belum_sentB'])
test['diff_tak'] = np.abs(test['count_tak_sentA']-test['count_tak_sentB'])
test['diff_bukan'] = np.abs(test['count_bukan_sentA']-test['count_bukan_sentB'])
test['diff_jangan'] = np.abs(test['count_jangan_sentA']-test['count_jangan_sentB'])

## Distance Base: Levenshtein Distance

In [32]:
def levenshtein_distance(a, b):
    """Return the Levenshtein edit distance between two strings *a* and *b*."""
    if a == b:
        return 0
    if len(a) < len(b):
        a, b = b, a
    if not a:
        return len(b)
    previous_row = range(len(b) + 1)
    for i, column1 in enumerate(a):
        current_row = [i + 1]
        for j, column2 in enumerate(b):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (column1 != column2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1] 

In [34]:
train['lev_dist'] = [levenshtein_distance(a,b) for a, b in zip(train['preprocessed_A'], train['preprocessed_B'])]
test['lev_dist'] = [levenshtein_distance(a,b) for a, b in zip(test['preprocessed_A'], test['preprocessed_B'])]

## BLEU Score: bleu_1gram, bleu_2gram, bleu_3gram, bleu_4gram

In [35]:
# cumulative BLEU scores
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

smoothie = SmoothingFunction().method4

train['bleu_1gram'] = train.apply(lambda row: sentence_bleu(row['preprocessed_A'],row['preprocessed_B'],weights=(1, 0, 0, 0),smoothing_function=smoothie), axis=1)
train['bleu_2gram'] = train.apply(lambda row: sentence_bleu(row['preprocessed_A'],row['preprocessed_B'],weights=(0.5, 0.5, 0, 0),smoothing_function=smoothie), axis=1)
train['bleu_3gram'] = train.apply(lambda row: sentence_bleu(row['preprocessed_A'],row['preprocessed_B'],weights=(0.33, 0.33, 0.33, 0),smoothing_function=smoothie), axis=1)
train['bleu_4gram'] = train.apply(lambda row: sentence_bleu(row['preprocessed_A'],row['preprocessed_B'],weights=(0.25, 0.25, 0.25, 0.25),smoothing_function=smoothie), axis=1)

test['bleu_1gram'] = test.apply(lambda row: sentence_bleu(row['preprocessed_A'],row['preprocessed_B'],weights=(1, 0, 0, 0),smoothing_function=smoothie), axis=1)
test['bleu_2gram'] = test .apply(lambda row: sentence_bleu(row['preprocessed_A'],row['preprocessed_B'],weights=(0.5, 0.5, 0, 0),smoothing_function=smoothie), axis=1)
test['bleu_3gram'] = test.apply(lambda row: sentence_bleu(row['preprocessed_A'],row['preprocessed_B'],weights=(0.33, 0.33, 0.33, 0),smoothing_function=smoothie), axis=1)
test['bleu_4gram'] = test.apply(lambda row: sentence_bleu(row['preprocessed_A'],row['preprocessed_B'],weights=(0.25, 0.25, 0.25, 0.25),smoothing_function=smoothie), axis=1)

## Distance Based: Jaccard Similarity

In [37]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [38]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def extract_text_similarity_jaccard (text1, text2):
    words_text1 = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text1)]
    words_text2 = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text2)]
    nr = len(set(words_text1).intersection(set(words_text2)))
    dr = len(set(words_text1).union(set(words_text2)))
    jaccard_sim = nr/dr
    return jaccard_sim

In [40]:
train['jaccard_sim'] = [extract_text_similarity_jaccard(a, b) for a, b in zip(train['preprocessed_A'], train['preprocessed_B'])]
test['jaccard_sim'] = [extract_text_similarity_jaccard(a, b) for a, b in zip(test['preprocessed_A'], test['preprocessed_B'])]

In [41]:
train.head(3)

Unnamed: 0,sent_A,sent_B,category,label,preprocessed_A,preprocessed_B,wmd,similar_tok,raw_similar_tok,diff_tok,raw_diff_tok,num_sentA,num_sentB,sentA_unigram,sentB_unigram,sentA_bigram,sentB_bigram,same_unigram,same_bigram,sentA_postag,sentB_postag,sentA_postag_only,sentB_postag_only,same_postag,match_postag,match_postag_pct,count_tidak_sentA,count_belum_sentA,count_tak_sentA,count_bukan_sentA,count_jangan_sentA,count_tidak_sentB,count_belum_sentB,count_tak_sentB,count_bukan_sentB,count_jangan_sentB,diff_tidak,diff_belum,diff_tak,diff_bukan,diff_jangan,lev_dist,bleu_1gram,bleu_2gram,bleu_3gram,bleu_4gram,jaccard_sim
0,Pada awalnya bangsa Israel hanya terdiri dari ...,Pada awalnya bangsa Yahudi hanya terdiri dari ...,menolak perubahan teks terakhir oleh istimewa ...,0,awal bangsa israel diri satu kelompok keluarga...,awal bangsa yahudi diri satu kelompok keluarga...,0.98994,0.944444,0.96,0,0,18,18,"[(Pada,), (awalnya,), (bangsa,), (Israel,), (h...","[(Pada,), (awalnya,), (bangsa,), (Yahudi,), (h...","[(Pada, awalnya), (awalnya, bangsa), (bangsa, ...","[(Pada, awalnya), (awalnya, bangsa), (bangsa, ...",21,21,"[(Pada, IN), (awalnya, NN), (bangsa, NN), (Isr...","[(Pada, IN), (awalnya, NN), (bangsa, NN), (Yah...","[IN, NN, NN, NNP, RB, VB, IN, CD, NN, NN, IN, ...","[IN, NN, NN, NNP, RB, VB, IN, CD, NN, NN, IN, ...",25,24,0.96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0.203704,0.31386,0.321506,0.298068,0.882353
1,"Salah satu tekniknya adalah periplus , deskrip...",Bangsa Romawi memberi sumbangan pada pemetaan ...,sejarah geografi,0,salah satu teknik periplus deskripsi labuh dar...,bangsa romawi beri sumbang meta mereka banyak ...,21.54255,0.083333,0.25,24,38,36,12,"[(Salah,), (satu,), (tekniknya,), (adalah,), (...","[(Bangsa,), (Romawi,), (memberi,), (sumbangan,...","[(Salah, satu), (satu, tekniknya), (tekniknya,...","[(Bangsa, Romawi), (Romawi, memberi), (memberi...",4,0,"[(Salah, NN), (satu, CD), (tekniknya, RB), (ad...","[(Bangsa, NNP), (Romawi, NNP), (memberi, VB), ...","[NN, CD, RB, VB, NN, Z, NN, IN, NN, CC, NN, NN...","[NNP, NNP, VB, NN, IN, NN, SC, PRP, CD, VB, NN...",15,4,0.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,191,0.220779,0.32037,0.323028,0.297627,0.026316
2,Etnolinguistik antropologi adalah ilmu yang me...,Etnolinguistik antropologi adalah ilmu yang me...,definisi anthropologi menurut para ahli,0,etnolinguistik antropologi ilmu ajar lukis cir...,etnolinguistik antropologi ilmu ajar sukusuku ...,9.921097,1.0,1.0,6,9,15,9,"[(Etnolinguistik,), (antropologi,), (adalah,),...","[(Etnolinguistik,), (antropologi,), (adalah,),...","[(Etnolinguistik, antropologi), (antropologi, ...","[(Etnolinguistik, antropologi), (antropologi, ...",14,13,"[(Etnolinguistik, NN), (antropologi, NN), (ada...","[(Etnolinguistik, NN), (antropologi, NN), (ada...","[NN, NN, VB, NN, SC, VB, NN, IN, NN, CC, NN, N...","[NN, NN, VB, NN, SC, VB, NN, NN, SC, VB, IN, N...",15,15,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43,0.268657,0.350317,0.341297,0.309585,0.642857


# Modelling Preparation

## Normalization


In [42]:
from scipy.stats import zscore
cols_norm = ['wmd',
        'similar_tok',
        'raw_similar_tok',
        'diff_tok',
        'raw_diff_tok',
        'num_sentA',
        'num_sentB',
        'same_unigram',
        'same_bigram',
        'same_postag',
        'match_postag',
        'match_postag_pct',
        'lev_dist',
        'bleu_1gram',
        'bleu_2gram',
        'bleu_3gram',
        'bleu_4gram',
        'jaccard_sim']

norm_train = train.copy()
norm_train[cols_norm] = norm_train[cols_norm].apply(zscore)

norm_test = test.copy()
norm_test[cols_norm] = norm_test[cols_norm].apply(zscore)

## Split Data

In [43]:
norm_train.head()

Unnamed: 0,sent_A,sent_B,category,label,preprocessed_A,preprocessed_B,wmd,similar_tok,raw_similar_tok,diff_tok,raw_diff_tok,num_sentA,num_sentB,sentA_unigram,sentB_unigram,sentA_bigram,sentB_bigram,same_unigram,same_bigram,sentA_postag,sentB_postag,sentA_postag_only,sentB_postag_only,same_postag,match_postag,match_postag_pct,count_tidak_sentA,count_belum_sentA,count_tak_sentA,count_bukan_sentA,count_jangan_sentA,count_tidak_sentB,count_belum_sentB,count_tak_sentB,count_bukan_sentB,count_jangan_sentB,diff_tidak,diff_belum,diff_tak,diff_bukan,diff_jangan,lev_dist,bleu_1gram,bleu_2gram,bleu_3gram,bleu_4gram,jaccard_sim
0,Pada awalnya bangsa Israel hanya terdiri dari ...,Pada awalnya bangsa Yahudi hanya terdiri dari ...,menolak perubahan teks terakhir oleh istimewa ...,0,awal bangsa israel diri satu kelompok keluarga...,awal bangsa yahudi diri satu kelompok keluarga...,-0.847933,0.611912,0.661902,-0.645613,-0.66471,0.310952,0.710658,"[(Pada,), (awalnya,), (bangsa,), (Israel,), (h...","[(Pada,), (awalnya,), (bangsa,), (Yahudi,), (h...","[(Pada, awalnya), (awalnya, bangsa), (bangsa, ...","[(Pada, awalnya), (awalnya, bangsa), (bangsa, ...",0.772835,0.74649,"[(Pada, IN), (awalnya, NN), (bangsa, NN), (Isr...","[(Pada, IN), (awalnya, NN), (bangsa, NN), (Yah...","[IN, NN, NN, NNP, RB, VB, IN, CD, NN, NN, IN, ...","[IN, NN, NN, NNP, RB, VB, IN, CD, NN, NN, IN, ...",0.676499,0.728672,0.681871,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.702715,-0.451398,-0.327007,-0.234414,-0.157265,0.772372
1,"Salah satu tekniknya adalah periplus , deskrip...",Bangsa Romawi memberi sumbangan pada pemetaan ...,sejarah geografi,0,salah satu teknik periplus deskripsi labuh dar...,bangsa romawi beri sumbang meta mereka banyak ...,1.814214,-2.060941,-1.733538,5.72208,5.740784,2.858967,-0.210281,"[(Salah,), (satu,), (tekniknya,), (adalah,), (...","[(Bangsa,), (Romawi,), (memberi,), (sumbangan,...","[(Salah, satu), (satu, tekniknya), (tekniknya,...","[(Bangsa, Romawi), (Romawi, memberi), (memberi...",-1.138638,-1.173056,"[(Salah, NN), (satu, CD), (tekniknya, RB), (ad...","[(Bangsa, NNP), (Romawi, NNP), (memberi, VB), ...","[NN, CD, RB, VB, NN, Z, NN, IN, NN, CC, NN, NN...","[NNP, NNP, VB, NN, IN, NN, SC, PRP, CD, VB, NN...",-0.35318,-1.066087,-1.711456,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.392081,-0.270615,-0.201895,-0.183791,-0.180667,-1.780867
2,Etnolinguistik antropologi adalah ilmu yang me...,Etnolinguistik antropologi adalah ilmu yang me...,definisi anthropologi menurut para ahli,0,etnolinguistik antropologi ilmu ajar lukis cir...,etnolinguistik antropologi ilmu ajar sukusuku ...,0.308906,0.784355,0.796857,0.94631,0.85238,-0.113717,-0.67075,"[(Etnolinguistik,), (antropologi,), (adalah,),...","[(Etnolinguistik,), (antropologi,), (adalah,),...","[(Etnolinguistik, antropologi), (antropologi, ...","[(Etnolinguistik, antropologi), (antropologi, ...",-0.014242,0.015234,"[(Etnolinguistik, NN), (antropologi, NN), (ada...","[(Etnolinguistik, NN), (antropologi, NN), (ada...","[NN, NN, VB, NN, SC, VB, NN, IN, NN, CC, NN, N...","[NN, NN, VB, NN, SC, VB, NN, NN, SC, VB, IN, N...",-0.35318,-0.078969,0.816706,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.316245,0.236277,0.373584,0.423815,0.453302,0.058046
3,"Sekarang , tidak ada yang tahu pasti kapan sej...",Sejarah dimulai pada awal penciptaan manusia .,1 penggantian istilah quot prasejarah quot den...,0,sekarang ada tahu kapan sejarah mulai,sejarah mulai awal cipta manusia,1.28311,-1.07802,-1.613043,-0.380293,0.009552,-1.387724,-1.284709,"[(Sekarang,), (,,), (tidak,), (ada,), (yang,),...","[(Sejarah,), (dimulai,), (pada,), (awal,), (pe...","[(Sekarang, ,), (,, tidak), (tidak, ada), (ada...","[(Sejarah, dimulai), (dimulai, pada), (pada, a...",-1.363517,-1.173056,"[(Sekarang, NN), (,, Z), (tidak, NEG), (ada, V...","[(Sejarah, NN), (dimulai, VB), (pada, IN), (aw...","[NN, Z, NEG, VB, SC, VB, RB, NN, NN, VB, Z]","[NN, VB, IN, NN, NN, NN, Z]",-1.382858,-1.245562,-1.591068,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-0.207005,2.023867,1.774324,1.611511,1.480318,-1.196551
4,"Soekarno tidak memilih bahasanya sendiri , Jaw...","Dengan memilih Bahasa Melayu , para pejuang ke...",bahasa indonesia,0,soekarno pilih bahasa sendiri jawa benar bahas...,pilih bahasa melayu juang merdeka satu seperti...,1.567656,-1.543614,-1.409128,0.150348,0.346683,0.452509,0.403678,"[(Soekarno,), (tidak,), (memilih,), (bahasanya...","[(Dengan,), (memilih,), (Bahasa,), (Melayu,), ...","[(Soekarno, tidak), (tidak, memilih), (memilih...","[(Dengan, memilih), (memilih, Bahasa), (Bahasa...",-0.57644,-0.990242,"[(Soekarno, NNP), (tidak, NEG), (memilih, VB),...","[(Dengan, SC), (memilih, VB), (Bahasa, NNP), (...","[NNP, NEG, VB, NN, NN, Z, NNP, Z, SC, RB, RB, ...","[SC, VB, NNP, NNP, Z, DT, NN, NN, PR, RB, IN, ...",0.058692,-0.617397,-1.387333,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1.335204,-0.673852,-0.658476,-0.635582,-0.616625,-1.400491


In [48]:
# Define all column
cols = ['wmd',
        'similar_tok',
        'raw_similar_tok',
        'diff_tok',
        'raw_diff_tok',
        'num_sentA',
        'num_sentB',
        'same_unigram',
        'same_bigram',
        'same_postag',
        'match_postag',
        'match_postag_pct',
        'count_tidak_sentA',
        'count_belum_sentA',
        'count_tak_sentA',
        'count_bukan_sentA',
        'count_jangan_sentA',
        'count_tidak_sentB',
        'count_belum_sentB',
        'count_tak_sentB',
        'count_bukan_sentB',
        'count_jangan_sentB',
        'diff_tidak',
        'diff_belum',
        'diff_tak',
        'diff_bukan',
        'diff_jangan',
        'lev_dist',
        'bleu_1gram',
        'bleu_2gram',
        'bleu_3gram',
        'bleu_4gram',
        'jaccard_sim']

cols_model = cols
cols_model.append('preprocessed_A')
cols_model.append('preprocessed_B')

In [49]:
x_train = norm_train[cols_model].values
x_test = norm_test[cols_model].values

y_train = norm_train['label'].values
y_test = norm_test['label'].values

## Distance Based: Euclidean Distance

In [50]:
# concatenate premis and hipotesis to get new corpus
def get_corpus_func(sentences):
  n = len(sentences)

  corpus = []
  corpus_premis = []
  corpus_hipotesis = []
  corpus_label = []

  for row in sentences:
    p = row[-2]
    h = row[-1]
    c = p + " " + h

    corpus_premis.append(p)
    corpus_hipotesis.append(h)
    corpus.append(c)
  
  return corpus_premis, corpus_hipotesis, corpus

In [51]:
# function to calculate the euclidean distance
def get_feature_func(tfidf_premis, tfidf_hipotesis):
  # Euclidean distance between TFIDF vectors for sentence1 and sentence2
  tfidf_distance = tfidf_premis - tfidf_hipotesis
  tfidf_feature = [np.linalg.norm(tfidf_distance[ind].toarray()) for ind in range(tfidf_distance.shape[0])]
  tfidf_feature_array = np.asarray(tfidf_feature).reshape(-1, 1)

  return tfidf_feature_array

In [52]:
# get corpus for train and test data
train_A, train_B, train_corpus = get_corpus_func(x_train)
test_A, test_B, test_corpus = get_corpus_func(x_test)

In [53]:
# extract TF-IDF feature using default parameter
# fit TF-IDF by combine corpus (premis + hipotesis) of train data
vectorizer = TfidfVectorizer()
vectorizer.fit(train_corpus)

train_tfidf_A = vectorizer.transform(train_A)
train_tfidf_B = vectorizer.transform(train_B)

test_tfidf_A = vectorizer.transform(test_A)
test_tfidf_B = vectorizer.transform(test_B)

In [54]:
# get calculated feature
train_feature = get_feature_func(train_tfidf_A, train_tfidf_B)
test_feature = get_feature_func(test_tfidf_A, test_tfidf_B)

## Token Based: L2 Norm for Vectorize Bigram

In [55]:
import collections
import nltk
import os
from sklearn import feature_extraction

def extract_features(corpus, hipotesis_corpus):
    '''Extract TF-IDF features from corpus'''
   
    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(ngram_range=(2, 2))
    processed_corpus = count_vectorizer.fit_transform(hipotesis_corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus

In [56]:
ngram = extract_features(train_corpus, train_B)
bigram_train = [np.linalg.norm(ngram[ind].toarray()) for ind in range(ngram.shape[0])]
bigram_array_train = np.asarray(bigram_train).reshape(-1, 1)

ngram_test = extract_features(train_corpus, test_B)
bigram_test = [np.linalg.norm(ngram_test[ind].toarray()) for ind in range(ngram_test.shape[0])]
bigram_array_test = np.asarray(bigram_test).reshape(-1, 1)

## Concat Features
Define features for each group

In [58]:
cols_dist = ['wmd', 'lev_dist', 'jaccard_sim']
cols_tok = ['similar_tok', 'raw_similar_tok', 'diff_tok', 'raw_diff_tok', 'num_sentA', 'num_sentB', 'same_unigram', 'same_bigram']
cols_pos = ['same_postag', 'match_postag', 'match_postag_pct']
cols_neg = ['count_tidak_sentA', 'count_belum_sentA', 'count_tak_sentA', 'count_bukan_sentA', 'count_jangan_sentA', 
            'count_tidak_sentB', 'count_belum_sentB', 'count_tak_sentB', 'count_bukan_sentB', 'count_jangan_sentB',
            'diff_tidak', 'diff_belum', 'diff_tak', 'diff_bukan', 'diff_jangan']
cols_bleu = ['bleu_1gram', 'bleu_2gram', 'bleu_3gram', 'bleu_4gram'] 

In [59]:
x_train_dist = train[cols_dist].values
x_test_dist = test[cols_dist].values

x_train_dist = np.append(x_train_dist, train_feature, axis=1)
x_test_dist = np.append(x_test_dist, test_feature, axis=1)

x_train_tok = train[cols_tok].values
x_test_tok = test[cols_tok].values

x_train_tok = np.append(x_train_tok, bigram_array_train, axis=1)
x_test_tok = np.append(x_test_tok, bigram_array_test, axis=1)

x_train_pos = train[cols_pos].values
x_test_pos = test[cols_pos].values

x_train_neg = train[cols_neg].values
x_test_neg = test[cols_neg].values

x_train_bl = train[cols_bleu].values
x_test_bl = test[cols_bleu].values

# Training & Testing Scenario

In [62]:
import time
import sys

## All Features

In [60]:
x_train_1 = x_train_dist
x_test_1 = x_test_dist

x_train_1 = np.append(x_train_1, x_train_tok, axis=1)
x_test_1 = np.append(x_test_1, x_test_tok, axis=1)

x_train_1 = np.append(x_train_1, x_train_pos, axis=1)
x_test_1 = np.append(x_test_1, x_test_pos, axis=1)

x_train_1 = np.append(x_train_1, x_train_neg, axis=1)
x_test_1 = np.append(x_test_1, x_test_neg, axis=1)

x_train_1 = np.append(x_train_1, x_train_bl, axis=1)
x_test_1 = np.append(x_test_1, x_test_bl, axis=1)

len(x_train_1[0])

35

### Training & Testing

In [None]:
# Logistic Regression
start = time.time()
logreg_model1 = LogisticRegression(random_state=9, max_iter=1000, solver='lbfgs', multi_class='auto').fit(x_train_1, y_train)
stop = time.time()
start1 = time.time()
y_pred_logreg1 = logreg_model1.predict(x_test_1)
stop1 = time.time()

print(f"Training Time: {stop - start}")
print(f"Preidction Time: {stop1 - start1}")
print("Size: "+str(sys.getsizeof(logreg_model1))+ "")

Training Time: 0.24411606788635254
Preidction Time: 0.0002703666687011719
Size: 64


In [None]:
# SVM
start = time.time()
svm_model1 = svm.SVC(random_state=11, kernel = 'rbf').fit(x_train_1, y_train)
stop = time.time()
start1 = time.time()
y_pred_svm1 = svm_model1.predict(x_test_1)
stop1 = time.time()

print(f"Training Time: {stop - start}")
print(f"Prediction Time: {stop1 - start1}")
print("Size: "+str(sys.getsizeof(svm_model1))+ "")

Training Time: 0.008749008178710938
Prediction Time: 0.0012502670288085938
Size: 64


In [None]:
# MLP
start = time.time()
mlp_model1 = MLPClassifier(random_state=13, max_iter=1000).fit(x_train_1, y_train)
stop = time.time()
start1 = time.time()
y_pred_mlp1 = mlp_model1.predict(x_test_1)
stop1 = time.time()

print(f"Training Time: {stop - start}")
print(f"Prediction Time: {stop1 - start1}")
print("Size: "+str(sys.getsizeof(mlp_model1))+ "")

Training Time: 1.2142772674560547
Prediction Time: 0.0006837844848632812
Size: 64


### Evaluation

In [None]:
# Logistic Regression
print(classification_report(y_test, y_pred_logreg1, digits=4))

              precision    recall  f1-score   support

           0     0.8276    0.6154    0.7059        39
           1     0.7887    0.9180    0.8485        61

    accuracy                         0.8000       100
   macro avg     0.8082    0.7667    0.7772       100
weighted avg     0.8039    0.8000    0.7929       100



In [None]:
# SVM
print(classification_report(y_test, y_pred_svm1, digits=4))

              precision    recall  f1-score   support

           0     0.8889    0.6154    0.7273        39
           1     0.7945    0.9508    0.8657        61

    accuracy                         0.8200       100
   macro avg     0.8417    0.7831    0.7965       100
weighted avg     0.8313    0.8200    0.8117       100



In [None]:
# MLP
print(classification_report(y_test, y_pred_mlp1, digits=4))

              precision    recall  f1-score   support

           0     0.7879    0.6667    0.7222        39
           1     0.8060    0.8852    0.8438        61

    accuracy                         0.8000       100
   macro avg     0.7969    0.7760    0.7830       100
weighted avg     0.7989    0.8000    0.7964       100



## Remove Distance Based

In [None]:
x_train_2 = x_train_tok
x_valid_2 = x_valid_tok
x_test_2 = x_test_tok

x_train_2 = np.append(x_train_2, x_train_pos, axis=1)
x_test_2 = np.append(x_test_2, x_test_pos, axis=1)

x_train_2 = np.append(x_train_2, x_train_neg, axis=1)
x_test_2 = np.append(x_test_2, x_test_neg, axis=1)

x_train_2 = np.append(x_train_2, x_train_bl, axis=1)
x_test_2 = np.append(x_test_2, x_test_bl, axis=1)

len(x_train_2[0])

31

### Training & Testing

In [None]:
# Logistic Regression
start = time.time()
logreg_model2 = LogisticRegression(random_state=9, max_iter=1000, solver='lbfgs', multi_class='auto').fit(x_train_2, y_train)
stop = time.time()
y_pred_logreg2 = logreg_model2.predict(x_test_2)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(logreg_model2))+ "")

Training Time: 0.12523984909057617
Size: 64


In [None]:
# SVM
start = time.time()
svm_model2 = svm.SVC(random_state=11, kernel = 'rbf').fit(x_train_2, y_train)
stop = time.time()
y_pred_svm2 = svm_model2.predict(x_test_2)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(svm_model2))+ "")

Training Time: 0.009885549545288086
Size: 64


In [None]:
# MLP
start = time.time()
mlp_model2 = MLPClassifier(random_state=13, max_iter=1000).fit(x_train_2, y_train)
stop = time.time()
y_pred_mlp2 = mlp_model2.predict(x_test_2)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(mlp_model2))+ "")

Training Time: 1.0468215942382812
Size: 64


### Evaluation

In [None]:
# Logistic Regression
print(classification_report(y_test, y_pred_logreg2, digits=4))

              precision    recall  f1-score   support

           0     0.8571    0.6154    0.7164        39
           1     0.7917    0.9344    0.8571        61

    accuracy                         0.8100       100
   macro avg     0.8244    0.7749    0.7868       100
weighted avg     0.8172    0.8100    0.8023       100



In [None]:
# SVM
print(classification_report(y_test, y_pred_svm2, digits=4))

              precision    recall  f1-score   support

           0     0.8889    0.6154    0.7273        39
           1     0.7945    0.9508    0.8657        61

    accuracy                         0.8200       100
   macro avg     0.8417    0.7831    0.7965       100
weighted avg     0.8313    0.8200    0.8117       100



In [None]:
# MLP
print(classification_report(y_test, y_pred_mlp2, digits=4))

              precision    recall  f1-score   support

           0     0.8214    0.5897    0.6866        39
           1     0.7778    0.9180    0.8421        61

    accuracy                         0.7900       100
   macro avg     0.7996    0.7539    0.7643       100
weighted avg     0.7948    0.7900    0.7814       100



## Remove Token Based

In [None]:
x_train_3 = x_train_dist
x_test_3 = x_test_dist

x_train_3 = np.append(x_train_3, x_train_pos, axis=1)
x_test_3 = np.append(x_test_3, x_test_pos, axis=1)

x_train_3 = np.append(x_train_3, x_train_neg, axis=1)
x_test_3 = np.append(x_test_3, x_test_neg, axis=1)

x_train_3 = np.append(x_train_3, x_train_bl, axis=1)
x_test_3 = np.append(x_test_3, x_test_bl, axis=1)

len(x_train_3[0])

26

### Training & Testing

In [None]:
# Logistic Regression
start = time.time()
logreg_model3 = LogisticRegression(random_state=9, max_iter=1000, solver='lbfgs', multi_class='auto').fit(x_train_3, y_train)
stop = time.time()
y_pred_logreg3 = logreg_model3.predict(x_test_3)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(logreg_model3))+ "")

Training Time: 0.0652322769165039
Size: 64


In [None]:
# SVM
start = time.time()
svm_model3 = svm.SVC(random_state=11, kernel = 'rbf').fit(x_train_3, y_train)
stop = time.time()
y_pred_svm3 = svm_model3.predict(x_test_3)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(svm_model3))+ "")

Training Time: 0.008715629577636719
Size: 64


In [None]:
# MLP
start = time.time()
mlp_model3 = MLPClassifier(random_state=13, max_iter=1000).fit(x_train_3, y_train)
stop = time.time()
y_pred_mlp3 = mlp_model3.predict(x_test_3)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(mlp_model3))+ "")

Training Time: 0.9032843112945557
Size: 64


### Evaluation

In [None]:
# Logistic Regression
print(classification_report(y_test, y_pred_logreg3, digits=4))

              precision    recall  f1-score   support

           0     0.7667    0.5897    0.6667        39
           1     0.7714    0.8852    0.8244        61

    accuracy                         0.7700       100
   macro avg     0.7690    0.7375    0.7455       100
weighted avg     0.7696    0.7700    0.7629       100



In [None]:
# SVM
print(classification_report(y_test, y_pred_svm3, digits=4))

              precision    recall  f1-score   support

           0     0.8214    0.5897    0.6866        39
           1     0.7778    0.9180    0.8421        61

    accuracy                         0.7900       100
   macro avg     0.7996    0.7539    0.7643       100
weighted avg     0.7948    0.7900    0.7814       100



In [None]:
# MLP
print(classification_report(y_test, y_pred_mlp3, digits=4))

              precision    recall  f1-score   support

           0     0.8065    0.6410    0.7143        39
           1     0.7971    0.9016    0.8462        61

    accuracy                         0.8000       100
   macro avg     0.8018    0.7713    0.7802       100
weighted avg     0.8007    0.8000    0.7947       100



## Remove POS Tag 

In [None]:
x_train_4 = x_train_dist
x_test_4 = x_test_dist

x_train_4 = np.append(x_train_4, x_train_tok, axis=1)
x_test_4 = np.append(x_test_4, x_test_tok, axis=1)

x_train_4 = np.append(x_train_4, x_train_neg, axis=1)
x_test_4 = np.append(x_test_4, x_test_neg, axis=1)

x_train_4 = np.append(x_train_4, x_train_bl, axis=1)
x_test_4 = np.append(x_test_4, x_test_bl, axis=1)

len(x_train_4[0])

32

### Training & Testing

In [None]:
# Logistic Regression
start = time.time()
logreg_model4 = LogisticRegression(random_state=9, max_iter=1000, solver='lbfgs', multi_class='auto').fit(x_train_4, y_train)
stop = time.time()
y_pred_logreg4 = logreg_model4.predict(x_test_4)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(logreg_model4))+ "")

Training Time: 0.1703965663909912
Size: 64


In [None]:
# SVM
start = time.time()
svm_model4 = svm.SVC(random_state=11, kernel = 'rbf').fit(x_train_4, y_train)
stop = time.time()
y_pred_svm4 = svm_model4.predict(x_test_4)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(svm_model4))+ "")

Training Time: 0.009184122085571289
Size: 64


In [None]:
# MLP
start = time.time()
mlp_model4 = MLPClassifier(random_state=13, max_iter=1000).fit(x_train_4, y_train)
stop = time.time()
y_pred_mlp4 = mlp_model4.predict(x_test_4)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(mlp_model4))+ "")

Training Time: 1.2020533084869385
Size: 64


### Evaluation

In [None]:
# Logistic Regression
print(classification_report(y_test, y_pred_logreg4, digits=4))

              precision    recall  f1-score   support

           0     0.8571    0.6154    0.7164        39
           1     0.7917    0.9344    0.8571        61

    accuracy                         0.8100       100
   macro avg     0.8244    0.7749    0.7868       100
weighted avg     0.8172    0.8100    0.8023       100



In [None]:
# SVM
print(classification_report(y_test, y_pred_svm4, digits=4))

              precision    recall  f1-score   support

           0     0.8571    0.6154    0.7164        39
           1     0.7917    0.9344    0.8571        61

    accuracy                         0.8100       100
   macro avg     0.8244    0.7749    0.7868       100
weighted avg     0.8172    0.8100    0.8023       100



In [None]:
# MLP
print(classification_report(y_test, y_pred_mlp4, digits=4))

              precision    recall  f1-score   support

           0     0.7222    0.6667    0.6933        39
           1     0.7969    0.8361    0.8160        61

    accuracy                         0.7700       100
   macro avg     0.7595    0.7514    0.7547       100
weighted avg     0.7678    0.7700    0.7682       100



## Remove Negation Based

In [None]:
x_train_5 = x_train_dist
x_valid_5 = x_valid_dist
x_test_5 = x_test_dist

x_train_5 = np.append(x_train_5, x_train_tok, axis=1)
x_test_5 = np.append(x_test_5, x_test_tok, axis=1)

x_train_5 = np.append(x_train_5, x_train_pos, axis=1)
x_test_5 = np.append(x_test_5, x_test_pos, axis=1)

x_train_5 = np.append(x_train_5, x_train_bl, axis=1)
x_test_5 = np.append(x_test_5, x_test_bl, axis=1)

len(x_train_5[0])

20

### Training & Testing

In [None]:
# Logistic Regression
start = time.time()
logreg_model5 = LogisticRegression(random_state=9, max_iter=1000, solver='lbfgs', multi_class='auto').fit(x_train_5, y_train)
stop = time.time()
y_pred_logreg5 = logreg_model5.predict(x_test_5)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(logreg_model5))+ "")

Training Time: 0.13351225852966309
Size: 64


In [None]:
# SVM
start = time.time()
svm_model5 = svm.SVC(random_state=11, kernel = 'rbf').fit(x_train_5, y_train)
stop = time.time()
y_pred_svm5 = svm_model5.predict(x_test_5)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(svm_model5))+ "")

Training Time: 0.009552955627441406
Size: 64


In [None]:
# MLP
start = time.time()
mlp_model5 = MLPClassifier(random_state=13, max_iter=1000).fit(x_train_5, y_train)
stop = time.time()
y_pred_mlp5 = mlp_model5.predict(x_test_5)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(mlp_model5))+ "")

Training Time: 0.4472980499267578
Size: 64


### Evaluation

In [None]:
# Logistic Regression
print(classification_report(y_test, y_pred_logreg5, digits=4))

              precision    recall  f1-score   support

           0     0.8571    0.6154    0.7164        39
           1     0.7917    0.9344    0.8571        61

    accuracy                         0.8100       100
   macro avg     0.8244    0.7749    0.7868       100
weighted avg     0.8172    0.8100    0.8023       100



In [None]:
# SVM
print(classification_report(y_test, y_pred_svm5, digits=4))

              precision    recall  f1-score   support

           0     0.8889    0.6154    0.7273        39
           1     0.7945    0.9508    0.8657        61

    accuracy                         0.8200       100
   macro avg     0.8417    0.7831    0.7965       100
weighted avg     0.8313    0.8200    0.8117       100



In [None]:
# MLP
print(classification_report(y_test, y_pred_mlp5, digits=4))

              precision    recall  f1-score   support

           0     0.7812    0.6410    0.7042        39
           1     0.7941    0.8852    0.8372        61

    accuracy                         0.7900       100
   macro avg     0.7877    0.7631    0.7707       100
weighted avg     0.7891    0.7900    0.7853       100



## Remove BLEU Scores

In [None]:
x_train_6 = x_train_dist
x_valid_6 = x_valid_dist
x_test_6 = x_test_dist

x_train_6 = np.append(x_train_6, x_train_tok, axis=1)
x_valid_6 = np.append(x_valid_6, x_valid_tok, axis=1)
x_test_6 = np.append(x_test_6, x_test_tok, axis=1)

x_train_6 = np.append(x_train_6, x_train_pos, axis=1)
x_valid_6 = np.append(x_valid_6, x_valid_pos, axis=1)
x_test_6 = np.append(x_test_6, x_test_pos, axis=1)

x_train_6 = np.append(x_train_6, x_train_neg, axis=1)
x_valid_6 = np.append(x_valid_6, x_valid_neg, axis=1)
x_test_6 = np.append(x_test_6, x_test_neg, axis=1)

len(x_train_6[0])

31

### Training & Testing

In [None]:
# Logistic Regression
start = time.time()
logreg_model6 = LogisticRegression(random_state=9, max_iter=1000, solver='lbfgs', multi_class='auto').fit(x_train_6, y_train)
stop = time.time()
y_pred_logreg6 = logreg_model6.predict(x_test_6)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(logreg_model6))+ "")

Training Time: 0.18917489051818848
Size: 64


In [None]:
# SVM
start = time.time()
svm_model6 = svm.SVC(random_state=11, kernel = 'rbf').fit(x_train_6, y_train)
stop = time.time()
y_pred_svm6 = svm_model6.predict(x_test_6)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(svm_model6))+ "")

Training Time: 0.009670257568359375
Size: 64


In [None]:
# MLP
start = time.time()
mlp_model6 = MLPClassifier(random_state=13, max_iter=1000).fit(x_train_6, y_train)
stop = time.time()
y_pred_mlp6 = mlp_model6.predict(x_test_6)

print(f"Training Time: {stop - start}")
print("Size: "+str(sys.getsizeof(mlp_model6))+ "")

Training Time: 0.5541517734527588
Size: 64


### Evaluation

In [None]:
# Logistic Regression
print(classification_report(y_test, y_pred_logreg6, digits=4))

              precision    recall  f1-score   support

           0     0.8276    0.6154    0.7059        39
           1     0.7887    0.9180    0.8485        61

    accuracy                         0.8000       100
   macro avg     0.8082    0.7667    0.7772       100
weighted avg     0.8039    0.8000    0.7929       100



In [None]:
# SVM
print(classification_report(y_test, y_pred_svm6, digits=4))

              precision    recall  f1-score   support

           0     0.8889    0.6154    0.7273        39
           1     0.7945    0.9508    0.8657        61

    accuracy                         0.8200       100
   macro avg     0.8417    0.7831    0.7965       100
weighted avg     0.8313    0.8200    0.8117       100



In [None]:
# MLP
print(classification_report(y_test, y_pred_mlp6, digits=4))

              precision    recall  f1-score   support

           0     0.7812    0.6410    0.7042        39
           1     0.7941    0.8852    0.8372        61

    accuracy                         0.7900       100
   macro avg     0.7877    0.7631    0.7707       100
weighted avg     0.7891    0.7900    0.7853       100

