In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np
import pandas as pd
from nltk.util import usage
from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [2]:
# Discover the cleaned datasets: every entry of ../Dataset/Clean whose name
# ends in ".csv" (case-insensitive), ordered alphabetically.
files_clean = os.listdir("../Dataset/Clean")
clean = sorted(name for name in files_clean if name.lower().endswith(".csv"))

# Same files as `clean`, but as relative paths. Both lists are sorted, and the
# shared path prefix does not affect ordering, so clean[k] names DATA_CLEAN[k].
DATA_CLEAN = sorted(
    "../Dataset/Clean/" + name
    for name in files_clean
    if name.lower().endswith(".csv")
)
len_clean = len(DATA_CLEAN)

### Generate Models

In [3]:
# Build the training corpus: one token list per tweet, pooled over every
# cleaned dataset. Rows containing any missing value are dropped first.
data = []
for i in DATA_CLEAN:
    df = pd.read_csv(i, delimiter=";", low_memory=False, header=0)
    df = df.dropna(axis=0)
    data.extend(tweet.split(" ") for tweet in df["teks"].tolist())

# Directory the trained models are written to; created up front so that
# `save()` does not fail on a fresh checkout.
MODELS_DIR = './featureExtraction/word2vec/models_all'
os.makedirs(MODELS_DIR, exist_ok=True)


def _train_and_save_cbow(sentences, method, hs, negative, vector_size):
    """Train one Word2Vec model, save it, and print the elapsed time.

    No `sg` argument is passed, so gensim's default training algorithm is
    used (the notebook names these models "CBOW").

    Parameters
    ----------
    sentences : list of list of str
        Tokenised corpus.
    method : str
        "hs" or "neg"; used in the saved file name and, upper-cased, in the
        timing message.
    hs, negative : int
        Forwarded to Word2Vec. This notebook uses hs=1/negative=0 for the
        "hs" models and hs=0/negative=5 for the "neg" models.
    vector_size : int
        Embedding dimensionality; also part of the file name.

    Returns
    -------
    The trained Word2Vec model.
    """
    print("================================================")
    start_time = time.time()
    model = Word2Vec(sentences, min_count=1, window=5, hs=hs, negative=negative, vector_size=vector_size)
    model.save('{}/model_cbow_{}_{}.model'.format(MODELS_DIR, method, vector_size))
    end_time = time.time()
    # The reported time covers both training and saving, as before.
    print("CBOW {} {} Time: {} seconds".format(method.upper(), vector_size, end_time - start_time))
    return model


# "hs" variants (hs=1, negative=0).
model_cbow_hs_100 = _train_and_save_cbow(data, "hs", hs=1, negative=0, vector_size=100)
model_cbow_hs_200 = _train_and_save_cbow(data, "hs", hs=1, negative=0, vector_size=200)
model_cbow_hs_300 = _train_and_save_cbow(data, "hs", hs=1, negative=0, vector_size=300)

# "neg" variants (hs=0, negative=5).
model_cbow_neg_100 = _train_and_save_cbow(data, "neg", hs=0, negative=5, vector_size=100)
model_cbow_neg_200 = _train_and_save_cbow(data, "neg", hs=0, negative=5, vector_size=200)
model_cbow_neg_300 = _train_and_save_cbow(data, "neg", hs=0, negative=5, vector_size=300)


CBOW HS 100 Time: 3.1479861736297607 seconds
CBOW HS 200 Time: 3.9886043071746826 seconds
CBOW HS 300 Time: 4.575922250747681 seconds
CBOW NEG 100 Time: 1.9059104919433594 seconds
CBOW NEG 200 Time: 2.3107385635375977 seconds
CBOW NEG 300 Time: 2.8063900470733643 seconds


### Extract Features

In [4]:
def word2vec_cbow(filepath, hs, neg, vector_size):
    """Turn every tweet of one dataset into the mean of its word vectors.

    Parameters
    ----------
    filepath : str
        Path to a ";"-delimited CSV with at least the columns "teks" (the
        tweet text) and "label".
    hs : int
        1 selects the saved "hs" model; any other value selects the "neg"
        model.
    neg : int
        Unused; kept so existing callers keep working.
    vector_size : int
        Dimensionality of the saved model to load (part of its file name) and
        length of the zero vector used when a tweet has no known token.

    Returns
    -------
    (numpy.ndarray, list)
        One mean vector per remaining row, and the matching "label" values.
        Rows containing any missing value are dropped before processing.
    """
    df = pd.read_csv(filepath, delimiter=";", low_memory=False, header=0)
    df = df.dropna(axis=0)
    model_path = './featureExtraction/word2vec/models_all/model_cbow_{}_{}.model'.format('hs' if hs == 1 else 'neg', vector_size)
    keyed_vectors = Word2Vec.load(model_path).wv

    word2vec_arr = []
    for row in df['teks'].tolist():
        # A text column that pandas parsed as numeric yields non-str values;
        # coerce instead of crashing on `.split`.
        tokens = str(row).split(" ")
        # Average only tokens the model knows, so a single out-of-vocabulary
        # word no longer raises KeyError for the whole dataset.
        known_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors.key_to_index]
        if known_vectors:
            row_mean_vector = np.mean(known_vectors, axis=0).tolist()
        else:
            # No usable token at all: fall back to an all-zero vector.
            row_mean_vector = [0.0] * vector_size
        word2vec_arr.append(row_mean_vector)

    return np.array(word2vec_arr), df['label'].tolist()

### Runner with Classification

In [5]:
warnings.filterwarnings("ignore")

In [6]:
# Evaluate the "hs" model with 100-dimensional vectors on every dataset using
# 20-fold cross-validated logistic regression.
fail = 0

f1_list = []
acc_list = []

# DATA_CLEAN (paths) and clean (file names) are parallel, identically sorted lists.
for count, (dataset_path, dataset_name) in enumerate(zip(DATA_CLEAN, clean), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_cbow(dataset_path, 1, 0, 100)
        fe_time = time.time()

        # prepare classifier and the cross-validation procedure
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # The value reported as "Accuracy" is the micro-averaged F1 score,
        # which coincides with accuracy for single-label classification.
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.95175 (0.01287)
F1-Score: 0.9515607660337665 (0.01290)
FE Time: 0.28610682487487793 seconds
CL time: 2.755732536315918 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8100000000000002 (0.08207)
F1-Score: 0.7961341898684593 (0.08560)
FE Time: 0.10927081108093262 seconds
CL time: 1.2285830974578857 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7602631578947369 (0.09272)
F1-Score: 0.7518958845243338 (0.09863)
FE Time: 0.0946199893951416 seconds
CL time: 0.0690608024597168 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7725000000000002 (0.11233)
F1-Score: 0.7620470252104724 (0.11619)
FE Time: 0.08203506469726562 seconds
CL time: 0.07795023918151855 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6885714285714285 (0.13436)
F1-Score: 0.6642032988884853 

In [7]:
# Evaluate the "hs" model with 200-dimensional vectors on every dataset using
# 20-fold cross-validated logistic regression.
fail = 0

f1_list = []
acc_list = []

# DATA_CLEAN (paths) and clean (file names) are parallel, identically sorted lists.
for count, (dataset_path, dataset_name) in enumerate(zip(DATA_CLEAN, clean), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_cbow(dataset_path, 1, 0, 200)
        fe_time = time.time()

        # prepare classifier and the cross-validation procedure
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # The value reported as "Accuracy" is the micro-averaged F1 score,
        # which coincides with accuracy for single-label classification.
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9525000000000002 (0.01124)
F1-Score: 0.9523138046833246 (0.01129)
FE Time: 0.2994675636291504 seconds
CL time: 0.894463062286377 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7959166666666666 (0.08170)
F1-Score: 0.7812430280222971 (0.08513)
FE Time: 0.11083674430847168 seconds
CL time: 0.11989164352416992 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7577631578947368 (0.09047)
F1-Score: 0.7476697739114955 (0.09780)
FE Time: 0.10005331039428711 seconds
CL time: 0.08875632286071777 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.76 (0.10198)
F1-Score: 0.7504222221622374 (0.10418)
FE Time: 0.08889436721801758 seconds
CL time: 0.09699749946594238 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7085714285714285 (0.11046)
F1-Score: 0.6881055227881853 (

In [8]:
# Evaluate the "hs" model with 300-dimensional vectors on every dataset using
# 20-fold cross-validated logistic regression.
fail = 0

f1_list = []
acc_list = []

# DATA_CLEAN (paths) and clean (file names) are parallel, identically sorted lists.
for count, (dataset_path, dataset_name) in enumerate(zip(DATA_CLEAN, clean), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_cbow(dataset_path, 1, 0, 300)
        fe_time = time.time()

        # prepare classifier and the cross-validation procedure
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # The value reported as "Accuracy" is the micro-averaged F1 score,
        # which coincides with accuracy for single-label classification.
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.95275 (0.01101)
F1-Score: 0.9525735618085 (0.01104)
FE Time: 0.411468505859375 seconds
CL time: 1.148784875869751 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8061666666666666 (0.07123)
F1-Score: 0.7918053370215304 (0.07307)
FE Time: 0.11201143264770508 seconds
CL time: 0.3020644187927246 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7631578947368421 (0.09332)
F1-Score: 0.7535294140277146 (0.09793)
FE Time: 0.10788512229919434 seconds
CL time: 0.10289549827575684 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.75 (0.11937)
F1-Score: 0.7420028740908066 (0.12184)
FE Time: 0.0986793041229248 seconds
CL time: 0.11237502098083496 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7192857142857142 (0.10873)
F1-Score: 0.7001754583939266 (0.11213)
FE Time:

In [9]:
# Evaluate the "neg" model with 100-dimensional vectors on every dataset using
# 20-fold cross-validated logistic regression.
fail = 0

f1_list = []
acc_list = []

# DATA_CLEAN (paths) and clean (file names) are parallel, identically sorted lists.
for count, (dataset_path, dataset_name) in enumerate(zip(DATA_CLEAN, clean), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_cbow(dataset_path, 0, 5, 100)
        fe_time = time.time()

        # prepare classifier and the cross-validation procedure
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # The value reported as "Accuracy" is the micro-averaged F1 score,
        # which coincides with accuracy for single-label classification.
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9292499999999999 (0.01599)
F1-Score: 0.928913066632882 (0.01607)
FE Time: 0.4471473693847656 seconds
CL time: 0.6416616439819336 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7542500000000001 (0.09006)
F1-Score: 0.7350704431225971 (0.09383)
FE Time: 0.2605452537536621 seconds
CL time: 0.10178589820861816 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6927631578947369 (0.07377)
F1-Score: 0.6816503489138601 (0.08212)
FE Time: 0.2743077278137207 seconds
CL time: 0.09070801734924316 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6675 (0.10639)
F1-Score: 0.6509799188636852 (0.11165)
FE Time: 0.26396846771240234 seconds
CL time: 0.07982516288757324 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6326190476190476 (0.10288)
F1-Score: 0.6014526252938219 (

In [10]:
# Evaluate the "neg" model with 200-dimensional vectors on every dataset using
# 20-fold cross-validated logistic regression.
fail = 0

f1_list = []
acc_list = []

# DATA_CLEAN (paths) and clean (file names) are parallel, identically sorted lists.
for count, (dataset_path, dataset_name) in enumerate(zip(DATA_CLEAN, clean), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_cbow(dataset_path, 0, 5, 200)
        fe_time = time.time()

        # prepare classifier and the cross-validation procedure
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # The value reported as "Accuracy" is the micro-averaged F1 score,
        # which coincides with accuracy for single-label classification.
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9237500000000001 (0.01386)
F1-Score: 0.9234001656659571 (0.01405)
FE Time: 0.44563865661621094 seconds
CL time: 1.1734874248504639 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7540833333333332 (0.09461)
F1-Score: 0.7337304506478899 (0.09924)
FE Time: 0.2818734645843506 seconds
CL time: 0.13331818580627441 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6851315789473684 (0.09337)
F1-Score: 0.67408558808192 (0.10368)
FE Time: 0.27172231674194336 seconds
CL time: 0.08616757392883301 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6525 (0.13274)
F1-Score: 0.6437550858914098 (0.13570)
FE Time: 0.2819840908050537 seconds
CL time: 0.09499907493591309 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.619047619047619 (0.12069)
F1-Score: 0.5840891711095473 (0

In [11]:
# Evaluate the "neg" model with 300-dimensional vectors on every dataset using
# 20-fold cross-validated logistic regression.
fail = 0

f1_list = []
acc_list = []

# DATA_CLEAN (paths) and clean (file names) are parallel, identically sorted lists.
for count, (dataset_path, dataset_name) in enumerate(zip(DATA_CLEAN, clean), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_cbow(dataset_path, 0, 5, 300)
        fe_time = time.time()

        # prepare classifier and the cross-validation procedure
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # The value reported as "Accuracy" is the micro-averaged F1 score,
        # which coincides with accuracy for single-label classification.
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9175000000000001 (0.01101)
F1-Score: 0.9171456040818095 (0.01116)
FE Time: 0.561453104019165 seconds
CL time: 1.6302902698516846 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7464999999999999 (0.08612)
F1-Score: 0.7215084160422456 (0.09102)
FE Time: 0.3016171455383301 seconds
CL time: 0.306288480758667 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6772368421052632 (0.10328)
F1-Score: 0.6646863311603518 (0.11501)
FE Time: 0.2871730327606201 seconds
CL time: 0.09895682334899902 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6500000000000001 (0.11832)
F1-Score: 0.6388929530857113 (0.12197)
FE Time: 0.2801370620727539 seconds
CL time: 0.10956501960754395 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6321428571428571 (0.11700)
F1-Score: 0.599629225