In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np
import pandas as pd
from nltk.util import usage
from gensim.models import Word2Vec

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [2]:
# Discover the cleaned datasets: every CSV file (case-insensitive extension)
# found directly inside ../Dataset/Clean.
files_clean = os.listdir("../Dataset/Clean")

# Bare file names in sorted order; used for progress messages.
clean = sorted(name for name in files_clean if name.lower().endswith(".csv"))

# Relative paths to the same files. Every path shares one prefix, so building
# them from the sorted names yields the same order as sorting the paths.
DATA_CLEAN = ["../Dataset/Clean/" + name for name in clean]
len_clean = len(DATA_CLEAN)

### Generate Word2Vec Skip-Gram Models

In [3]:
# Build the training corpus: one token list per tweet, pooled over every
# cleaned dataset. Rows with any missing value are dropped before tokenising.
data = []
for i in DATA_CLEAN:
    df = pd.read_csv(i, delimiter=";", low_memory=False, header=0)
    df = df.dropna(axis=0)
    for tweet in df["teks"].tolist():
        # Tweets are tokenised by splitting on single spaces.
        data.append(tweet.split(" "))


def _train_sg_model(corpus, tag, vector_size):
    """Train one skip-gram Word2Vec model, save it to disk and print the elapsed time.

    Parameters
    ----------
    corpus : list of list of str
        Tokenised sentences to train on.
    tag : str
        'hs' trains with hierarchical softmax (hs=1, negative=0);
        'neg' trains with negative sampling (hs=0, negative=5).
    vector_size : int
        Dimensionality of the word vectors.

    Returns
    -------
    The trained ``Word2Vec`` model.

    Raises
    ------
    ValueError
        If ``tag`` is neither 'hs' nor 'neg'.
    """
    if tag not in ('hs', 'neg'):
        raise ValueError("tag must be 'hs' or 'neg', got {!r}".format(tag))
    use_hs = tag == 'hs'

    model_path = './featureExtraction/word2vec/models_all/model_sg_{}_{}.model'.format(tag, vector_size)
    # save() does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    print("================================================")
    # The reported time covers both training and saving.
    start_time = time.time()
    model = Word2Vec(
        corpus,
        min_count=1,  # keep every token so any word from the corpus can be looked up later
        sg=1,         # skip-gram
        window=5,
        hs=1 if use_hs else 0,
        negative=0 if use_hs else 5,
        vector_size=vector_size,
    )
    model.save(model_path)
    end_time = time.time()
    print("SG {} {} Time: {} seconds".format(tag.upper(), vector_size, end_time - start_time))
    return model


# Same six models, in the same order, as before: hierarchical softmax first,
# then negative sampling, each at 100 / 200 / 300 dimensions.
model_sg_hs_100 = _train_sg_model(data, 'hs', 100)
model_sg_hs_200 = _train_sg_model(data, 'hs', 200)
model_sg_hs_300 = _train_sg_model(data, 'hs', 300)
model_sg_neg_100 = _train_sg_model(data, 'neg', 100)
model_sg_neg_200 = _train_sg_model(data, 'neg', 200)
model_sg_neg_300 = _train_sg_model(data, 'neg', 300)

SG HS 100 Time: 7.1879799365997314 seconds
SG HS 200 Time: 9.304147243499756 seconds
SG HS 300 Time: 11.845147132873535 seconds
SG NEG 100 Time: 4.624571084976196 seconds
SG NEG 200 Time: 6.0047478675842285 seconds
SG NEG 300 Time: 7.134796142578125 seconds


### Extract Features

In [4]:
def word2vec_sg(filepath, hs, neg, vector_size):
    """Turn every tweet of a dataset into the mean of its skip-gram word vectors.

    Parameters
    ----------
    filepath : str
        Path to a ';'-delimited CSV with a header row and the columns
        'teks' (tweet text) and 'label'.
    hs : int
        1 selects the hierarchical-softmax model file ('model_sg_hs_*');
        any other value selects the negative-sampling one ('model_sg_neg_*').
    neg : int
        Not used by this function; the model file is chosen from ``hs`` and
        ``vector_size`` only. Kept so existing call sites keep working.
    vector_size : int
        Dimensionality of the saved model to load; also the length of the
        zero vector used for tweets without any known token.

    Returns
    -------
    tuple (numpy.ndarray, list)
        One averaged vector per remaining row, and the matching values of
        the 'label' column.
    """
    df = pd.read_csv(filepath, delimiter=";", low_memory=False, header=0)
    # Drop rows with any missing value so 'teks' and 'label' stay aligned.
    df = df.dropna(axis=0)

    model_path = './featureExtraction/word2vec/models_all/model_sg_{}_{}.model'.format(
        'hs' if hs == 1 else 'neg', vector_size
    )
    model_sg = Word2Vec.load(model_path)
    word_vectors = model_sg.wv

    word2vec_arr = []
    for row in df['teks'].tolist():
        # Skip tokens the model has never seen instead of letting a single
        # out-of-vocabulary word raise KeyError for the whole dataset.
        known_vectors = [word_vectors[token] for token in row.split(" ") if token in word_vectors]
        if known_vectors:
            row_mean_vector = np.mean(known_vectors, axis=0).tolist()
        else:
            # No usable token: represent the tweet by an all-zero vector.
            row_mean_vector = [0.0] * vector_size
        word2vec_arr.append(row_mean_vector)

    return np.array(word2vec_arr), df['label'].tolist()

### Run Feature Extraction and Classification

In [5]:
warnings.filterwarnings("ignore")

In [6]:
# Configuration of this run: skip-gram model trained with hierarchical softmax,
# 100-dimensional vectors. Passed straight to word2vec_sg(filepath, hs, neg, vector_size).
HS, NEG, VECTOR_SIZE = 1, 0, 100

fail = 0      # number of datasets that raised during extraction or evaluation
f1_list = []  # per-dataset mean macro-F1
acc_list = [] # per-dataset mean micro-F1 (reported as accuracy)

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same files and
# sorted the same way, so zipping them pairs each name with its path.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(i, HS, NEG, VECTOR_SIZE)
        fe_time = time.time()

        # prepare classifier and a seeded, shuffled 20-fold split
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # run the cross-validation procedure once per metric on the same folds
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded (or there were no datasets): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9470000000000001 (0.01427)
F1-Score: 0.9468164759370626 (0.01429)
FE Time: 0.275501012802124 seconds
CL time: 2.862062454223633 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8324999999999999 (0.06205)
F1-Score: 0.8211920429626277 (0.06193)
FE Time: 0.11043310165405273 seconds
CL time: 1.067925214767456 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8688157894736843 (0.07199)
F1-Score: 0.8650009611393378 (0.07336)
FE Time: 0.11413455009460449 seconds
CL time: 0.06545209884643555 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8225000000000001 (0.07980)
F1-Score: 0.815030810999777 (0.08248)
FE Time: 0.08679986000061035 seconds
CL time: 0.06592416763305664 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7466666666666667 (0.10974)
F1-Score: 0.73272894

In [7]:
# Configuration of this run: skip-gram model trained with hierarchical softmax,
# 200-dimensional vectors. Passed straight to word2vec_sg(filepath, hs, neg, vector_size).
HS, NEG, VECTOR_SIZE = 1, 0, 200

fail = 0      # number of datasets that raised during extraction or evaluation
f1_list = []  # per-dataset mean macro-F1
acc_list = [] # per-dataset mean micro-F1 (reported as accuracy)

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same files and
# sorted the same way, so zipping them pairs each name with its path.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(i, HS, NEG, VECTOR_SIZE)
        fe_time = time.time()

        # prepare classifier and a seeded, shuffled 20-fold split
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # run the cross-validation procedure once per metric on the same folds
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded (or there were no datasets): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9537500000000001 (0.01627)
F1-Score: 0.9535808380860203 (0.01631)
FE Time: 0.2944939136505127 seconds
CL time: 0.6493427753448486 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8304166666666667 (0.06867)
F1-Score: 0.8181303905460011 (0.06992)
FE Time: 0.10013055801391602 seconds
CL time: 0.11345410346984863 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8613157894736844 (0.08116)
F1-Score: 0.8572438971342592 (0.08369)
FE Time: 0.09610652923583984 seconds
CL time: 0.08610939979553223 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8400000000000001 (0.06819)
F1-Score: 0.833985521937568 (0.06974)
FE Time: 0.09188127517700195 seconds
CL time: 0.08523058891296387 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.78 (0.11633)
F1-Score: 0.7643877517335418 (

In [8]:
# Configuration of this run: skip-gram model trained with hierarchical softmax,
# 300-dimensional vectors. Passed straight to word2vec_sg(filepath, hs, neg, vector_size).
HS, NEG, VECTOR_SIZE = 1, 0, 300

fail = 0      # number of datasets that raised during extraction or evaluation
f1_list = []  # per-dataset mean macro-F1
acc_list = [] # per-dataset mean micro-F1 (reported as accuracy)

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same files and
# sorted the same way, so zipping them pairs each name with its path.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(i, HS, NEG, VECTOR_SIZE)
        fe_time = time.time()

        # prepare classifier and a seeded, shuffled 20-fold split
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # run the cross-validation procedure once per metric on the same folds
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded (or there were no datasets): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.95325 (0.01645)
F1-Score: 0.9530704223359416 (0.01649)
FE Time: 0.4079017639160156 seconds
CL time: 0.8163094520568848 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8325833333333336 (0.07756)
F1-Score: 0.8216412491449396 (0.07888)
FE Time: 0.15973329544067383 seconds
CL time: 0.2758920192718506 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8636842105263158 (0.07736)
F1-Score: 0.8586123626582806 (0.08127)
FE Time: 0.1270442008972168 seconds
CL time: 0.1002500057220459 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8400000000000001 (0.07842)
F1-Score: 0.8328536537116932 (0.08244)
FE Time: 0.12235903739929199 seconds
CL time: 0.09226131439208984 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7704761904761905 (0.09563)
F1-Score: 0.7615486135699152 

In [9]:
# Configuration of this run: skip-gram model trained with negative sampling,
# 100-dimensional vectors. Passed straight to word2vec_sg(filepath, hs, neg, vector_size).
HS, NEG, VECTOR_SIZE = 0, 5, 100

fail = 0      # number of datasets that raised during extraction or evaluation
f1_list = []  # per-dataset mean macro-F1
acc_list = [] # per-dataset mean micro-F1 (reported as accuracy)

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same files and
# sorted the same way, so zipping them pairs each name with its path.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(i, HS, NEG, VECTOR_SIZE)
        fe_time = time.time()

        # prepare classifier and a seeded, shuffled 20-fold split
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # run the cross-validation procedure once per metric on the same folds
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded (or there were no datasets): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9459999999999997 (0.01393)
F1-Score: 0.9457542803068714 (0.01396)
FE Time: 0.43321967124938965 seconds
CL time: 0.45664143562316895 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8003333333333333 (0.08215)
F1-Score: 0.7860541191764125 (0.08231)
FE Time: 0.27982354164123535 seconds
CL time: 0.08150863647460938 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7934210526315789 (0.09874)
F1-Score: 0.7874932892334628 (0.10217)
FE Time: 0.27117323875427246 seconds
CL time: 0.07330608367919922 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7875 (0.08348)
F1-Score: 0.7750821053908379 (0.08854)
FE Time: 0.26992249488830566 seconds
CL time: 0.08134698867797852 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7292857142857143 (0.07543)
F1-Score: 0.7130816229693

In [10]:
# Configuration of this run: skip-gram model trained with negative sampling,
# 200-dimensional vectors. Passed straight to word2vec_sg(filepath, hs, neg, vector_size).
HS, NEG, VECTOR_SIZE = 0, 5, 200

fail = 0      # number of datasets that raised during extraction or evaluation
f1_list = []  # per-dataset mean macro-F1
acc_list = [] # per-dataset mean micro-F1 (reported as accuracy)

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same files and
# sorted the same way, so zipping them pairs each name with its path.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(i, HS, NEG, VECTOR_SIZE)
        fe_time = time.time()

        # prepare classifier and a seeded, shuffled 20-fold split
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # run the cross-validation procedure once per metric on the same folds
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded (or there were no datasets): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9404999999999999 (0.01739)
F1-Score: 0.940222251148561 (0.01740)
FE Time: 0.5169341564178467 seconds
CL time: 0.7680583000183105 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.77425 (0.07094)
F1-Score: 0.758925873473177 (0.06966)
FE Time: 0.27430224418640137 seconds
CL time: 0.12514281272888184 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8032894736842107 (0.09154)
F1-Score: 0.7962361351210723 (0.09425)
FE Time: 0.269428014755249 seconds
CL time: 0.08252573013305664 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.79 (0.07681)
F1-Score: 0.7789612900450933 (0.08183)
FE Time: 0.26481151580810547 seconds
CL time: 0.09375357627868652 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7397619047619048 (0.08560)
F1-Score: 0.7218635442047072 (0.08634)
FE Ti

In [11]:
# Configuration of this run: skip-gram model trained with negative sampling,
# 300-dimensional vectors. Passed straight to word2vec_sg(filepath, hs, neg, vector_size).
HS, NEG, VECTOR_SIZE = 0, 5, 300

fail = 0      # number of datasets that raised during extraction or evaluation
f1_list = []  # per-dataset mean macro-F1
acc_list = [] # per-dataset mean micro-F1 (reported as accuracy)

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same files and
# sorted the same way, so zipping them pairs each name with its path.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(i, HS, NEG, VECTOR_SIZE)
        fe_time = time.time()

        # prepare classifier and a seeded, shuffled 20-fold split
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # run the cross-validation procedure once per metric on the same folds
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded (or there were no datasets): avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9437499999999999 (0.01377)
F1-Score: 0.9434984321975912 (0.01378)
FE Time: 0.541517972946167 seconds
CL time: 1.0180449485778809 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7902499999999999 (0.06843)
F1-Score: 0.7764324916667346 (0.06624)
FE Time: 0.3027646541595459 seconds
CL time: 0.30219030380249023 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7960526315789473 (0.10435)
F1-Score: 0.7891600856262487 (0.10954)
FE Time: 0.30448055267333984 seconds
CL time: 0.10854506492614746 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.775 (0.07500)
F1-Score: 0.7631708544732267 (0.07882)
FE Time: 0.34166717529296875 seconds
CL time: 0.11249351501464844 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7223809523809525 (0.10075)
F1-Score: 0.7040764738189524 (