In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np

from featureExtraction.combination.ortho_w2v import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Directory that holds the cleaned datasets, relative to this notebook.
CLEAN_DIR = "../Dataset/Clean"

# Bare file names of every CSV in CLEAN_DIR (extension matched
# case-insensitively), sorted so the datasets are processed in a stable order.
# The directory is listed exactly once so `clean` and `DATA_CLEAN` can never
# disagree about which files exist.
clean = sorted(x for x in os.listdir(CLEAN_DIR) if x.lower().endswith(".csv"))

# The same files with the directory prefix attached. Every entry shares the
# same prefix, so the order of `clean` is already the sorted order here.
DATA_CLEAN = [CLEAN_DIR + "/" + x for x in clean]
len_clean = len(DATA_CLEAN)

In [3]:
# Silence every warning for the rest of the session so the per-dataset logs
# below stay readable.
# NOTE(review): this is a blanket filter -- it would also hide any warnings
# raised by the feature extractor or the classifiers (if they emit any);
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [4]:
# Evaluate logistic regression on every dataset listed in `clean`, printing
# per-dataset scores and timings followed by an overall summary.
# NOTE(review): this loop is repeated for each classifier in the notebook with
# only the classifier call changed; a shared helper would remove the duplication.
count = 1   # 1-based position of the dataset currently being processed
fail = 0    # number of datasets that raised during extraction/classification

f1_list = []    # mean F1 of each dataset that completed
acc_list = []   # mean accuracy of each dataset that completed

for i in clean:
    try:
        print("Running {}/{} ({})".format(count, len_clean, i))
        start_time = time.time()

        # Feature Extraction
        # The meaning of the positional arguments (1, 0, 300) is defined by
        # ortho_w2v and is not visible from this notebook.
        result_fe, label, features = ortho_w2v(i, 1, 0, 300)
        fe_time = time.time()

        # classification
        # Presumably returns one accuracy and one F1 value per CV fold -- confirm
        # against logistic_regression.
        acc_score, f1_score = logistic_regression(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")
    # Advance the counter once per dataset, whether it succeeded or failed.
    count += 1

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (no datasets, or every one failed): averaging would
    # divide by zero, so say so explicitly instead of crashing the cell.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9567500000000001 (0.01846)
F1-Score: 0.9565849754875636 (0.01851)
FE Time: 0.4778780937194824 seconds
CL time: 7.522372245788574 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8250833333333334 (0.07773)
F1-Score: 0.8116000343956131 (0.08062)
FE Time: 0.1755211353302002 seconds
CL time: 2.462893009185791 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8535526315789476 (0.08128)
F1-Score: 0.848378419732404 (0.08421)
FE Time: 0.16297554969787598 seconds
CL time: 0.419811487197876 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8125 (0.07726)
F1-Score: 0.8034824606965303 (0.07896)
FE Time: 0.14565587043762207 seconds
CL time: 0.5793576240539551 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7602380952380953 (0.11534)
F1-Score: 0.7517493405605029 (0.111

### Decision Tree

In [5]:
# Evaluate the decision tree classifier on every dataset listed in `clean`,
# printing per-dataset scores and timings followed by an overall summary.
# NOTE(review): this loop is repeated for each classifier in the notebook with
# only the classifier call changed; a shared helper would remove the duplication.
count = 1   # 1-based position of the dataset currently being processed
fail = 0    # number of datasets that raised during extraction/classification

f1_list = []    # mean F1 of each dataset that completed
acc_list = []   # mean accuracy of each dataset that completed

for i in clean:
    try:
        print("Running {}/{} ({})".format(count, len_clean, i))
        start_time = time.time()

        # Feature Extraction
        # The meaning of the positional arguments (1, 0, 300) is defined by
        # ortho_w2v and is not visible from this notebook.
        result_fe, label, features = ortho_w2v(i, 1, 0, 300)
        fe_time = time.time()

        # classification
        # Presumably returns one accuracy and one F1 value per CV fold -- confirm
        # against decision_tree.
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")
    # Advance the counter once per dataset, whether it succeeded or failed.
    count += 1

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (no datasets, or every one failed): averaging would
    # divide by zero, so say so explicitly instead of crashing the cell.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.90025 (0.02112)
F1-Score: 0.8997335199795641 (0.02128)
FE Time: 0.5350332260131836 seconds
CL time: 5.234397649765015 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6995833333333332 (0.08914)
F1-Score: 0.6832780916224955 (0.09755)
FE Time: 0.15780186653137207 seconds
CL time: 0.6005837917327881 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7276315789473684 (0.10656)
F1-Score: 0.7165421600853692 (0.10759)
FE Time: 0.16474270820617676 seconds
CL time: 0.29512929916381836 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7325000000000002 (0.09121)
F1-Score: 0.7166177887788352 (0.09423)
FE Time: 0.20431780815124512 seconds
CL time: 0.25359249114990234 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6247619047619046 (0.13143)
F1-Score: 0.6121455836419156

### KNN

In [6]:
# Evaluate the KNN classifier on every dataset listed in `clean`, printing
# per-dataset scores and timings followed by an overall summary.
# NOTE(review): this loop is repeated for each classifier in the notebook with
# only the classifier call changed; a shared helper would remove the duplication.
count = 1   # 1-based position of the dataset currently being processed
fail = 0    # number of datasets that raised during extraction/classification

f1_list = []    # mean F1 of each dataset that completed
acc_list = []   # mean accuracy of each dataset that completed

for i in clean:
    try:
        print("Running {}/{} ({})".format(count, len_clean, i))
        start_time = time.time()

        # Feature Extraction
        # The meaning of the positional arguments (1, 0, 300) is defined by
        # ortho_w2v and is not visible from this notebook.
        result_fe, label, features = ortho_w2v(i, 1, 0, 300)
        fe_time = time.time()

        # classification
        # Presumably returns one accuracy and one F1 value per CV fold -- confirm
        # against knn.
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")
    # Advance the counter once per dataset, whether it succeeded or failed.
    count += 1

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (no datasets, or every one failed): averaging would
    # divide by zero, so say so explicitly instead of crashing the cell.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.84275 (0.02634)
F1-Score: 0.8411499806755891 (0.02679)
FE Time: 0.8856678009033203 seconds
CL time: 0.773266077041626 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5928333333333334 (0.10376)
F1-Score: 0.5591758841471955 (0.10950)
FE Time: 0.23598909378051758 seconds
CL time: 0.5212898254394531 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5668421052631578 (0.12872)
F1-Score: 0.5593437350029759 (0.13317)
FE Time: 0.2759990692138672 seconds
CL time: 0.36803269386291504 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5974999999999999 (0.10059)
F1-Score: 0.5835977004124635 (0.09936)
FE Time: 0.18567872047424316 seconds
CL time: 0.3870112895965576 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.4788095238095239 (0.13192)
F1-Score: 0.4558934166538656 (

### Naive Bayes

In [7]:
# Evaluate the naive Bayes classifier on every dataset listed in `clean`,
# printing per-dataset scores and timings followed by an overall summary.
# NOTE(review): this loop is repeated for each classifier in the notebook with
# only the classifier call changed; a shared helper would remove the duplication.
count = 1   # 1-based position of the dataset currently being processed
fail = 0    # number of datasets that raised during extraction/classification

f1_list = []    # mean F1 of each dataset that completed
acc_list = []   # mean accuracy of each dataset that completed

for i in clean:
    try:
        print("Running {}/{} ({})".format(count, len_clean, i))
        start_time = time.time()

        # Feature Extraction
        # The meaning of the positional arguments (1, 0, 300) is defined by
        # ortho_w2v and is not visible from this notebook.
        result_fe, label, features = ortho_w2v(i, 1, 0, 300)
        fe_time = time.time()

        # classification
        # Presumably returns one accuracy and one F1 value per CV fold -- confirm
        # against naiveBayes.
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")
    # Advance the counter once per dataset, whether it succeeded or failed.
    count += 1

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (no datasets, or every one failed): averaging would
    # divide by zero, so say so explicitly instead of crashing the cell.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.8755000000000001 (0.02872)
F1-Score: 0.8741729413680716 (0.02906)
FE Time: 0.5017485618591309 seconds
CL time: 0.5964875221252441 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7804999999999999 (0.06951)
F1-Score: 0.7721354053222582 (0.07063)
FE Time: 0.37186479568481445 seconds
CL time: 0.35067105293273926 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7803947368421053 (0.12236)
F1-Score: 0.7724877567915998 (0.13192)
FE Time: 0.16764521598815918 seconds
CL time: 0.13293051719665527 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7150000000000001 (0.09887)
F1-Score: 0.7019970856200816 (0.09668)
FE Time: 0.3697934150695801 seconds
CL time: 0.22814273834228516 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.709047619047619 (0.12327)
F1-Score: 0.69521

### Random Forest

In [8]:
# Evaluate the random forest classifier on every dataset listed in `clean`,
# printing per-dataset scores and timings followed by an overall summary.
# NOTE(review): this loop is repeated for each classifier in the notebook with
# only the classifier call changed; a shared helper would remove the duplication.
count = 1   # 1-based position of the dataset currently being processed
fail = 0    # number of datasets that raised during extraction/classification

f1_list = []    # mean F1 of each dataset that completed
acc_list = []   # mean accuracy of each dataset that completed

for i in clean:
    try:
        print("Running {}/{} ({})".format(count, len_clean, i))
        start_time = time.time()

        # Feature Extraction
        # The meaning of the positional arguments (1, 0, 300) is defined by
        # ortho_w2v and is not visible from this notebook.
        result_fe, label, features = ortho_w2v(i, 1, 0, 300)
        fe_time = time.time()

        # classification
        # Presumably returns one accuracy and one F1 value per CV fold -- confirm
        # against random_forest.
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")
    # Advance the counter once per dataset, whether it succeeded or failed.
    count += 1

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (no datasets, or every one failed): averaging would
    # divide by zero, so say so explicitly instead of crashing the cell.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9487499999999999 (0.01949)
F1-Score: 0.948552480951237 (0.01955)
FE Time: 0.5010676383972168 seconds
CL time: 6.872772455215454 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8166666666666667 (0.06563)
F1-Score: 0.8043794751938931 (0.06770)
FE Time: 0.1572551727294922 seconds
CL time: 1.716315507888794 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8561842105263159 (0.08018)
F1-Score: 0.8515761660921001 (0.08309)
FE Time: 0.1621849536895752 seconds
CL time: 1.4385454654693604 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8125 (0.06495)
F1-Score: 0.8027143228129665 (0.06873)
FE Time: 0.33702945709228516 seconds
CL time: 1.4739351272583008 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7840476190476191 (0.08343)
F1-Score: 0.7718614396383443 (0.078

### SVM

In [9]:
# Evaluate the SVM classifier on every dataset listed in `clean`, printing
# per-dataset scores and timings followed by an overall summary.
# NOTE(review): this loop is repeated for each classifier in the notebook with
# only the classifier call changed; a shared helper would remove the duplication.
count = 1   # 1-based position of the dataset currently being processed
fail = 0    # number of datasets that raised during extraction/classification

f1_list = []    # mean F1 of each dataset that completed
acc_list = []   # mean accuracy of each dataset that completed

for i in clean:
    try:
        print("Running {}/{} ({})".format(count, len_clean, i))
        start_time = time.time()

        # Feature Extraction
        # The meaning of the positional arguments (1, 0, 300) is defined by
        # ortho_w2v and is not visible from this notebook.
        result_fe, label, features = ortho_w2v(i, 1, 0, 300)
        fe_time = time.time()

        # classification
        # Presumably returns one accuracy and one F1 value per CV fold -- confirm
        # against svm.
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")
    # Advance the counter once per dataset, whether it succeeded or failed.
    count += 1

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (no datasets, or every one failed): averaging would
    # divide by zero, so say so explicitly instead of crashing the cell.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9637499999999999 (0.01823)
F1-Score: 0.9636035983357125 (0.01832)
FE Time: 0.5144708156585693 seconds
CL time: 12.57438349723816 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8510833333333332 (0.06680)
F1-Score: 0.8403121598919763 (0.07057)
FE Time: 0.20098042488098145 seconds
CL time: 8.551020622253418 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8588157894736843 (0.08224)
F1-Score: 0.8534940990363491 (0.08586)
FE Time: 0.18314647674560547 seconds
CL time: 12.494640827178955 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.835 (0.07089)
F1-Score: 0.8259857865423991 (0.07551)
FE Time: 0.208970308303833 seconds
CL time: 1.4478373527526855 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7907142857142857 (0.12326)
F1-Score: 0.7768788191027052 (0.123