In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np

from featureExtraction.combination.ortho_lexicon import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, cross_validate

# NOTE: the wildcard imports below are kept in their original order because a later
# `import *` can rebind names introduced by an earlier one.
from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Directories that hold the per-dataset CSV files (relative to the notebook location).
CLEAN_DIR = "../Dataset/Clean"
RAW_DIR = "../Dataset/Raw"


def _list_csv(directory):
    """Return the sorted names of the files in `directory` whose name ends with ".csv".

    The extension check is case-insensitive. Raises whatever `os.listdir` raises
    (e.g. FileNotFoundError) when `directory` cannot be read.
    """
    return sorted(x for x in os.listdir(directory) if x.lower().endswith(".csv"))


# Each directory is scanned exactly once and the path list is derived from the name
# list, so `clean[i]` and `DATA_CLEAN[i]` always refer to the same file. Sorting the
# names first yields the same order as sorting the prefixed paths, because every path
# shares the same prefix.
clean = _list_csv(CLEAN_DIR)
DATA_CLEAN = [CLEAN_DIR + "/" + x for x in clean]
len_clean = len(DATA_CLEAN)

templated = _list_csv(RAW_DIR)
DATA_TEMPLATED = [RAW_DIR + "/" + x for x in templated]
len_templated = len(DATA_TEMPLATED)

# Size of the larger of the two listings. It can exceed len(clean) or len(templated),
# so it must not be used as an index bound for either list on its own.
range_len = max(len_clean, len_templated)

In [3]:
# Install a global "ignore" filter: no category, message or module restriction is
# given, so every Python warning raised after this cell is suppressed.
# NOTE(review): this also hides convergence and deprecation warnings — confirm that
# silencing them is intended rather than a convenience for cleaner output.
warnings.filterwarnings("ignore")

### Logistic Regression

In [4]:
# Evaluate logistic regression on every cleaned dataset with 20-fold cross-validation
# and report per-dataset as well as overall accuracy / macro-F1.
fail = 0       # number of datasets whose evaluation raised an exception

f1_list = []   # mean macro-F1 of each successfully evaluated dataset
acc_list = []  # mean accuracy of each successfully evaluated dataset

# Iterate over the file names themselves instead of range(range_len): range_len is the
# larger of the Clean/Raw listing sizes, so indexing `clean` with it could run past the
# end of the list and be counted as a spurious failure.
for i, dataset_name in enumerate(clean):
    try:
        print("Running {}/{} ({})".format(i + 1, len_clean, dataset_name))
        # perf_counter is the clock intended for measuring elapsed intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(dataset_name)
        fe_time = time.perf_counter()

        # prepare classifier
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Run the cross-validation once and collect both metrics from the same fits
        # (previously every fold was fitted twice, once per metric).
        # Micro-averaged F1 is what this notebook reports as "Accuracy"; the two
        # coincide for single-label targets — assumes `label` is single-label.
        cv_scores = cross_validate(logRes, result_fe, label,
                                   scoring=['f1_micro', 'f1_macro'], cv=cv, n_jobs=-1)
        acc_score = cv_scores['test_f1_micro']
        f1_score = cv_scores['test_f1_macro']
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when there were no datasets or all of them failed.
    print("Overall Accuracy / F1-Score unavailable: no dataset was evaluated successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.77825 (0.02531)
F1-Score: 0.7777248899748526 (0.02542)
FE Time: 3.407521963119507 seconds
CL time: 2.084932804107666 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8450833333333332 (0.09180)
F1-Score: 0.8332147924173864 (0.09845)
FE Time: 0.7264187335968018 seconds
CL time: 1.2601542472839355 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7939473684210526 (0.09664)
F1-Score: 0.7878638510612196 (0.10035)
FE Time: 0.7091686725616455 seconds
CL time: 0.1207268238067627 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8325000000000001 (0.07790)
F1-Score: 0.8218227246991938 (0.08566)
FE Time: 0.339353084564209 seconds
CL time: 0.12160563468933105 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.78 (0.12300)
F1-Score: 0.756723043683183 (0.12997)
FE Time: 0

### Decision Tree

In [4]:
# Evaluate the decision-tree classifier on every cleaned dataset and report
# per-dataset as well as overall accuracy / F1.
fail = 0       # number of datasets whose evaluation raised an exception

f1_list = []   # mean F1 of each successfully evaluated dataset
acc_list = []  # mean accuracy of each successfully evaluated dataset

# Iterate over the file names themselves instead of range(range_len): range_len is the
# larger of the Clean/Raw listing sizes, so indexing `clean` with it could run past the
# end of the list and be counted as a spurious failure.
for i, dataset_name in enumerate(clean):
    try:
        print("Running {}/{} ({})".format(i + 1, len_clean, dataset_name))
        # perf_counter is the clock intended for measuring elapsed intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(dataset_name)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when there were no datasets or all of them failed.
    print("Overall Accuracy / F1-Score unavailable: no dataset was evaluated successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.7529999999999999 (0.03128)
F1-Score: 0.7520147052266621 (0.03141)
FE Time: 10.465201139450073 seconds
CL time: 11.124239206314087 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7542500000000001 (0.08464)
F1-Score: 0.7415707663964102 (0.08964)
FE Time: 0.9470345973968506 seconds
CL time: 2.405451774597168 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6878947368421051 (0.06125)
F1-Score: 0.6796557301861179 (0.06514)
FE Time: 0.9656224250793457 seconds
CL time: 0.09506392478942871 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7499999999999999 (0.07583)
F1-Score: 0.7380325452548328 (0.07739)
FE Time: 0.3815741539001465 seconds
CL time: 1.0273373126983643 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7257142857142858 (0.11904)
F1-Score: 0.707129700

### KNN

In [4]:
# Evaluate the k-nearest-neighbours classifier on every cleaned dataset and report
# per-dataset as well as overall accuracy / F1.
fail = 0       # number of datasets whose evaluation raised an exception

f1_list = []   # mean F1 of each successfully evaluated dataset
acc_list = []  # mean accuracy of each successfully evaluated dataset

# Iterate over the file names themselves instead of range(range_len): range_len is the
# larger of the Clean/Raw listing sizes, so indexing `clean` with it could run past the
# end of the list and be counted as a spurious failure.
for i, dataset_name in enumerate(clean):
    try:
        print("Running {}/{} ({})".format(i + 1, len_clean, dataset_name))
        # perf_counter is the clock intended for measuring elapsed intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(dataset_name)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when there were no datasets or all of them failed.
    print("Overall Accuracy / F1-Score unavailable: no dataset was evaluated successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.767 (0.02799)
F1-Score: 0.7656108673390014 (0.02825)
FE Time: 4.921276330947876 seconds
CL time: 3.9712071418762207 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6779166666666667 (0.10211)
F1-Score: 0.6475581300725698 (0.11156)
FE Time: 1.0665185451507568 seconds
CL time: 2.4175338745117188 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6552631578947368 (0.09133)
F1-Score: 0.6479583020608563 (0.09369)
FE Time: 0.8802728652954102 seconds
CL time: 0.19184541702270508 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7125 (0.10592)
F1-Score: 0.7014193207478143 (0.10872)
FE Time: 0.7074899673461914 seconds
CL time: 0.19530296325683594 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6616666666666665 (0.09616)
F1-Score: 0.6344615639778178 (0.10739)
FE Tim

### Naive Bayes

In [5]:
# Evaluate the naive Bayes classifier on every cleaned dataset and report
# per-dataset as well as overall accuracy / F1.
fail = 0       # number of datasets whose evaluation raised an exception

f1_list = []   # mean F1 of each successfully evaluated dataset
acc_list = []  # mean accuracy of each successfully evaluated dataset

# Iterate over the file names themselves instead of range(range_len): range_len is the
# larger of the Clean/Raw listing sizes, so indexing `clean` with it could run past the
# end of the list and be counted as a spurious failure.
for i, dataset_name in enumerate(clean):
    try:
        print("Running {}/{} ({})".format(i + 1, len_clean, dataset_name))
        # perf_counter is the clock intended for measuring elapsed intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(dataset_name)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when there were no datasets or all of them failed.
    print("Overall Accuracy / F1-Score unavailable: no dataset was evaluated successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.761 (0.03036)
F1-Score: 0.7590788078108611 (0.03072)
FE Time: 5.516570091247559 seconds
CL time: 0.08007192611694336 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7565 (0.09217)
F1-Score: 0.745048157280482 (0.09398)
FE Time: 1.4196758270263672 seconds
CL time: 0.058815717697143555 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6398684210526315 (0.06316)
F1-Score: 0.580699805586207 (0.09724)
FE Time: 1.5018010139465332 seconds
CL time: 0.055945634841918945 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.77 (0.07810)
F1-Score: 0.7580122811164101 (0.07816)
FE Time: 0.7019503116607666 seconds
CL time: 0.05233192443847656 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7430952380952381 (0.14346)
F1-Score: 0.7187409709145642 (0.14782)
FE Time: 0.3976566

### Random Forest

In [6]:
# Evaluate the random-forest classifier on every cleaned dataset and report
# per-dataset as well as overall accuracy / F1.
fail = 0       # number of datasets whose evaluation raised an exception

f1_list = []   # mean F1 of each successfully evaluated dataset
acc_list = []  # mean accuracy of each successfully evaluated dataset

# Iterate over the file names themselves instead of range(range_len): range_len is the
# larger of the Clean/Raw listing sizes, so indexing `clean` with it could run past the
# end of the list and be counted as a spurious failure.
for i, dataset_name in enumerate(clean):
    try:
        print("Running {}/{} ({})".format(i + 1, len_clean, dataset_name))
        # perf_counter is the clock intended for measuring elapsed intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(dataset_name)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when there were no datasets or all of them failed.
    print("Overall Accuracy / F1-Score unavailable: no dataset was evaluated successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.8130000000000001 (0.02648)
F1-Score: 0.8121923542156271 (0.02668)
FE Time: 5.053743600845337 seconds
CL time: 2.0803864002227783 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8144166666666667 (0.06688)
F1-Score: 0.8004972216872627 (0.07163)
FE Time: 0.987694501876831 seconds
CL time: 1.198204755783081 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7559210526315789 (0.08677)
F1-Score: 0.7500642464223533 (0.08947)
FE Time: 0.9284276962280273 seconds
CL time: 1.2648251056671143 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8225 (0.08871)
F1-Score: 0.8104582148425703 (0.09628)
FE Time: 0.48264217376708984 seconds
CL time: 1.2329480648040771 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7733333333333334 (0.09403)
F1-Score: 0.7476231106616421 (0.105

### SVM

In [7]:
# Evaluate the SVM classifier on every cleaned dataset and report
# per-dataset as well as overall accuracy / F1.
fail = 0       # number of datasets whose evaluation raised an exception

f1_list = []   # mean F1 of each successfully evaluated dataset
acc_list = []  # mean accuracy of each successfully evaluated dataset

# Iterate over the file names themselves instead of range(range_len): range_len is the
# larger of the Clean/Raw listing sizes, so indexing `clean` with it could run past the
# end of the list and be counted as a spurious failure.
for i, dataset_name in enumerate(clean):
    try:
        print("Running {}/{} ({})".format(i + 1, len_clean, dataset_name))
        # perf_counter is the clock intended for measuring elapsed intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(dataset_name)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when there were no datasets or all of them failed.
    print("Overall Accuracy / F1-Score unavailable: no dataset was evaluated successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.77875 (0.02049)
F1-Score: 0.7780835972629606 (0.02090)
FE Time: 4.590083360671997 seconds
CL time: 11.533052682876587 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.835 (0.08600)
F1-Score: 0.8230610460078255 (0.09107)
FE Time: 0.8991131782531738 seconds
CL time: 2.762786388397217 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8113157894736842 (0.08611)
F1-Score: 0.8063315570868911 (0.08841)
FE Time: 1.5943224430084229 seconds
CL time: 3.6690633296966553 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8275 (0.08584)
F1-Score: 0.8183460552819536 (0.09255)
FE Time: 0.5837602615356445 seconds
CL time: 0.8511090278625488 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7771428571428571 (0.12047)
F1-Score: 0.7406965655435335 (0.14291)
FE Time: 0.242176532