In [1]:
import datetime
import os
import time
import warnings

import h5py
import numpy as np

# NOTE: the relative order of the wildcard imports below is kept as-is,
# because a later `import *` can shadow names bound by an earlier one.
from featureExtraction.combination.ortho_pos import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, cross_validate

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
def _csv_names(directory):
    """Return the sorted names of the ``.csv`` files found in ``directory``.

    The extension check is case-insensitive. Only the bare file names are
    returned, not paths.
    """
    return sorted(x for x in os.listdir(directory) if x.lower().endswith(".csv"))


# Each directory is listed exactly once and the path list is derived from the
# name list, so `clean`/`DATA_CLEAN` (and `templated`/`DATA_TEMPLATED`) are
# guaranteed to be parallel. Prefixing a common directory does not change the
# sort order, so the path lists are already sorted.
clean = _csv_names("../Dataset/Clean")
DATA_CLEAN = ["../Dataset/Clean/" + x for x in clean]
len_clean = len(DATA_CLEAN)

templated = _csv_names("../Dataset/Raw")
DATA_TEMPLATED = ["../Dataset/Raw/" + x for x in templated]
len_templated = len(DATA_TEMPLATED)

# Larger of the two file counts.
range_len = max(len_clean, len_templated)

In [3]:
warnings.filterwarnings("ignore")

### Logistic Regression

In [4]:
# Logistic-regression benchmark: for every file name in `clean`, extract the
# ortho_pos features and score a LogisticRegression with 20-fold shuffled CV.
# A dataset that raises is reported and counted in `fail`; the loop continues.
fail = 0

f1_list = []
acc_list = []

# Iterate over the files that actually exist in `clean`. Looping over
# `range_len` (the max of the clean/raw counts) would index past the end of
# `clean` whenever the raw directory holds more files, and each of those
# iterations would be reported as a failure.
for i, file_name in enumerate(clean):
    print("Running {}/{} ({})".format(i + 1, len(clean), file_name))
    try:
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(file_name)
        fe_time = time.time()

        # prepare classifier
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # A single cross-validation pass produces both metrics, instead of
        # fitting every fold twice. The value reported as "Accuracy" is the
        # micro-averaged F1, which coincides with accuracy for single-label
        # targets.
        cv_scores = cross_validate(
            logRes,
            result_fe,
            label,
            scoring=("f1_micro", "f1_macro"),
            cv=cv,
            n_jobs=-1,
        )
        acc_score = cv_scores["test_f1_micro"]
        f1_score = cv_scores["test_f1_macro"]
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset succeeded: there is nothing to average, so skip the division
    # instead of raising ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len(clean)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.72775 (0.02653)
F1-Score: 0.7267511268204638 (0.02634)
FE Time: 0.3435359001159668 seconds
CL time: 2.1920857429504395 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6373333333333334 (0.09029)
F1-Score: 0.5960334938264744 (0.09349)
FE Time: 0.07271003723144531 seconds
CL time: 1.1212151050567627 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.605263157894737 (0.11524)
F1-Score: 0.5934618159001825 (0.11902)
FE Time: 0.06687498092651367 seconds
CL time: 0.5334012508392334 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5775 (0.09549)
F1-Score: 0.566117765740713 (0.10144)
FE Time: 0.03881072998046875 seconds
CL time: 0.10080170631408691 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.554047619047619 (0.11953)
F1-Score: 0.5089184372988552 (0.12241)
FE T

### Decision Tree

In [4]:
# Decision-tree benchmark: for every file name in `clean`, extract the
# ortho_pos features and score them with `decision_tree`.
# A dataset that raises is reported and counted in `fail`; the loop continues.
fail = 0

f1_list = []
acc_list = []

# Iterate over the files that actually exist in `clean`. Looping over
# `range_len` (the max of the clean/raw counts) would index past the end of
# `clean` whenever the raw directory holds more files, and each of those
# iterations would be reported as a failure.
for i, file_name in enumerate(clean):
    print("Running {}/{} ({})".format(i + 1, len(clean), file_name))
    try:
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(file_name)
        fe_time = time.time()

        # classification
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset succeeded: there is nothing to average, so skip the division
    # instead of raising ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len(clean)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.71525 (0.02557)
F1-Score: 0.714350081607857 (0.02534)
FE Time: 0.6328558921813965 seconds
CL time: 3.2566158771514893 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5505833333333334 (0.07983)
F1-Score: 0.5213193479159305 (0.08860)
FE Time: 0.08509659767150879 seconds
CL time: 2.583143472671509 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5917105263157896 (0.08290)
F1-Score: 0.5790124691008727 (0.08006)
FE Time: 0.1803574562072754 seconds
CL time: 0.07962417602539062 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5925000000000001 (0.10871)
F1-Score: 0.5793467643262199 (0.10733)
FE Time: 0.09975647926330566 seconds
CL time: 0.0990457534790039 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.4828571428571428 (0.14510)
F1-Score: 0.4555883345707851 (0

### KNN

In [5]:
# KNN benchmark: for every file name in `clean`, extract the ortho_pos
# features and score them with `knn`.
# A dataset that raises is reported and counted in `fail`; the loop continues.
fail = 0

f1_list = []
acc_list = []

# Iterate over the files that actually exist in `clean`. Looping over
# `range_len` (the max of the clean/raw counts) would index past the end of
# `clean` whenever the raw directory holds more files, and each of those
# iterations would be reported as a failure.
for i, file_name in enumerate(clean):
    print("Running {}/{} ({})".format(i + 1, len(clean), file_name))
    try:
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(file_name)
        fe_time = time.time()

        # classification
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset succeeded: there is nothing to average, so skip the division
    # instead of raising ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len(clean)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.74875 (0.02207)
F1-Score: 0.7465629835964303 (0.02202)
FE Time: 0.35521888732910156 seconds
CL time: 0.31888484954833984 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5926666666666667 (0.10537)
F1-Score: 0.5586470415206893 (0.10904)
FE Time: 0.07324934005737305 seconds
CL time: 0.15944170951843262 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5593421052631579 (0.11571)
F1-Score: 0.5524739707877412 (0.11630)
FE Time: 0.068756103515625 seconds
CL time: 0.13672423362731934 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5925 (0.09523)
F1-Score: 0.5814051561146559 (0.09199)
FE Time: 0.04189872741699219 seconds
CL time: 0.13563942909240723 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.4895238095238096 (0.11253)
F1-Score: 0.46285340927562296 (0.11983

### Naive Bayes

In [4]:
# Naive-Bayes benchmark: for every file name in `clean`, extract the
# ortho_pos features and score them with `naiveBayes`.
# A dataset that raises is reported and counted in `fail`; the loop continues.
fail = 0

f1_list = []
acc_list = []

# Iterate over the files that actually exist in `clean`. Looping over
# `range_len` (the max of the clean/raw counts) would index past the end of
# `clean` whenever the raw directory holds more files, and each of those
# iterations would be reported as a failure.
for i, file_name in enumerate(clean):
    print("Running {}/{} ({})".format(i + 1, len(clean), file_name))
    try:
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(file_name)
        fe_time = time.time()

        # classification
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset succeeded: there is nothing to average, so skip the division
    # instead of raising ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len(clean)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.718 (0.02926)
F1-Score: 0.7136690289509835 (0.02902)
FE Time: 0.44246506690979004 seconds
CL time: 3.2615911960601807 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.57675 (0.12630)
F1-Score: 0.4976983861058635 (0.12554)
FE Time: 0.08715605735778809 seconds
CL time: 2.0363714694976807 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5389473684210526 (0.07819)
F1-Score: 0.41617498774965406 (0.06303)
FE Time: 0.08150458335876465 seconds
CL time: 0.07317829132080078 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5925 (0.10639)
F1-Score: 0.5745727753237176 (0.10951)
FE Time: 0.04799771308898926 seconds
CL time: 0.0695500373840332 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.46547619047619043 (0.13860)
F1-Score: 0.3227321935292389 (0.08635)
FE Time: 0.

### Random Forest

In [4]:
# Random-forest benchmark: for every file name in `clean`, extract the
# ortho_pos features and score them with `random_forest`.
# A dataset that raises is reported and counted in `fail`; the loop continues.
fail = 0

f1_list = []
acc_list = []

# Iterate over the files that actually exist in `clean`. Looping over
# `range_len` (the max of the clean/raw counts) would index past the end of
# `clean` whenever the raw directory holds more files, and each of those
# iterations would be reported as a failure.
for i, file_name in enumerate(clean):
    print("Running {}/{} ({})".format(i + 1, len(clean), file_name))
    try:
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(file_name)
        fe_time = time.time()

        # classification
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset succeeded: there is nothing to average, so skip the division
    # instead of raising ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len(clean)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.7420000000000001 (0.02795)
F1-Score: 0.7407004247126991 (0.02798)
FE Time: 0.5941550731658936 seconds
CL time: 5.125384569168091 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5971666666666668 (0.10234)
F1-Score: 0.55951872737217 (0.11536)
FE Time: 0.11448955535888672 seconds
CL time: 3.591813564300537 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5818421052631579 (0.10830)
F1-Score: 0.5724256790643585 (0.11411)
FE Time: 0.10463786125183105 seconds
CL time: 1.0962600708007812 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5974999999999999 (0.10545)
F1-Score: 0.5833672788234135 (0.11115)
FE Time: 0.06322526931762695 seconds
CL time: 1.1740436553955078 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5266666666666666 (0.12994)
F1-Score: 0.4965734871

### SVM

In [5]:
# SVM benchmark: for every file name in `clean`, extract the ortho_pos
# features and score them with `svm`.
# A dataset that raises is reported and counted in `fail`; the loop continues.
fail = 0

f1_list = []
acc_list = []

# Iterate over the files that actually exist in `clean`. Looping over
# `range_len` (the max of the clean/raw counts) would index past the end of
# `clean` whenever the raw directory holds more files, and each of those
# iterations would be reported as a failure.
for i, file_name in enumerate(clean):
    print("Running {}/{} ({})".format(i + 1, len(clean), file_name))
    try:
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = ortho_pos(file_name)
        fe_time = time.time()

        # classification
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset index and keep going.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset succeeded: there is nothing to average, so skip the division
    # instead of raising ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset succeeded)")
    print("Overall F1-Score = n/a (no dataset succeeded)")
print("Failed = {}/{}".format(fail, len(clean)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.7310000000000001 (0.02493)
F1-Score: 0.7289990376860526 (0.02524)
FE Time: 0.47568488121032715 seconds
CL time: 10.705082178115845 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6393333333333333 (0.08670)
F1-Score: 0.5991923499140326 (0.08995)
FE Time: 0.09943675994873047 seconds
CL time: 2.6514482498168945 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6103947368421052 (0.11709)
F1-Score: 0.5962657841832865 (0.11977)
FE Time: 0.09521150588989258 seconds
CL time: 3.4085159301757812 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5874999999999999 (0.09858)
F1-Score: 0.5766284167142292 (0.10141)
FE Time: 0.05012965202331543 seconds
CL time: 0.6985263824462891 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5235714285714286 (0.12457)
F1-Score: 0.45007