In [1]:
import os
import h5py
import time
import datetime
import warnings
import numpy as np
from nltk.util import usage

from featureExtraction.orthography.orthography import *

from classifier.knn.knn import *
from classifier.svm.svm import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [2]:
# Discover the raw datasets: every entry of ../Dataset/Raw whose name ends in
# ".csv" (extension matched case-insensitively).
files_templated = os.listdir("../Dataset/Raw")

# Bare file names, sorted; the benchmark cells use them as progress labels.
templated = sorted(
    name for name in files_templated if name.lower().endswith(".csv")
)

# Relative paths to the same files. Every path shares one prefix, so sorting
# the paths gives the same order as sorting the bare names above and the two
# lists stay index-aligned.
DATA_TEMPLATED = sorted(
    "../Dataset/Raw/" + name
    for name in files_templated
    if name.lower().endswith(".csv")
)
len_templated = len(DATA_TEMPLATED)

In [3]:
warnings.filterwarnings("ignore")

### Decision Tree

In [4]:
# Benchmark the decision-tree classifier on every dataset in DATA_TEMPLATED.
#
# For each CSV path: extract features with run_ortografi, score them with
# decision_tree, print the mean/std of the returned accuracy and F1 scores
# (presumably one value per cross-validation fold -- TODO confirm) together
# with the wall-clock time of both stages, and remember the per-dataset means.
# A dataset that raises is reported and counted in `fail`; the run continues.
fail = 0

f1_list = []   # mean F1 of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# enumerate() numbers the datasets from 1, so the progress counter cannot
# drift between the success and failure paths.
for count, i in enumerate(DATA_TEMPLATED, start=1):
    try:
        # The label is derived from the path itself instead of indexing a
        # parallel list of file names.
        print("Running {}/{} ({})".format(count, len_templated, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = run_ortografi(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best-effort: one broken dataset must not abort the whole
        # benchmark, so report it and move on.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (all datasets failed, or none were found); dividing by
    # len(acc_list) would raise ZeroDivisionError and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_templated))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.7390000000000001 (0.01814)
F1-Score: 0.7382151629079731 (0.01825)
FE Time: 0.08198380470275879 seconds
CL time: 3.176483392715454 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.55 (0.09198)
F1-Score: 0.5333638598373419 (0.10245)
FE Time: 0.01799941062927246 seconds
CL time: 1.4775559902191162 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5175000000000001 (0.12070)
F1-Score: 0.506601401295842 (0.12146)
FE Time: 0.013782739639282227 seconds
CL time: 0.15313506126403809 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5525000000000001 (0.10183)
F1-Score: 0.5401724100028885 (0.10106)
FE Time: 0.013083696365356445 seconds
CL time: 0.05421280860900879 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.54 (0.14892)
F1-Score: 0.5214075909470647 (0.14912)
FE T

### KNN

In [5]:
# Benchmark the k-nearest-neighbours classifier on every dataset in
# DATA_TEMPLATED.
#
# For each CSV path: extract features with run_ortografi, score them with
# knn, print the mean/std of the returned accuracy and F1 scores (presumably
# one value per cross-validation fold -- TODO confirm) together with the
# wall-clock time of both stages, and remember the per-dataset means.
# A dataset that raises is reported and counted in `fail`; the run continues.
fail = 0

f1_list = []   # mean F1 of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# enumerate() numbers the datasets from 1, so the progress counter cannot
# drift between the success and failure paths.
for count, i in enumerate(DATA_TEMPLATED, start=1):
    try:
        # The label is derived from the path itself instead of indexing a
        # parallel list of file names.
        print("Running {}/{} ({})".format(count, len_templated, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = run_ortografi(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best-effort: one broken dataset must not abort the whole
        # benchmark, so report it and move on.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (all datasets failed, or none were found); dividing by
    # len(acc_list) would raise ZeroDivisionError and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_templated))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.75625 (0.02109)
F1-Score: 0.7542897837029782 (0.02136)
FE Time: 0.06911253929138184 seconds
CL time: 0.30179691314697266 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.58 (0.09165)
F1-Score: 0.5339885413986791 (0.10288)
FE Time: 0.01552581787109375 seconds
CL time: 0.1511213779449463 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5624999999999999 (0.10592)
F1-Score: 0.550454498527153 (0.10586)
FE Time: 0.024401187896728516 seconds
CL time: 0.13654756546020508 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5775 (0.10305)
F1-Score: 0.5654092679112365 (0.10181)
FE Time: 0.009982824325561523 seconds
CL time: 0.12284612655639648 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5233333333333332 (0.12342)
F1-Score: 0.5054067926397646 (0.12690)
FE Time: 0.

### Logistic Regression

In [6]:
# Benchmark the logistic-regression classifier on every dataset in
# DATA_TEMPLATED.
#
# For each CSV path: extract features with run_ortografi, score them with
# logistic_regression, print the mean/std of the returned accuracy and F1
# scores (presumably one value per cross-validation fold -- TODO confirm)
# together with the wall-clock time of both stages, and remember the
# per-dataset means.
# A dataset that raises is reported and counted in `fail`; the run continues.
fail = 0

f1_list = []   # mean F1 of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# enumerate() numbers the datasets from 1, so the progress counter cannot
# drift between the success and failure paths.
for count, i in enumerate(DATA_TEMPLATED, start=1):
    try:
        # The label is derived from the path itself instead of indexing a
        # parallel list of file names.
        print("Running {}/{} ({})".format(count, len_templated, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = run_ortografi(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = logistic_regression(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best-effort: one broken dataset must not abort the whole
        # benchmark, so report it and move on.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (all datasets failed, or none were found); dividing by
    # len(acc_list) would raise ZeroDivisionError and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_templated))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.7297499999999999 (0.02003)
F1-Score: 0.7286777767087915 (0.01980)
FE Time: 0.06772470474243164 seconds
CL time: 0.15278959274291992 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.632 (0.07222)
F1-Score: 0.5845416892987776 (0.08826)
FE Time: 0.02904534339904785 seconds
CL time: 0.10243010520935059 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5850000000000001 (0.13143)
F1-Score: 0.5733409171418662 (0.13033)
FE Time: 0.012587785720825195 seconds
CL time: 0.07755041122436523 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6 (0.09618)
F1-Score: 0.5887265639734849 (0.10007)
FE Time: 0.009798288345336914 seconds
CL time: 0.0825510025024414 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5199999999999999 (0.12754)
F1-Score: 0.4069953581252208 (0.11412)
F

### Naive Bayes

In [7]:
# Benchmark the naive-Bayes classifier on every dataset in DATA_TEMPLATED.
#
# For each CSV path: extract features with run_ortografi, score them with
# naiveBayes, print the mean/std of the returned accuracy and F1 scores
# (presumably one value per cross-validation fold -- TODO confirm) together
# with the wall-clock time of both stages, and remember the per-dataset means.
# A dataset that raises is reported and counted in `fail`; the run continues.
fail = 0

f1_list = []   # mean F1 of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# enumerate() numbers the datasets from 1, so the progress counter cannot
# drift between the success and failure paths.
for count, i in enumerate(DATA_TEMPLATED, start=1):
    try:
        # The label is derived from the path itself instead of indexing a
        # parallel list of file names.
        print("Running {}/{} ({})".format(count, len_templated, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = run_ortografi(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best-effort: one broken dataset must not abort the whole
        # benchmark, so report it and move on.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (all datasets failed, or none were found); dividing by
    # len(acc_list) would raise ZeroDivisionError and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_templated))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.72525 (0.02089)
F1-Score: 0.720285848069019 (0.01985)
FE Time: 0.06948184967041016 seconds
CL time: 0.07164406776428223 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5820000000000001 (0.10843)
F1-Score: 0.4792505658932429 (0.11908)
FE Time: 0.014797210693359375 seconds
CL time: 0.057359933853149414 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5425 (0.11861)
F1-Score: 0.44259939417378236 (0.11013)
FE Time: 0.013527154922485352 seconds
CL time: 0.05344033241271973 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6024999999999999 (0.09934)
F1-Score: 0.5890206572696982 (0.10347)
FE Time: 0.009850025177001953 seconds
CL time: 0.045304298400878906 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.47000000000000003 (0.14526)
F1-Score: 0.45296281032349145 

### Random Forest

In [8]:
# Benchmark the random-forest classifier on every dataset in DATA_TEMPLATED.
#
# For each CSV path: extract features with run_ortografi, score them with
# random_forest, print the mean/std of the returned accuracy and F1 scores
# (presumably one value per cross-validation fold -- TODO confirm) together
# with the wall-clock time of both stages, and remember the per-dataset means.
# A dataset that raises is reported and counted in `fail`; the run continues.
fail = 0

f1_list = []   # mean F1 of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# enumerate() numbers the datasets from 1, so the progress counter cannot
# drift between the success and failure paths.
for count, i in enumerate(DATA_TEMPLATED, start=1):
    try:
        # The label is derived from the path itself instead of indexing a
        # parallel list of file names.
        print("Running {}/{} ({})".format(count, len_templated, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = run_ortografi(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best-effort: one broken dataset must not abort the whole
        # benchmark, so report it and move on.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (all datasets failed, or none were found); dividing by
    # len(acc_list) would raise ZeroDivisionError and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_templated))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.755 (0.01658)
F1-Score: 0.7534453650825476 (0.01709)
FE Time: 0.06781864166259766 seconds
CL time: 1.900665283203125 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.588 (0.07909)
F1-Score: 0.5542104589238221 (0.08616)
FE Time: 0.016968727111816406 seconds
CL time: 1.5985636711120605 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5549999999999999 (0.11391)
F1-Score: 0.5454001745158867 (0.11450)
FE Time: 0.028063058853149414 seconds
CL time: 0.8564565181732178 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6025 (0.09148)
F1-Score: 0.580507073691095 (0.10184)
FE Time: 0.009850740432739258 seconds
CL time: 0.9178240299224854 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.55 (0.13964)
F1-Score: 0.5269752364975477 (0.14539)
FE Time: 0.015120983123779297

### SVM

In [9]:
# Benchmark the SVM classifier on every dataset in DATA_TEMPLATED.
#
# For each CSV path: extract features with run_ortografi, score them with
# svm, print the mean/std of the returned accuracy and F1 scores (presumably
# one value per cross-validation fold -- TODO confirm) together with the
# wall-clock time of both stages, and remember the per-dataset means.
# A dataset that raises is reported and counted in `fail`; the run continues.
fail = 0

f1_list = []   # mean F1 of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# enumerate() numbers the datasets from 1, so the progress counter cannot
# drift between the success and failure paths.
for count, i in enumerate(DATA_TEMPLATED, start=1):
    try:
        # The label is derived from the path itself instead of indexing a
        # parallel list of file names.
        print("Running {}/{} ({})".format(count, len_templated, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = run_ortografi(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best-effort: one broken dataset must not abort the whole
        # benchmark, so report it and move on.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing completed (all datasets failed, or none were found); dividing by
    # len(acc_list) would raise ZeroDivisionError and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_templated))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.7317500000000001 (0.02304)
F1-Score: 0.7298695988469477 (0.02326)
FE Time: 0.06713271141052246 seconds
CL time: 9.498273372650146 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6320000000000001 (0.08542)
F1-Score: 0.5883169856694577 (0.09442)
FE Time: 0.031336307525634766 seconds
CL time: 2.5278866291046143 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.585 (0.12359)
F1-Score: 0.5608824216887045 (0.12548)
FE Time: 0.02311253547668457 seconds
CL time: 4.323678731918335 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5824999999999999 (0.09121)
F1-Score: 0.5708053522358948 (0.09400)
FE Time: 0.016712665557861328 seconds
CL time: 0.5904309749603271 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5366666666666666 (0.11250)
F1-Score: 0.3455797949190398 (