In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [2]:
# Discover the cleaned datasets: every entry of ../Dataset/Clean whose name
# ends in ".csv" (extension matched case-insensitively).
files_clean = os.listdir("../Dataset/Clean")

# Bare file names, in sorted order.
clean = sorted(fname for fname in files_clean if fname.lower().endswith(".csv"))

# Relative paths to the same files. Every path shares the same directory
# prefix, so sorting the paths yields the same order as sorting the names.
DATA_CLEAN = sorted(
    "../Dataset/Clean/" + fname
    for fname in files_clean
    if fname.lower().endswith(".csv")
)
len_clean = len(DATA_CLEAN)

In [3]:
warnings.filterwarnings("ignore")

### Decision Tree

In [4]:
# Benchmark: decision_tree() on the term-frequency features of every cleaned
# dataset. Datasets are processed independently; a failure on one dataset is
# reported and counted, and the run continues with the next one.
fail = 0

f1_list = []   # mean F1 score of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# `clean` (file names) and `DATA_CLEAN` (paths) are sorted listings of the same
# files, so zipping them pairs each display name with its path.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is monotonic, so the measured intervals cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label = term_freq(path)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.perf_counter()

        mean_acc = np.mean(acc_score)
        mean_f1 = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(mean_acc, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(mean_f1, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(mean_f1)
        acc_list.append(mean_acc)
    except Exception as e:
        # Deliberately best-effort: record the failure and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average,
    # and dividing by len(acc_list) would raise ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset was processed successfully)")
    print("Overall F1-Score = n/a (no dataset was processed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9749999999999999 (0.01095)
F1-Score: 0.9748832039762592 (0.01101)
FE Time: 0.162672758102417 seconds
CL time: 14.803902864456177 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8266666666666665 (0.06485)
F1-Score: 0.8125273266263827 (0.07267)
FE Time: 0.03606295585632324 seconds
CL time: 2.098287582397461 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7710526315789474 (0.07117)
F1-Score: 0.7591074936826384 (0.07595)
FE Time: 0.027743816375732422 seconds
CL time: 0.546647310256958 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7375 (0.10232)
F1-Score: 0.7284763939558496 (0.10459)
FE Time: 0.014698982238769531 seconds
CL time: 0.3214571475982666 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7471428571428572 (0.11634)
F1-Score: 0.7282032791340068 (0

### KNN

In [5]:
# Benchmark: knn() on the term-frequency features of every cleaned dataset.
# Datasets are processed independently; a failure on one dataset is reported
# and counted, and the run continues with the next one.
fail = 0

f1_list = []   # mean F1 score of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# `clean` (file names) and `DATA_CLEAN` (paths) are sorted listings of the same
# files, so zipping them pairs each display name with its path.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is monotonic, so the measured intervals cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label = term_freq(path)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.perf_counter()

        mean_acc = np.mean(acc_score)
        mean_f1 = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(mean_acc, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(mean_f1, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(mean_f1)
        acc_list.append(mean_acc)
    except Exception as e:
        # Deliberately best-effort: record the failure and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average,
    # and dividing by len(acc_list) would raise ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset was processed successfully)")
    print("Overall F1-Score = n/a (no dataset was processed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9125 (0.01881)
F1-Score: 0.9118564477493903 (0.01875)
FE Time: 0.145341157913208 seconds
CL time: 8.546363353729248 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6917500000000001 (0.07669)
F1-Score: 0.6803972236123099 (0.07918)
FE Time: 0.028119564056396484 seconds
CL time: 0.727942943572998 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6673684210526316 (0.09232)
F1-Score: 0.6549584743340295 (0.09522)
FE Time: 0.024075031280517578 seconds
CL time: 2.009338140487671 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.5425 (0.13535)
F1-Score: 0.4235018374151531 (0.12122)
FE Time: 0.01813030242919922 seconds
CL time: 1.7222261428833008 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6747619047619047 (0.13125)
F1-Score: 0.6529863757617325 (0.13659)
FE Ti

### Logistic Regression

In [6]:
# Benchmark: logistic_regression() on the term-frequency features of every
# cleaned dataset. Datasets are processed independently; a failure on one
# dataset is reported and counted, and the run continues with the next one.
fail = 0

f1_list = []   # mean F1 score of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# `clean` (file names) and `DATA_CLEAN` (paths) are sorted listings of the same
# files, so zipping them pairs each display name with its path.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is monotonic, so the measured intervals cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label = term_freq(path)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = logistic_regression(result_fe, label)
        cl_time = time.perf_counter()

        mean_acc = np.mean(acc_score)
        mean_f1 = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(mean_acc, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(mean_f1, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(mean_f1)
        acc_list.append(mean_acc)
    except Exception as e:
        # Deliberately best-effort: record the failure and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average,
    # and dividing by len(acc_list) would raise ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset was processed successfully)")
    print("Overall F1-Score = n/a (no dataset was processed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9785 (0.00896)
F1-Score: 0.9783920018741522 (0.00900)
FE Time: 0.12827634811401367 seconds
CL time: 13.84012246131897 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.9010000000000001 (0.06454)
F1-Score: 0.8921002253175733 (0.06715)
FE Time: 0.03571438789367676 seconds
CL time: 1.0486109256744385 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8743421052631579 (0.06473)
F1-Score: 0.8689691880000467 (0.06914)
FE Time: 0.02031564712524414 seconds
CL time: 0.6175801753997803 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8450000000000001 (0.09605)
F1-Score: 0.8351115170664961 (0.10224)
FE Time: 0.018467187881469727 seconds
CL time: 0.4137842655181885 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.840952380952381 (0.09521)
F1-Score: 0.829193566478615 (0

### Naive Bayes

In [7]:
# Benchmark: naiveBayes() on the term-frequency features of every cleaned
# dataset. Datasets are processed independently; a failure on one dataset is
# reported and counted, and the run continues with the next one.
fail = 0

f1_list = []   # mean F1 score of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# `clean` (file names) and `DATA_CLEAN` (paths) are sorted listings of the same
# files, so zipping them pairs each display name with its path.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is monotonic, so the measured intervals cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label = term_freq(path)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.perf_counter()

        mean_acc = np.mean(acc_score)
        mean_f1 = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(mean_acc, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(mean_f1, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(mean_f1)
        acc_list.append(mean_acc)
    except Exception as e:
        # Deliberately best-effort: record the failure and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average,
    # and dividing by len(acc_list) would raise ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset was processed successfully)")
    print("Overall F1-Score = n/a (no dataset was processed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.83475 (0.02502)
F1-Score: 0.8334313731804166 (0.02489)
FE Time: 0.14106249809265137 seconds
CL time: 3.6153008937835693 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8048333333333334 (0.10113)
F1-Score: 0.795642415113624 (0.10542)
FE Time: 0.022518396377563477 seconds
CL time: 0.2577211856842041 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8467105263157896 (0.09299)
F1-Score: 0.8413677211597523 (0.09681)
FE Time: 0.024278640747070312 seconds
CL time: 0.3808720111846924 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8200000000000001 (0.06964)
F1-Score: 0.8050340963990902 (0.07607)
FE Time: 0.013020515441894531 seconds
CL time: 0.23278570175170898 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7607142857142858 (0.11597)
F1-Score: 0.7437403917723

### Random Forest

In [8]:
# Benchmark: random_forest() on the term-frequency features of every cleaned
# dataset. Datasets are processed independently; a failure on one dataset is
# reported and counted, and the run continues with the next one.
fail = 0

f1_list = []   # mean F1 score of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# `clean` (file names) and `DATA_CLEAN` (paths) are sorted listings of the same
# files, so zipping them pairs each display name with its path.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is monotonic, so the measured intervals cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label = term_freq(path)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.perf_counter()

        mean_acc = np.mean(acc_score)
        mean_f1 = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(mean_acc, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(mean_f1, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(mean_f1)
        acc_list.append(mean_acc)
    except Exception as e:
        # Deliberately best-effort: record the failure and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average,
    # and dividing by len(acc_list) would raise ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset was processed successfully)")
    print("Overall F1-Score = n/a (no dataset was processed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9817500000000001 (0.01087)
F1-Score: 0.9816621205115006 (0.01096)
FE Time: 0.12281513214111328 seconds
CL time: 33.553619146347046 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8993333333333332 (0.06317)
F1-Score: 0.890469253019754 (0.06683)
FE Time: 0.026356935501098633 seconds
CL time: 1.4581222534179688 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8264473684210525 (0.06949)
F1-Score: 0.8208095166731088 (0.06967)
FE Time: 0.024736404418945312 seconds
CL time: 1.477386713027954 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7925 (0.08842)
F1-Score: 0.7765272689886702 (0.09830)
FE Time: 0.011802196502685547 seconds
CL time: 1.5086894035339355 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8142857142857144 (0.06314)
F1-Score: 0.7936058172395499

### SVM

In [9]:
# Benchmark: svm() on the term-frequency features of every cleaned dataset.
# Datasets are processed independently; a failure on one dataset is reported
# and counted, and the run continues with the next one.
fail = 0

f1_list = []   # mean F1 score of each dataset that completed
acc_list = []  # mean accuracy of each dataset that completed

# `clean` (file names) and `DATA_CLEAN` (paths) are sorted listings of the same
# files, so zipping them pairs each display name with its path.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is monotonic, so the measured intervals cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label = term_freq(path)
        fe_time = time.perf_counter()

        # classification
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.perf_counter()

        mean_acc = np.mean(acc_score)
        mean_f1 = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(mean_acc, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(mean_f1, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(mean_f1)
        acc_list.append(mean_acc)
    except Exception as e:
        # Deliberately best-effort: record the failure and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average,
    # and dividing by len(acc_list) would raise ZeroDivisionError.
    print("Overall Accuracy = n/a (no dataset was processed successfully)")
    print("Overall F1-Score = n/a (no dataset was processed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9734999999999998 (0.01216)
F1-Score: 0.9733516782354558 (0.01225)
FE Time: 0.17994046211242676 seconds
CL time: 101.98016238212585 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8829166666666668 (0.05264)
F1-Score: 0.8713621044116376 (0.06159)
FE Time: 0.0363922119140625 seconds
CL time: 1.5510585308074951 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8767105263157896 (0.07013)
F1-Score: 0.8708898429870633 (0.07591)
FE Time: 0.026862621307373047 seconds
CL time: 2.1907541751861572 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8450000000000001 (0.06103)
F1-Score: 0.8314037212594219 (0.07218)
FE Time: 0.012577056884765625 seconds
CL time: 0.8218176364898682 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8307142857142857 (0.10077)
F1-Score: 0.8203