In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np

from featureExtraction.tf.tf import *
from featureExtraction.lexicon.lexicon import *

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Discover the cleaned CSV datasets. `clean` holds the bare file names and
# `DATA_CLEAN` the matching relative paths, both in the same sorted order.
files_clean = os.listdir("../Dataset/Clean")
clean = sorted(name for name in files_clean if name.lower().endswith(".csv"))

# Every path shares the same prefix, so prefixing the already-sorted names
# yields the same order as sorting the full paths.
DATA_CLEAN = ["../Dataset/Clean/" + name for name in clean]
len_clean = len(DATA_CLEAN)

In [3]:
# Suppress all warnings from this point on so the per-dataset logs printed by
# the benchmark cells stay readable.
# NOTE(review): with no category given this hides every warning category,
# including any raised by the feature extractors or classifiers -- consider
# narrowing the filter.
warnings.filterwarnings("ignore")

### Decision Tree

In [4]:
# Benchmark the Decision Tree classifier on every cleaned dataset, using the
# term-frequency features concatenated with the combined-lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same
# directory listing in the same order, so zipping them pairs each path with
# its display name; `count` is the 1-based position used in the progress line.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        # Both extractors return labels; the ones from the lexicon extractor
        # overwrite the first and are the ones used for classification
        # (presumably identical -- TODO confirm).
        result_fe1, label = term_freq(path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        # Compute each mean once so the printed and the stored values agree.
        acc_mean = np.mean(acc_score)
        f1_mean = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(acc_mean, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(f1_mean, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(f1_mean)
        acc_list.append(acc_mean)
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none was found): there is nothing
    # to average, and dividing by len([]) would raise before the failure
    # count below could be reported.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.97325 (0.01207)
F1-Score: 0.9731290508914956 (0.01212)
FE Time: 4.364295721054077 seconds
CL time: 13.124659776687622 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.81075 (0.07554)
F1-Score: 0.794048254496792 (0.08577)
FE Time: 0.8557131290435791 seconds
CL time: 1.7651622295379639 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7385526315789473 (0.08064)
F1-Score: 0.727160197786813 (0.08743)
FE Time: 0.824704647064209 seconds
CL time: 0.3876051902770996 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8175000000000001 (0.08700)
F1-Score: 0.8075201813655003 (0.09462)
FE Time: 0.3699824810028076 seconds
CL time: 0.1338481903076172 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.753095238095238 (0.10327)
F1-Score: 0.7245715603887295 (0.11480)
FE Time: 

### KNN

In [5]:
# Benchmark the KNN classifier on every cleaned dataset, using the
# term-frequency features concatenated with the combined-lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same
# directory listing in the same order, so zipping them pairs each path with
# its display name; `count` is the 1-based position used in the progress line.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        # Both extractors return labels; the ones from the lexicon extractor
        # overwrite the first and are the ones used for classification
        # (presumably identical -- TODO confirm).
        result_fe1, label = term_freq(path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        # Compute each mean once so the printed and the stored values agree.
        acc_mean = np.mean(acc_score)
        f1_mean = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(acc_mean, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(f1_mean, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(f1_mean)
        acc_list.append(acc_mean)
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none was found): there is nothing
    # to average, and dividing by len([]) would raise before the failure
    # count below could be reported.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.84175 (0.02570)
F1-Score: 0.8406345703824358 (0.02609)
FE Time: 4.804242849349976 seconds
CL time: 7.999169111251831 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8105 (0.07802)
F1-Score: 0.7965270007549891 (0.08451)
FE Time: 0.9657540321350098 seconds
CL time: 0.4369974136352539 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7059210526315789 (0.09433)
F1-Score: 0.6944052205104836 (0.10140)
FE Time: 0.929149866104126 seconds
CL time: 1.6210567951202393 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7024999999999999 (0.07980)
F1-Score: 0.686015979595802 (0.08412)
FE Time: 0.43130993843078613 seconds
CL time: 1.328442096710205 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7126190476190475 (0.10557)
F1-Score: 0.6967621709795118 (0.11313)
FE Time: 

### Logistic Regression

In [6]:
# Benchmark the Logistic Regression classifier on every cleaned dataset, using
# the term-frequency features concatenated with the combined-lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same
# directory listing in the same order, so zipping them pairs each path with
# its display name; `count` is the 1-based position used in the progress line.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        # Both extractors return labels; the ones from the lexicon extractor
        # overwrite the first and are the ones used for classification
        # (presumably identical -- TODO confirm).
        result_fe1, label = term_freq(path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        acc_score, f1_score = logistic_regression(result_fe, label)
        cl_time = time.time()

        # Compute each mean once so the printed and the stored values agree.
        acc_mean = np.mean(acc_score)
        f1_mean = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(acc_mean, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(f1_mean, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(f1_mean)
        acc_list.append(acc_mean)
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none was found): there is nothing
    # to average, and dividing by len([]) would raise before the failure
    # count below could be reported.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9795 (0.00835)
F1-Score: 0.9793936427473271 (0.00838)
FE Time: 4.855054616928101 seconds
CL time: 58.25881338119507 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.9074166666666666 (0.06205)
F1-Score: 0.9003793456963027 (0.06610)
FE Time: 0.9817674160003662 seconds
CL time: 1.2968907356262207 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8717105263157896 (0.07183)
F1-Score: 0.8675405249074821 (0.07541)
FE Time: 0.8906748294830322 seconds
CL time: 1.654846429824829 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.885 (0.05500)
F1-Score: 0.8765730464299516 (0.06190)
FE Time: 0.39706993103027344 seconds
CL time: 0.7924349308013916 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8245238095238095 (0.09361)
F1-Score: 0.8098039429816082 (0.09743)
FE Time: 

### Naive Bayes

In [7]:
# Benchmark the Naive Bayes classifier on every cleaned dataset, using the
# term-frequency features concatenated with the combined-lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same
# directory listing in the same order, so zipping them pairs each path with
# its display name; `count` is the 1-based position used in the progress line.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        # Both extractors return labels; the ones from the lexicon extractor
        # overwrite the first and are the ones used for classification
        # (presumably identical -- TODO confirm).
        result_fe1, label = term_freq(path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        # Compute each mean once so the printed and the stored values agree.
        acc_mean = np.mean(acc_score)
        f1_mean = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(acc_mean, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(f1_mean, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(f1_mean)
        acc_list.append(acc_mean)
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none was found): there is nothing
    # to average, and dividing by len([]) would raise before the failure
    # count below could be reported.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.8442500000000001 (0.02276)
F1-Score: 0.8432537478709969 (0.02265)
FE Time: 4.576686859130859 seconds
CL time: 3.824176073074341 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8048333333333334 (0.10113)
F1-Score: 0.795642415113624 (0.10542)
FE Time: 0.9415302276611328 seconds
CL time: 0.34356164932250977 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8467105263157896 (0.09299)
F1-Score: 0.8413677211597523 (0.09681)
FE Time: 0.909574031829834 seconds
CL time: 0.3563988208770752 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8200000000000001 (0.06964)
F1-Score: 0.8050340963990902 (0.07607)
FE Time: 0.40496110916137695 seconds
CL time: 0.2802245616912842 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7607142857142858 (0.11597)
F1-Score: 0.74374039177

### Random Forest

In [8]:
# Benchmark the Random Forest classifier on every cleaned dataset, using the
# term-frequency features concatenated with the combined-lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same
# directory listing in the same order, so zipping them pairs each path with
# its display name; `count` is the 1-based position used in the progress line.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        # Both extractors return labels; the ones from the lexicon extractor
        # overwrite the first and are the ones used for classification
        # (presumably identical -- TODO confirm).
        result_fe1, label = term_freq(path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        # Compute each mean once so the printed and the stored values agree.
        acc_mean = np.mean(acc_score)
        f1_mean = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(acc_mean, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(f1_mean, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(f1_mean)
        acc_list.append(acc_mean)
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none was found): there is nothing
    # to average, and dividing by len([]) would raise before the failure
    # count below could be reported.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9782499999999998 (0.00810)
F1-Score: 0.9781433211354326 (0.00818)
FE Time: 4.623480319976807 seconds
CL time: 29.201092958450317 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.9033333333333333 (0.05271)
F1-Score: 0.8912148568490533 (0.06312)
FE Time: 0.9928708076477051 seconds
CL time: 1.1824116706848145 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8639473684210527 (0.05970)
F1-Score: 0.8575331520138526 (0.06194)
FE Time: 0.8931217193603516 seconds
CL time: 1.1155076026916504 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8675 (0.07628)
F1-Score: 0.8588966854547617 (0.08322)
FE Time: 0.4060976505279541 seconds
CL time: 1.0111539363861084 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8240476190476193 (0.08598)
F1-Score: 0.8001546606390125 (0.10

### SVM

In [9]:
# Benchmark the SVM classifier on every cleaned dataset, using the
# term-frequency features concatenated with the combined-lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are built from the same
# directory listing in the same order, so zipping them pairs each path with
# its display name; `count` is the 1-based position used in the progress line.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        # Both extractors return labels; the ones from the lexicon extractor
        # overwrite the first and are the ones used for classification
        # (presumably identical -- TODO confirm).
        result_fe1, label = term_freq(path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        # Compute each mean once so the printed and the stored values agree.
        acc_mean = np.mean(acc_score)
        f1_mean = np.mean(f1_score)

        print('Accuracy: {} ({:.5f})'.format(acc_mean, np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(f1_mean, np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(f1_mean)
        acc_list.append(acc_mean)
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none was found): there is nothing
    # to average, and dividing by len([]) would raise before the failure
    # count below could be reported.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9732499999999998 (0.01228)
F1-Score: 0.9731097172194912 (0.01231)
FE Time: 4.651551723480225 seconds
CL time: 105.22592854499817 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.9031666666666667 (0.05491)
F1-Score: 0.8963309760127467 (0.05903)
FE Time: 0.9959704875946045 seconds
CL time: 1.0802743434906006 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.9019736842105261 (0.06634)
F1-Score: 0.8977883937845406 (0.07115)
FE Time: 0.9502184391021729 seconds
CL time: 1.6697101593017578 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8700000000000001 (0.06205)
F1-Score: 0.8611780238053628 (0.06890)
FE Time: 0.4363365173339844 seconds
CL time: 0.6703317165374756 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.840952380952381 (0.08642)
F1-Score: 0.82924185871