In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np

from featureExtraction.lexicon.lexicon import *

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [2]:
# Discover the cleaned datasets: list the directory once, keep only the CSV
# files, and expose them both as bare file names (`clean`) and as relative
# paths (`DATA_CLEAN`). Both lists are in the same sorted order, so an index
# into one refers to the same file in the other.
files_clean = os.listdir("../Dataset/Clean")

clean = sorted(name for name in files_clean if name.lower().endswith(".csv"))

# Every path shares the same prefix, so prefixing the sorted names yields the
# sorted paths directly.
DATA_CLEAN = ["../Dataset/Clean/" + name for name in clean]
len_clean = len(DATA_CLEAN)

In [3]:
# Ignore every warning from this point on: no category, message or module
# filter is given, so the "ignore" action applies to all of them.
# NOTE(review): this also hides any deprecation or convergence warnings the
# runs below might emit -- confirm that blanket suppression is intended.
warnings.filterwarnings("ignore")

### Decision Tree

In [4]:
# Benchmark: combined-lexicon features + decision tree, one run per cleaned dataset.
# For each dataset the cell prints the mean/std of accuracy and F1 plus the
# feature-extraction and classification timings, then prints the aggregate
# over the datasets that completed.
# NOTE(review): run_lexiconCombined_tweet is re-run on the same files in every
# classifier section of this notebook; consider computing the features once.
count = 0  # 1-based progress counter; stays 0 when there is nothing to run
fail = 0   # datasets that raised during feature extraction or classification

f1_list = []   # mean F1 per dataset, successful runs only
acc_list = []  # mean accuracy per dataset, successful runs only

# `clean` (file names) and `DATA_CLEAN` (paths) come from the same directory
# listing and are both sorted, so they are walked in lockstep. enumerate
# supplies the counter, so it advances exactly once per dataset whether the
# run succeeds or fails.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = run_lexiconCombined_tweet(path)
        fe_time = time.perf_counter()

        # Classification; the two results are aggregated with mean/std below,
        # so they are presumably collections of per-fold scores -- confirm
        # against decision_tree.
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none were found): the averages are
    # undefined, so report that instead of dividing by zero.
    print("Overall Accuracy = n/a (no successful runs)")
    print("Overall F1-Score = n/a (no successful runs)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.5810000000000001 (0.02606)
F1-Score: 0.5798069376911948 (0.02585)
FE Time: 4.821568727493286 seconds
CL time: 3.430558681488037 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7960833333333334 (0.07487)
F1-Score: 0.7878102788372995 (0.07365)
FE Time: 1.083655595779419 seconds
CL time: 2.7199175357818604 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7046052631578947 (0.10300)
F1-Score: 0.6936166267245316 (0.10868)
FE Time: 1.5905275344848633 seconds
CL time: 0.06528091430664062 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.77 (0.09670)
F1-Score: 0.7544407047934583 (0.10801)
FE Time: 0.5800747871398926 seconds
CL time: 0.19677162170410156 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7466666666666666 (0.10830)
F1-Score: 0.7259354726744518 (0.1122

### KNN

In [5]:
# Benchmark: combined-lexicon features + k-nearest-neighbours, one run per cleaned dataset.
# For each dataset the cell prints the mean/std of accuracy and F1 plus the
# feature-extraction and classification timings, then prints the aggregate
# over the datasets that completed.
# NOTE(review): run_lexiconCombined_tweet is re-run on the same files in every
# classifier section of this notebook; consider computing the features once.
count = 0  # 1-based progress counter; stays 0 when there is nothing to run
fail = 0   # datasets that raised during feature extraction or classification

f1_list = []   # mean F1 per dataset, successful runs only
acc_list = []  # mean accuracy per dataset, successful runs only

# `clean` (file names) and `DATA_CLEAN` (paths) come from the same directory
# listing and are both sorted, so they are walked in lockstep. enumerate
# supplies the counter, so it advances exactly once per dataset whether the
# run succeeds or fails.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = run_lexiconCombined_tweet(path)
        fe_time = time.perf_counter()

        # Classification; the two results are aggregated with mean/std below,
        # so they are presumably collections of per-fold scores -- confirm
        # against knn.
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none were found): the averages are
    # undefined, so report that instead of dividing by zero.
    print("Overall Accuracy = n/a (no successful runs)")
    print("Overall F1-Score = n/a (no successful runs)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.6122500000000001 (0.03569)
F1-Score: 0.610445144204913 (0.03607)
FE Time: 7.276145935058594 seconds
CL time: 0.37271547317504883 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7940833333333333 (0.09797)
F1-Score: 0.7802699348947214 (0.10737)
FE Time: 1.012803077697754 seconds
CL time: 0.24273085594177246 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.718421052631579 (0.11042)
F1-Score: 0.7082007584772259 (0.11574)
FE Time: 0.9105672836303711 seconds
CL time: 0.21771574020385742 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7575000000000001 (0.10401)
F1-Score: 0.7478600402475473 (0.10690)
FE Time: 0.6663384437561035 seconds
CL time: 0.44191694259643555 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7366666666666667 (0.11149)
F1-Score: 0.716299054

### Logistic Regression

In [6]:
# Benchmark: combined-lexicon features + logistic regression, one run per cleaned dataset.
# For each dataset the cell prints the mean/std of accuracy and F1 plus the
# feature-extraction and classification timings, then prints the aggregate
# over the datasets that completed.
# NOTE(review): run_lexiconCombined_tweet is re-run on the same files in every
# classifier section of this notebook; consider computing the features once.
count = 0  # 1-based progress counter; stays 0 when there is nothing to run
fail = 0   # datasets that raised during feature extraction or classification

f1_list = []   # mean F1 per dataset, successful runs only
acc_list = []  # mean accuracy per dataset, successful runs only

# `clean` (file names) and `DATA_CLEAN` (paths) come from the same directory
# listing and are both sorted, so they are walked in lockstep. enumerate
# supplies the counter, so it advances exactly once per dataset whether the
# run succeeds or fails.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = run_lexiconCombined_tweet(path)
        fe_time = time.perf_counter()

        # Classification; the two results are aggregated with mean/std below,
        # so they are presumably collections of per-fold scores -- confirm
        # against logistic_regression.
        acc_score, f1_score = logistic_regression(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none were found): the averages are
    # undefined, so report that instead of dividing by zero.
    print("Overall Accuracy = n/a (no successful runs)")
    print("Overall F1-Score = n/a (no successful runs)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.63675 (0.02187)
F1-Score: 0.6352209450998888 (0.02178)
FE Time: 5.904351711273193 seconds
CL time: 0.1356492042541504 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7941666666666667 (0.09736)
F1-Score: 0.7776133543220731 (0.10979)
FE Time: 1.416621208190918 seconds
CL time: 0.4709625244140625 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7863157894736842 (0.08111)
F1-Score: 0.7792163633505336 (0.08546)
FE Time: 0.8973715305328369 seconds
CL time: 0.10468530654907227 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8100000000000002 (0.07176)
F1-Score: 0.8012180290071326 (0.07591)
FE Time: 0.4188563823699951 seconds
CL time: 0.07554984092712402 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7566666666666666 (0.12687)
F1-Score: 0.725836615572549 (0.1

### Naive Bayes

In [7]:
# Benchmark: combined-lexicon features + naive Bayes, one run per cleaned dataset.
# For each dataset the cell prints the mean/std of accuracy and F1 plus the
# feature-extraction and classification timings, then prints the aggregate
# over the datasets that completed.
# NOTE(review): run_lexiconCombined_tweet is re-run on the same files in every
# classifier section of this notebook; consider computing the features once.
count = 0  # 1-based progress counter; stays 0 when there is nothing to run
fail = 0   # datasets that raised during feature extraction or classification

f1_list = []   # mean F1 per dataset, successful runs only
acc_list = []  # mean accuracy per dataset, successful runs only

# `clean` (file names) and `DATA_CLEAN` (paths) come from the same directory
# listing and are both sorted, so they are walked in lockstep. enumerate
# supplies the counter, so it advances exactly once per dataset whether the
# run succeeds or fails.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = run_lexiconCombined_tweet(path)
        fe_time = time.perf_counter()

        # Classification; the two results are aggregated with mean/std below,
        # so they are presumably collections of per-fold scores -- confirm
        # against naiveBayes.
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none were found): the averages are
    # undefined, so report that instead of dividing by zero.
    print("Overall Accuracy = n/a (no successful runs)")
    print("Overall F1-Score = n/a (no successful runs)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.62575 (0.03561)
F1-Score: 0.6198780646279514 (0.03592)
FE Time: 5.1436238288879395 seconds
CL time: 0.09915041923522949 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.768 (0.09083)
F1-Score: 0.7596239351917763 (0.09411)
FE Time: 0.9016191959381104 seconds
CL time: 0.06264352798461914 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.773157894736842 (0.08088)
F1-Score: 0.7665503457150264 (0.08296)
FE Time: 0.9031403064727783 seconds
CL time: 0.0705718994140625 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7625 (0.08786)
F1-Score: 0.7488678380019744 (0.09073)
FE Time: 0.7499630451202393 seconds
CL time: 0.05825066566467285 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7428571428571429 (0.14407)
F1-Score: 0.7078005488065724 (0.16096)
FE Time: 0.24127

### Random Forest

In [8]:
# Benchmark: combined-lexicon features + random forest, one run per cleaned dataset.
# For each dataset the cell prints the mean/std of accuracy and F1 plus the
# feature-extraction and classification timings, then prints the aggregate
# over the datasets that completed.
# NOTE(review): run_lexiconCombined_tweet is re-run on the same files in every
# classifier section of this notebook; consider computing the features once.
count = 0  # 1-based progress counter; stays 0 when there is nothing to run
fail = 0   # datasets that raised during feature extraction or classification

f1_list = []   # mean F1 per dataset, successful runs only
acc_list = []  # mean accuracy per dataset, successful runs only

# `clean` (file names) and `DATA_CLEAN` (paths) come from the same directory
# listing and are both sorted, so they are walked in lockstep. enumerate
# supplies the counter, so it advances exactly once per dataset whether the
# run succeeds or fails.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = run_lexiconCombined_tweet(path)
        fe_time = time.perf_counter()

        # Classification; the two results are aggregated with mean/std below,
        # so they are presumably collections of per-fold scores -- confirm
        # against random_forest.
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none were found): the averages are
    # undefined, so report that instead of dividing by zero.
    print("Overall Accuracy = n/a (no successful runs)")
    print("Overall F1-Score = n/a (no successful runs)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.6020000000000001 (0.03160)
F1-Score: 0.6007282983532038 (0.03204)
FE Time: 7.195547103881836 seconds
CL time: 13.09000849723816 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8244166666666668 (0.06443)
F1-Score: 0.8122209300265826 (0.06920)
FE Time: 0.9284203052520752 seconds
CL time: 1.5543668270111084 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7452631578947368 (0.09595)
F1-Score: 0.7353801746312347 (0.10124)
FE Time: 0.9444069862365723 seconds
CL time: 1.472623586654663 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8125 (0.09858)
F1-Score: 0.8007682731112533 (0.10635)
FE Time: 0.4222126007080078 seconds
CL time: 1.7639353275299072 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7673809523809524 (0.10369)
F1-Score: 0.7473124900125674 (0.1105

### SVM

In [9]:
# Benchmark: combined-lexicon features + SVM, one run per cleaned dataset.
# For each dataset the cell prints the mean/std of accuracy and F1 plus the
# feature-extraction and classification timings, then prints the aggregate
# over the datasets that completed.
# NOTE(review): run_lexiconCombined_tweet is re-run on the same files in every
# classifier section of this notebook; consider computing the features once.
count = 0  # 1-based progress counter; stays 0 when there is nothing to run
fail = 0   # datasets that raised during feature extraction or classification

f1_list = []   # mean F1 per dataset, successful runs only
acc_list = []  # mean accuracy per dataset, successful runs only

# `clean` (file names) and `DATA_CLEAN` (paths) come from the same directory
# listing and are both sorted, so they are walked in lockstep. enumerate
# supplies the counter, so it advances exactly once per dataset whether the
# run succeeds or fails.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()

        # Feature Extraction
        result_fe, label, feat_name = run_lexiconCombined_tweet(path)
        fe_time = time.perf_counter()

        # Classification; the two results are aggregated with mean/std below,
        # so they are presumably collections of per-fold scores -- confirm
        # against svm.
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.perf_counter()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and carry on with the rest.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed (all failed, or none were found): the averages are
    # undefined, so report that instead of dividing by zero.
    print("Overall Accuracy = n/a (no successful runs)")
    print("Overall F1-Score = n/a (no successful runs)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.6425000000000001 (0.02244)
F1-Score: 0.6387336679774884 (0.02193)
FE Time: 4.656360626220703 seconds
CL time: 10.215264081954956 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8065 (0.08884)
F1-Score: 0.7925424257444117 (0.09815)
FE Time: 1.1944034099578857 seconds
CL time: 0.1165931224822998 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7913157894736842 (0.08641)
F1-Score: 0.7858678810103464 (0.08903)
FE Time: 0.9177095890045166 seconds
CL time: 0.25054931640625 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7975000000000001 (0.05804)
F1-Score: 0.7874093012238456 (0.05956)
FE Time: 0.43445372581481934 seconds
CL time: 0.0689244270324707 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.746904761904762 (0.14077)
F1-Score: 0.7119120789341378 (0.1564