In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np

from featureExtraction.pos.postag import *
from featureExtraction.lexicon.lexicon import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Directory that holds the cleaned datasets. It is listed exactly once so that
# the display names (`clean`) and the full paths (`DATA_CLEAN`) are guaranteed
# to describe the same files in the same order.
CLEAN_DIR = "../Dataset/Clean"

# File names only (used for progress messages), case-insensitive ".csv" match.
clean = sorted(x for x in os.listdir(CLEAN_DIR) if x.lower().endswith(".csv"))

# Full relative paths, index-aligned with `clean`. Every path shares the same
# prefix, so the sorted order of the names is also the sorted order of the paths.
DATA_CLEAN = [CLEAN_DIR + "/" + x for x in clean]
len_clean = len(DATA_CLEAN)

In [3]:
warnings.filterwarnings("ignore")

In [4]:
# Evaluate logistic regression on every cleaned dataset: build POS-tag + lexicon
# features, score with 20-fold cross-validation, print per-dataset results and
# finally the mean/std of the per-dataset means.
fail = 0

f1_list = []
acc_list = []

for count, data_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is taken from the path itself, so it cannot drift out
        # of step with the file that is actually being processed.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(data_path)))
        start_time = time.time()

        # Feature Extraction: both extractors read the same file; their feature
        # matrices are joined column-wise (axis=1).
        result_fe1, label = run_postag(data_path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(data_path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # prepare classifier (fixed seeds keep folds and model reproducible)
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Run the cross-validation once per metric.
        # NOTE(review): the figure printed as "Accuracy" is scored with
        # 'f1_micro'; this coincides with accuracy only for single-label
        # targets -- confirm that holds for every dataset.
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report and count the failure, then continue with the
        # next dataset instead of aborting the whole run.
        fail += 1
        print(data_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed, so there is nothing to average; avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.6487499999999999 (0.02801)
F1-Score: 0.6471996859648425 (0.02757)
FE Time: 3.691763162612915 seconds
CL time: 2.012580633163452 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.79025 (0.08448)
F1-Score: 0.7697254019509882 (0.09886)
FE Time: 0.7518830299377441 seconds
CL time: 1.4319097995758057 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7914473684210526 (0.08472)
F1-Score: 0.7858553478093625 (0.08666)
FE Time: 0.7464559078216553 seconds
CL time: 0.06419754028320312 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8150000000000001 (0.07762)
F1-Score: 0.8076184948622019 (0.08168)
FE Time: 0.3769872188568115 seconds
CL time: 0.08225035667419434 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7395238095238096 (0.12529)
F1-Score: 0.7067582990325956 (0.

### Decision Tree

In [4]:
# Evaluate the decision tree classifier on every cleaned dataset: build
# POS-tag + lexicon features, classify, print per-dataset results and finally
# the mean/std of the per-dataset means.
fail = 0

f1_list = []
acc_list = []

for count, data_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is taken from the path itself, so it cannot drift out
        # of step with the file that is actually being processed.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(data_path)))
        start_time = time.time()

        # Feature Extraction: both extractors read the same file; their feature
        # matrices are joined column-wise (axis=1).
        result_fe1, label = run_postag(data_path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(data_path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): decision_tree is a project helper; presumably it returns
        # per-fold accuracy and F1 scores -- confirm against its definition.
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report and count the failure, then continue with the
        # next dataset instead of aborting the whole run.
        fail += 1
        print(data_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed, so there is nothing to average; avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.626 (0.02909)
F1-Score: 0.6247927878595851 (0.02909)
FE Time: 8.293925762176514 seconds
CL time: 5.5508105754852295 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.77575 (0.08725)
F1-Score: 0.7644943513278182 (0.09095)
FE Time: 1.5935900211334229 seconds
CL time: 5.102277755737305 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7077631578947366 (0.07372)
F1-Score: 0.6976755216486001 (0.07623)
FE Time: 1.5036542415618896 seconds
CL time: 1.4538118839263916 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7825 (0.07949)
F1-Score: 0.7681906308242891 (0.08769)
FE Time: 0.5744435787200928 seconds
CL time: 2.2522377967834473 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7707142857142858 (0.09464)
F1-Score: 0.7490300569313727 (0.10119)
FE Time: 0.605607748

### KNN

In [5]:
# Evaluate the KNN classifier on every cleaned dataset: build POS-tag + lexicon
# features, classify, print per-dataset results and finally the mean/std of the
# per-dataset means.
fail = 0

f1_list = []
acc_list = []

for count, data_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is taken from the path itself, so it cannot drift out
        # of step with the file that is actually being processed.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(data_path)))
        start_time = time.time()

        # Feature Extraction: both extractors read the same file; their feature
        # matrices are joined column-wise (axis=1).
        result_fe1, label = run_postag(data_path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(data_path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): knn is a project helper; presumably it returns per-fold
        # accuracy and F1 scores -- confirm against its definition.
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report and count the failure, then continue with the
        # next dataset instead of aborting the whole run.
        fail += 1
        print(data_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed, so there is nothing to average; avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.6220000000000001 (0.03172)
F1-Score: 0.6204842982489156 (0.03182)
FE Time: 7.49535870552063 seconds
CL time: 0.7444133758544922 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.79425 (0.07770)
F1-Score: 0.778880342113837 (0.08721)
FE Time: 0.9487624168395996 seconds
CL time: 0.19335126876831055 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7109210526315788 (0.09609)
F1-Score: 0.7019962368399909 (0.09995)
FE Time: 0.9087815284729004 seconds
CL time: 0.15555381774902344 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7775000000000001 (0.09010)
F1-Score: 0.7687480936545247 (0.09310)
FE Time: 0.4426727294921875 seconds
CL time: 0.21030974388122559 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7328571428571429 (0.13064)
F1-Score: 0.712081001085258 (0.1

### Naive Bayes

In [6]:
# Evaluate the Naive Bayes classifier on every cleaned dataset: build
# POS-tag + lexicon features, classify, print per-dataset results and finally
# the mean/std of the per-dataset means.
fail = 0

f1_list = []
acc_list = []

for count, data_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is taken from the path itself, so it cannot drift out
        # of step with the file that is actually being processed.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(data_path)))
        start_time = time.time()

        # Feature Extraction: both extractors read the same file; their feature
        # matrices are joined column-wise (axis=1).
        result_fe1, label = run_postag(data_path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(data_path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): naiveBayes is a project helper; presumably it returns
        # per-fold accuracy and F1 scores -- confirm against its definition.
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report and count the failure, then continue with the
        # next dataset instead of aborting the whole run.
        fail += 1
        print(data_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed, so there is nothing to average; avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.628 (0.03396)
F1-Score: 0.6193680832879811 (0.03472)
FE Time: 10.704725980758667 seconds
CL time: 0.18944811820983887 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7334166666666666 (0.11481)
F1-Score: 0.7110776714971324 (0.14481)
FE Time: 1.6444334983825684 seconds
CL time: 0.06126904487609863 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5214473684210528 (0.09539)
F1-Score: 0.3805341281252016 (0.09737)
FE Time: 1.676863670349121 seconds
CL time: 0.5208606719970703 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7200000000000001 (0.08276)
F1-Score: 0.7023391130719338 (0.08138)
FE Time: 0.9655625820159912 seconds
CL time: 0.45711231231689453 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5092857142857142 (0.14120)
F1-Score: 0.39858778754710233 (0

### Random Forest

In [7]:
# Evaluate the random forest classifier on every cleaned dataset: build
# POS-tag + lexicon features, classify, print per-dataset results and finally
# the mean/std of the per-dataset means.
fail = 0

f1_list = []
acc_list = []

for count, data_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is taken from the path itself, so it cannot drift out
        # of step with the file that is actually being processed.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(data_path)))
        start_time = time.time()

        # Feature Extraction: both extractors read the same file; their feature
        # matrices are joined column-wise (axis=1).
        result_fe1, label = run_postag(data_path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(data_path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): random_forest is a project helper; presumably it returns
        # per-fold accuracy and F1 scores -- confirm against its definition.
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report and count the failure, then continue with the
        # next dataset instead of aborting the whole run.
        fail += 1
        print(data_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed, so there is nothing to average; avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.642 (0.03760)
F1-Score: 0.6406188603372784 (0.03780)
FE Time: 10.034541845321655 seconds
CL time: 6.743590354919434 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7944166666666665 (0.08214)
F1-Score: 0.7788920125551773 (0.08567)
FE Time: 1.819450855255127 seconds
CL time: 2.591477632522583 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7557894736842105 (0.06916)
F1-Score: 0.7481454759469802 (0.07563)
FE Time: 2.000598430633545 seconds
CL time: 6.078983783721924 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8400000000000001 (0.08746)
F1-Score: 0.8314117557914334 (0.09334)
FE Time: 0.9047918319702148 seconds
CL time: 6.073939085006714 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.776904761904762 (0.10706)
F1-Score: 0.7527754146008789 (0.12408)
FE

### SVM

In [4]:
# Evaluate the SVM classifier on every cleaned dataset: build POS-tag + lexicon
# features, classify, print per-dataset results and finally the mean/std of the
# per-dataset means.
fail = 0

f1_list = []
acc_list = []

for count, data_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is taken from the path itself, so it cannot drift out
        # of step with the file that is actually being processed.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(data_path)))
        start_time = time.time()

        # Feature Extraction: both extractors read the same file; their feature
        # matrices are joined column-wise (axis=1).
        result_fe1, label = run_postag(data_path)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(data_path)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): svm is a project helper; presumably it returns per-fold
        # accuracy and F1 scores -- confirm against its definition.
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report and count the failure, then continue with the
        # next dataset instead of aborting the whole run.
        fail += 1
        print(data_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed, so there is nothing to average; avoid dividing by zero.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.647 (0.02826)
F1-Score: 0.6445357873979338 (0.02756)
FE Time: 12.437442779541016 seconds
CL time: 27.58979368209839 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7923333333333333 (0.07761)
F1-Score: 0.7722007760324374 (0.09321)
FE Time: 1.3148958683013916 seconds
CL time: 3.828001022338867 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7938157894736843 (0.09290)
F1-Score: 0.7887790079325111 (0.09569)
FE Time: 1.9967560768127441 seconds
CL time: 0.4114418029785156 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7875 (0.07049)
F1-Score: 0.7781262776604848 (0.07263)
FE Time: 0.7001283168792725 seconds
CL time: 0.17658758163452148 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7602380952380952 (0.13229)
F1-Score: 0.724875924728866 (0.15243)
FE Time: 