In [1]:
import os
import h5py
import time
import datetime
import warnings
import numpy as np
from nltk.util import usage

from featureExtraction.pos.postag import *

from classifier.knn.knn import *
from classifier.svm.svm import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Discover the cleaned CSV datasets. `clean` holds the bare file names and
# `DATA_CLEAN` the corresponding relative paths; both are sorted, and since
# every path shares the same prefix the two lists line up index by index.
files_clean = os.listdir("../Dataset/Clean")

clean = sorted(
    name for name in files_clean if name.lower().endswith(".csv")
)

DATA_CLEAN = sorted(
    "../Dataset/Clean/" + name
    for name in files_clean
    if name.lower().endswith(".csv")
)
len_clean = len(DATA_CLEAN)

In [3]:
# Silence every warning from this point on: no category, module or message
# filter is passed, so the "ignore" action applies to all of them.
warnings.filterwarnings("ignore")

### Decision Tree

In [4]:
# Evaluate the decision-tree classifier on POS-tag features for every cleaned
# dataset, then report the scores averaged over the datasets that completed.
fail = 0

f1_list = []
acc_list = []

# enumerate/zip keep the progress counter and the displayed file name in step
# with the path being processed, whether or not the iteration fails.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = run_postag(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the dataset that failed and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded, so there is nothing to average; dividing by
    # len(acc_list) == 0 would raise ZeroDivisionError and hide the summary.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.5475 (0.04518)
F1-Score: 0.5418165518700222 (0.04567)
FE Time: 0.3095066547393799 seconds
CL time: 2.2659683227539062 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5828333333333335 (0.07726)
F1-Score: 0.3864599544169477 (0.05357)
FE Time: 0.0673835277557373 seconds
CL time: 1.649836540222168 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5071052631578947 (0.11886)
F1-Score: 0.49495237228073075 (0.12156)
FE Time: 0.06523251533508301 seconds
CL time: 0.04815340042114258 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.4025 (0.09284)
F1-Score: 0.3490641967648167 (0.10190)
FE Time: 0.03553318977355957 seconds
CL time: 0.04305577278137207 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5433333333333333 (0.09171)
F1-Score: 0.510899784956522 (0.10432)
FE 

### KNN

In [3]:
# Evaluate the KNN classifier on POS-tag features for every cleaned dataset,
# then report the scores averaged over the datasets that completed.
fail = 0

f1_list = []
acc_list = []

# enumerate/zip keep the progress counter and the displayed file name in step
# with the path being processed, whether or not the iteration fails.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = run_postag(i)
        print("Completed Feature Extraction")
        fe_time = time.time()

        # classification
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the dataset that failed and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded, so there is nothing to average; dividing by
    # len(acc_list) == 0 would raise ZeroDivisionError and hide the summary.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Completed Feature Extraction
Accuracy: 0.49024999999999996 (0.03723)
F1-Score: 0.359570440922225 (0.04839)
FE Time: 0.38901376724243164 seconds
CL time: 2.7114827632904053 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Completed Feature Extraction
Accuracy: 0.5668333333333335 (0.10471)
F1-Score: 0.37009581386465074 (0.04432)
FE Time: 0.0672917366027832 seconds
CL time: 1.397386074066162 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Completed Feature Extraction
Accuracy: 0.5418421052631579 (0.11405)
F1-Score: 0.5033958971722803 (0.11081)
FE Time: 0.0732581615447998 seconds
CL time: 0.1365375518798828 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Completed Feature Extraction
Accuracy: 0.51 (0.10440)
F1-Score: 0.4233413322117263 (0.09478)
FE Time: 0.03905653953552246 seconds
CL time: 0.11942291259765625 seconds
Running 5/13 (templated_posneg_dataset_tw

### Logistic Regression

In [4]:
# Evaluate the logistic-regression classifier on POS-tag features for every
# cleaned dataset, then report the scores averaged over the datasets that
# completed.
fail = 0

f1_list = []
acc_list = []

# enumerate/zip keep the progress counter and the displayed file name in step
# with the path being processed, whether or not the iteration fails.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = run_postag(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = logistic_regression(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the dataset that failed and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded, so there is nothing to average; dividing by
    # len(acc_list) == 0 would raise ZeroDivisionError and hide the summary.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.5487499999999998 (0.04502)
F1-Score: 0.5430670405700899 (0.04551)
FE Time: 0.3917417526245117 seconds
CL time: 2.554812431335449 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5849166666666668 (0.08580)
F1-Score: 0.36720500829936 (0.03426)
FE Time: 0.06420159339904785 seconds
CL time: 1.5445725917816162 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5011842105263158 (0.09763)
F1-Score: 0.48267781131039983 (0.10357)
FE Time: 0.06491875648498535 seconds
CL time: 0.053611040115356445 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.3875 (0.09858)
F1-Score: 0.33826839476235016 (0.10458)
FE Time: 0.03560614585876465 seconds
CL time: 0.048681020736694336 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5302380952380953 (0.12259)
F1-Score: 0.481237099974329

### Naive Bayes

In [5]:
# Evaluate the naive-Bayes classifier on POS-tag features for every cleaned
# dataset, then report the scores averaged over the datasets that completed.
fail = 0

f1_list = []
acc_list = []

# enumerate/zip keep the progress counter and the displayed file name in step
# with the path being processed, whether or not the iteration fails.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = run_postag(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the dataset that failed and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded, so there is nothing to average; dividing by
    # len(acc_list) == 0 would raise ZeroDivisionError and hide the summary.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.5309999999999999 (0.04288)
F1-Score: 0.4764127267250574 (0.04869)
FE Time: 0.3425314426422119 seconds
CL time: 0.06447529792785645 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5185000000000002 (0.10855)
F1-Score: 0.37268238523462466 (0.05336)
FE Time: 0.06492328643798828 seconds
CL time: 0.041109323501586914 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.4863157894736843 (0.09204)
F1-Score: 0.32645630620255206 (0.04099)
FE Time: 0.060562849044799805 seconds
CL time: 0.041188955307006836 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.4875 (0.11054)
F1-Score: 0.37258492918854175 (0.09303)
FE Time: 0.03478193283081055 seconds
CL time: 0.03747892379760742 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.46880952380952384 (0.13616)
F1-Score: 0.3245179

### Random Forest

In [6]:
# Evaluate the random-forest classifier on POS-tag features for every cleaned
# dataset, then report the scores averaged over the datasets that completed.
fail = 0

f1_list = []
acc_list = []

# enumerate/zip keep the progress counter and the displayed file name in step
# with the path being processed, whether or not the iteration fails.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = run_postag(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the dataset that failed and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded, so there is nothing to average; dividing by
    # len(acc_list) == 0 would raise ZeroDivisionError and hide the summary.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.548 (0.04493)
F1-Score: 0.5423092849664611 (0.04549)
FE Time: 0.3462026119232178 seconds
CL time: 1.154846429824829 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5788333333333334 (0.08134)
F1-Score: 0.37827578500019315 (0.05560)
FE Time: 0.12314629554748535 seconds
CL time: 0.9825313091278076 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5146052631578948 (0.10995)
F1-Score: 0.5037327522334085 (0.11341)
FE Time: 0.06171393394470215 seconds
CL time: 0.865943431854248 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.4 (0.08944)
F1-Score: 0.33639427579759795 (0.07949)
FE Time: 0.04565072059631348 seconds
CL time: 0.9279870986938477 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5702380952380952 (0.11279)
F1-Score: 0.53880477856972 (0.12661)
FE Time: 

### SVM

In [7]:
# Evaluate the SVM classifier on POS-tag features for every cleaned dataset,
# then report the scores averaged over the datasets that completed.
fail = 0

f1_list = []
acc_list = []

# enumerate/zip keep the progress counter and the displayed file name in step
# with the path being processed, whether or not the iteration fails.
for count, (name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = run_postag(i)
        fe_time = time.time()

        # classification
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the dataset that failed and carry on with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded, so there is nothing to average; dividing by
    # len(acc_list) == 0 would raise ZeroDivisionError and hide the summary.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.5377500000000001 (0.03659)
F1-Score: 0.473631248240962 (0.05555)
FE Time: 0.338533878326416 seconds
CL time: 2.348036527633667 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.5869166666666669 (0.08566)
F1-Score: 0.3680062903506421 (0.03424)
FE Time: 0.06547355651855469 seconds
CL time: 0.06646609306335449 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.46381578947368424 (0.09473)
F1-Score: 0.4071902322248472 (0.10107)
FE Time: 0.06041383743286133 seconds
CL time: 0.06249713897705078 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.3975000000000001 (0.07980)
F1-Score: 0.31288234637708795 (0.08113)
FE Time: 0.03477144241333008 seconds
CL time: 0.05995321273803711 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5671428571428571 (0.11274)
F1-Score: 0.5268