In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np

from featureExtraction.tf.tf import *
from featureExtraction.pos.postag import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Discover the cleaned datasets. The directory is listed exactly once and the
# relative paths are derived from the sorted file names, so clean[k] and
# DATA_CLEAN[k] are guaranteed to refer to the same file (the previous version
# scanned the directory twice and sorted the two lists independently).
clean = sorted(
    name
    for name in os.listdir("../Dataset/Clean")
    if name.lower().endswith(".csv")  # case-insensitive match on the extension
)

# Same order as `clean`: every path shares the same prefix, so sorting the
# names and sorting the prefixed paths yield identical orderings.
DATA_CLEAN = ["../Dataset/Clean/" + name for name in clean]
len_clean = len(DATA_CLEAN)

In [3]:
# Install a global "ignore" filter: from here on no warning of any category is
# shown in this kernel session (presumably to keep the benchmark logs compact).
# NOTE(review): this also silences warnings that can signal unreliable scores
# (e.g. solver non-convergence or undefined-metric warnings) — consider
# restricting the filter to specific categories.
warnings.filterwarnings("ignore")

### Logistic Regression

In [4]:
# Benchmark: logistic regression on term-frequency + POS-tag features.
# Each cleaned dataset is evaluated with 20-fold cross-validation. A dataset
# that raises is reported and counted in `fail` instead of aborting the run.
fail = 0

f1_list = []   # mean macro-F1 per successfully evaluated dataset
acc_list = []  # mean "accuracy" (micro-F1) per successfully evaluated dataset

# Neither the fold splitter nor the estimator depends on the dataset, so both
# are created once. Fixed seeds keep the folds and the fit reproducible.
cv = KFold(n_splits=20, random_state=1, shuffle=True)
logRes = LogisticRegression(random_state=1, max_iter=10000)

# Pair every file name with its path directly instead of indexing `clean`
# through a counter that had to be incremented in both the try and the except.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature extraction: both extractors return (features, labels); the
        # label vector from the second call is the one that is kept.
        result_fe1, label = term_freq(path)
        result_fe2, label = run_postag(path)
        # Column-wise join, so both feature matrices must have the same
        # number of rows.
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Cross-validated scores, one value per fold. Micro-averaged F1 is
        # reported under the "Accuracy" label (the two coincide for
        # single-label classification).
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best effort: log the failing dataset and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): averaging the empty lists
    # would raise ZeroDivisionError and hide the "Failed" line below.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9787500000000001 (0.00850)
F1-Score: 0.9786409881172666 (0.00853)
FE Time: 0.44730615615844727 seconds
CL time: 15.431546926498413 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.9010000000000001 (0.05937)
F1-Score: 0.8921733218402592 (0.06219)
FE Time: 0.08005595207214355 seconds
CL time: 1.6820940971374512 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8768421052631581 (0.06072)
F1-Score: 0.8719580821468359 (0.06373)
FE Time: 0.07327938079833984 seconds
CL time: 0.6405124664306641 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8450000000000001 (0.06690)
F1-Score: 0.8348720728068775 (0.07404)
FE Time: 0.04596757888793945 seconds
CL time: 0.4196946620941162 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8345238095238097 (0.08953)
F1-Score: 0.82023

### Decision Tree

In [4]:
# Benchmark: decision tree on term-frequency + POS-tag features.
# A dataset that raises is reported and counted in `fail` instead of aborting
# the whole run.
fail = 0

f1_list = []   # mean F1 per successfully evaluated dataset
acc_list = []  # mean accuracy per successfully evaluated dataset

# Pair every file name with its path directly instead of indexing `clean`
# through a counter that had to be incremented in both the try and the except.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature extraction: both extractors return (features, labels); the
        # label vector from the second call is the one that is kept.
        result_fe1, label = term_freq(path)
        result_fe2, label = run_postag(path)
        # Column-wise join, so both feature matrices must have the same
        # number of rows.
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification via the project's decision_tree helper, which returns
        # two score collections (presumably one value per CV fold — confirm
        # against the helper's implementation).
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best effort: log the failing dataset and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): averaging the empty lists
    # would raise ZeroDivisionError and hide the "Failed" line below.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9755 (0.01139)
F1-Score: 0.9753869538280554 (0.01143)
FE Time: 0.5096139907836914 seconds
CL time: 12.551905632019043 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8247499999999999 (0.07517)
F1-Score: 0.8082863685740884 (0.08618)
FE Time: 0.09727358818054199 seconds
CL time: 2.004635810852051 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.763157894736842 (0.09109)
F1-Score: 0.7508710696668361 (0.08915)
FE Time: 0.08187246322631836 seconds
CL time: 0.5084311962127686 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7249999999999999 (0.12500)
F1-Score: 0.7113834313360309 (0.12631)
FE Time: 0.043851375579833984 seconds
CL time: 0.29366493225097656 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7471428571428571 (0.12065)
F1-Score: 0.7280410439233969 (

### KNN

In [5]:
# Benchmark: k-nearest neighbours on term-frequency + POS-tag features.
# A dataset that raises is reported and counted in `fail` instead of aborting
# the whole run.
fail = 0

f1_list = []   # mean F1 per successfully evaluated dataset
acc_list = []  # mean accuracy per successfully evaluated dataset

# Pair every file name with its path directly instead of indexing `clean`
# through a counter that had to be incremented in both the try and the except.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature extraction: both extractors return (features, labels); the
        # label vector from the second call is the one that is kept.
        result_fe1, label = term_freq(path)
        result_fe2, label = run_postag(path)
        # Column-wise join, so both feature matrices must have the same
        # number of rows.
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification via the project's knn helper, which returns two score
        # collections (presumably one value per CV fold — confirm against the
        # helper's implementation).
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best effort: log the failing dataset and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): averaging the empty lists
    # would raise ZeroDivisionError and hide the "Failed" line below.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9047500000000002 (0.01984)
F1-Score: 0.9039464368537387 (0.01976)
FE Time: 0.510148286819458 seconds
CL time: 6.837847471237183 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6953333333333334 (0.08239)
F1-Score: 0.6878492639677096 (0.08367)
FE Time: 0.09637641906738281 seconds
CL time: 1.3796477317810059 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6317105263157894 (0.15955)
F1-Score: 0.6199782610600688 (0.16420)
FE Time: 0.09678792953491211 seconds
CL time: 2.033745288848877 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.545 (0.14221)
F1-Score: 0.4250025222904402 (0.13557)
FE Time: 0.04660367965698242 seconds
CL time: 1.3841753005981445 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7052380952380954 (0.10454)
F1-Score: 0.6837189573412484 (0.11

### Naive Bayes

In [6]:
# Benchmark: naive Bayes on term-frequency + POS-tag features.
# A dataset that raises is reported and counted in `fail` instead of aborting
# the whole run.
fail = 0

f1_list = []   # mean F1 per successfully evaluated dataset
acc_list = []  # mean accuracy per successfully evaluated dataset

# Pair every file name with its path directly instead of indexing `clean`
# through a counter that had to be incremented in both the try and the except.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature extraction: both extractors return (features, labels); the
        # label vector from the second call is the one that is kept.
        result_fe1, label = term_freq(path)
        result_fe2, label = run_postag(path)
        # Column-wise join, so both feature matrices must have the same
        # number of rows.
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification via the project's naiveBayes helper, which returns
        # two score collections (presumably one value per CV fold — confirm
        # against the helper's implementation).
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best effort: log the failing dataset and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): averaging the empty lists
    # would raise ZeroDivisionError and hide the "Failed" line below.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.83475 (0.02502)
F1-Score: 0.8334313731804166 (0.02489)
FE Time: 0.5073089599609375 seconds
CL time: 3.4255285263061523 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.80275 (0.10095)
F1-Score: 0.7934520129232218 (0.10509)
FE Time: 0.08710002899169922 seconds
CL time: 0.34117984771728516 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8467105263157896 (0.09299)
F1-Score: 0.8413677211597523 (0.09681)
FE Time: 0.08346748352050781 seconds
CL time: 0.353043794631958 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8200000000000001 (0.06964)
F1-Score: 0.8050340963990902 (0.07607)
FE Time: 0.04442238807678223 seconds
CL time: 0.2788581848144531 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7607142857142858 (0.11597)
F1-Score: 0.7437403917723444 (0.12161)
F

### Random Forest

In [7]:
# Benchmark: random forest on term-frequency + POS-tag features.
# A dataset that raises is reported and counted in `fail` instead of aborting
# the whole run.
fail = 0

f1_list = []   # mean F1 per successfully evaluated dataset
acc_list = []  # mean accuracy per successfully evaluated dataset

# Pair every file name with its path directly instead of indexing `clean`
# through a counter that had to be incremented in both the try and the except.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature extraction: both extractors return (features, labels); the
        # label vector from the second call is the one that is kept.
        result_fe1, label = term_freq(path)
        result_fe2, label = run_postag(path)
        # Column-wise join, so both feature matrices must have the same
        # number of rows.
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification via the project's random_forest helper, which returns
        # two score collections (presumably one value per CV fold — confirm
        # against the helper's implementation).
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best effort: log the failing dataset and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): averaging the empty lists
    # would raise ZeroDivisionError and hide the "Failed" line below.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9795000000000001 (0.00907)
F1-Score: 0.9794009349253194 (0.00912)
FE Time: 0.5557408332824707 seconds
CL time: 26.305203676223755 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8914166666666666 (0.06981)
F1-Score: 0.8805776549097581 (0.07610)
FE Time: 0.08820223808288574 seconds
CL time: 1.0927824974060059 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8492105263157894 (0.09056)
F1-Score: 0.8449413618735703 (0.09059)
FE Time: 0.08366179466247559 seconds
CL time: 1.1168861389160156 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.785 (0.07762)
F1-Score: 0.769068489992913 (0.08709)
FE Time: 0.04618644714355469 seconds
CL time: 1.0389816761016846 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8009523809523811 (0.07387)
F1-Score: 0.7824004182783459 (0.

### SVM

In [4]:
# Benchmark: support vector machine on term-frequency + POS-tag features.
# A dataset that raises is reported and counted in `fail` instead of aborting
# the whole run.
fail = 0

f1_list = []   # mean F1 per successfully evaluated dataset
acc_list = []  # mean accuracy per successfully evaluated dataset

# Pair every file name with its path directly instead of indexing `clean`
# through a counter that had to be incremented in both the try and the except.
for count, (name, path) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, name))
        start_time = time.time()

        # Feature extraction: both extractors return (features, labels); the
        # label vector from the second call is the one that is kept.
        result_fe1, label = term_freq(path)
        result_fe2, label = run_postag(path)
        # Column-wise join, so both feature matrices must have the same
        # number of rows.
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification via the project's svm helper, which returns two score
        # collections (presumably one value per CV fold — confirm against the
        # helper's implementation).
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Deliberate best effort: log the failing dataset and keep going.
        fail += 1
        print(path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): averaging the empty lists
    # would raise ZeroDivisionError and hide the "Failed" line below.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9727499999999999 (0.01078)
F1-Score: 0.9726100617417431 (0.01083)
FE Time: 0.48751187324523926 seconds
CL time: 93.9951982498169 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8789166666666668 (0.05982)
F1-Score: 0.8678508537690194 (0.06433)
FE Time: 0.09573149681091309 seconds
CL time: 2.1758673191070557 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8793421052631579 (0.07856)
F1-Score: 0.8737233156383108 (0.08423)
FE Time: 0.08268618583679199 seconds
CL time: 2.0494720935821533 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8275 (0.06610)
F1-Score: 0.8111046153966303 (0.07977)
FE Time: 0.048306941986083984 seconds
CL time: 0.9274873733520508 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8278571428571428 (0.08937)
F1-Score: 0.81627854859279 (0.