In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

from featureExtraction.pos.postag import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_validate

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Discover the cleaned datasets once. The directory is listed a single time so
# that the bare file names (`clean`) and the relative paths (`DATA_CLEAN`) are
# guaranteed to describe the same files in the same order.
clean = sorted(
    name for name in os.listdir("../Dataset/Clean")
    if name.lower().endswith(".csv")
)

# Same files as `clean`, prefixed with the dataset directory.
DATA_CLEAN = sorted("../Dataset/Clean/" + name for name in clean)
len_clean = len(DATA_CLEAN)

In [3]:
def word2vec_sg(filepath, hs, neg, vector_size):
    """Build one mean Word2Vec vector per row of a ';'-delimited CSV.

    Args:
        filepath: Path to a CSV with a header row and at least the columns
            'teks' (space-separated tokens) and 'label'.
        hs: When equal to 1 the '..._hs_...' model file is loaded; any other
            value loads the '..._neg_...' model file.
        neg: Accepted for interface compatibility; not used by this function.
        vector_size: Used both to pick the model file name and as the length
            of the all-zero fallback vector.

    Returns:
        A tuple ``(features, labels)`` where ``features`` is a NumPy array
        with one row per CSV row (after rows containing missing values are
        dropped) and ``labels`` is the 'label' column as a list.

    Tokens that are not in the model vocabulary (including empty strings
    produced by consecutive spaces) are skipped instead of raising KeyError.
    A row with no known token gets an all-zero vector.
    """
    df = pd.read_csv(filepath, delimiter=";", low_memory=False, header=0)
    df = df.dropna(axis=0)
    model_path = './featureExtraction/word2vec/models_all/model_sg_{}_{}.model'.format('hs' if hs == 1 else 'neg', vector_size)
    keyed_vectors = Word2Vec.load(model_path).wv

    word2vec_arr = []
    for text in df['teks'].tolist():
        # Keep only tokens the model knows; an unknown token would otherwise
        # abort feature extraction for the entire dataset.
        known_vectors = [
            keyed_vectors[token]
            for token in text.split(" ")
            if token in keyed_vectors
        ]
        if known_vectors:
            row_mean_vector = np.mean(known_vectors, axis=0).tolist()
        else:
            # Explicit fallback instead of relying on np.mean([]) returning NaN.
            row_mean_vector = [0.0] * vector_size
        word2vec_arr.append(row_mean_vector)

    return np.array(word2vec_arr), df['label'].tolist()

In [4]:
warnings.filterwarnings("ignore")

In [5]:
# Logistic Regression: 20-fold cross-validation on POS-tag + Word2Vec features
# for every cleaned dataset, followed by an overall summary.
fail = 0

# Per-dataset mean scores, aggregated in the summary below.
f1_list = []
acc_list = []

# enumerate supplies the 1-based progress counter, so it advances exactly once
# per dataset regardless of whether the run succeeds or fails.
for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is derived from the path itself instead of indexing
        # a separately built list of file names.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction: POS-tag features joined column-wise with the mean
        # Word2Vec vector (hs=1 selects the 'hs' model, 300 dimensions).
        # NOTE(review): assumes run_postag and word2vec_sg yield the same rows
        # in the same order; the labels of the second call are the ones used.
        result_fe1, label = run_postag(i)
        result_fe2, label = word2vec_sg(i, 1, 0, 300)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # prepare classifier
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Run the cross-validation once and collect both metrics from the same
        # fitted folds (previously every fold was fitted twice, once per metric).
        # 'f1_micro' is reported as accuracy; for single-label classification
        # the two coincide.
        cv_scores = cross_validate(
            logRes, result_fe, label,
            scoring=('f1_micro', 'f1_macro'), cv=cv, n_jobs=-1,
        )
        acc_score = cv_scores['test_f1_micro']
        f1_score = cv_scores['test_f1_macro']
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when no dataset produced a score.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9557499999999998 (0.01477)
F1-Score: 0.9555818190730581 (0.01483)
FE Time: 0.6767807006835938 seconds
CL time: 2.7934324741363525 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8324166666666667 (0.06595)
F1-Score: 0.819533760959354 (0.06922)
FE Time: 0.21513700485229492 seconds
CL time: 1.606637716293335 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8688157894736843 (0.07407)
F1-Score: 0.8643757973230312 (0.07656)
FE Time: 0.18947267532348633 seconds
CL time: 0.11362838745117188 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8400000000000001 (0.07681)
F1-Score: 0.8331269512458098 (0.08076)
FE Time: 0.15558147430419922 seconds
CL time: 0.11304736137390137 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7700000000000001 (0.10535)
F1-Score: 0.755003

### Decision Tree

In [5]:
# Decision Tree: evaluate POS-tag + Word2Vec features on every cleaned dataset,
# then print an overall summary.
fail = 0

# Per-dataset mean scores, aggregated in the summary below.
f1_list = []
acc_list = []

# enumerate supplies the 1-based progress counter, so it advances exactly once
# per dataset regardless of whether the run succeeds or fails.
for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is derived from the path itself instead of indexing
        # a separately built list of file names.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction: POS-tag features joined column-wise with the mean
        # Word2Vec vector (hs=1 selects the 'hs' model, 300 dimensions).
        # NOTE(review): assumes run_postag and word2vec_sg yield the same rows
        # in the same order; the labels of the second call are the ones used.
        result_fe1, label = run_postag(i)
        result_fe2, label = word2vec_sg(i, 1, 0, 300)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): decision_tree presumably returns per-fold accuracy and
        # F1 scores; confirm against classifier.decisionTree.
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when no dataset produced a score.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.8935000000000001 (0.02056)
F1-Score: 0.8930908266474287 (0.02070)
FE Time: 1.9831104278564453 seconds
CL time: 13.210613250732422 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6938333333333333 (0.08266)
F1-Score: 0.6785564265708348 (0.08549)
FE Time: 0.5176265239715576 seconds
CL time: 5.220668077468872 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7198684210526316 (0.11709)
F1-Score: 0.7094012053689669 (0.11827)
FE Time: 0.33968591690063477 seconds
CL time: 0.9961934089660645 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.745 (0.07890)
F1-Score: 0.7322348015225927 (0.08050)
FE Time: 0.388683557510376 seconds
CL time: 0.6362056732177734 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6169047619047618 (0.12105)
F1-Score: 0.6005622109490396 (0.116

### KNN

In [5]:
# KNN: evaluate POS-tag + Word2Vec features on every cleaned dataset, then
# print an overall summary.
fail = 0

# Per-dataset mean scores, aggregated in the summary below.
f1_list = []
acc_list = []

# enumerate supplies the 1-based progress counter, so it advances exactly once
# per dataset regardless of whether the run succeeds or fails.
for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is derived from the path itself instead of indexing
        # a separately built list of file names.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction: POS-tag features joined column-wise with the mean
        # Word2Vec vector (hs=1 selects the 'hs' model, 300 dimensions).
        # NOTE(review): assumes run_postag and word2vec_sg yield the same rows
        # in the same order; the labels of the second call are the ones used.
        result_fe1, label = run_postag(i)
        result_fe2, label = word2vec_sg(i, 1, 0, 300)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): knn presumably returns per-fold accuracy and F1
        # scores; confirm against classifier.knn.
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when no dataset produced a score.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9380000000000001 (0.01427)
F1-Score: 0.937741222214191 (0.01426)
FE Time: 0.9054896831512451 seconds
CL time: 3.4062535762786865 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.73975 (0.07356)
F1-Score: 0.7264959319941416 (0.07538)
FE Time: 0.27251482009887695 seconds
CL time: 1.9741060733795166 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7580263157894735 (0.07445)
F1-Score: 0.7507885536712432 (0.07872)
FE Time: 0.37398672103881836 seconds
CL time: 0.31003642082214355 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7325000000000002 (0.10158)
F1-Score: 0.7103958282082167 (0.11293)
FE Time: 0.31185364723205566 seconds
CL time: 0.2975809574127197 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7026190476190476 (0.11158)
F1-Score: 0.6786851498604594 

### Naive Bayes

In [6]:
# Naive Bayes: evaluate POS-tag + Word2Vec features on every cleaned dataset,
# then print an overall summary.
fail = 0

# Per-dataset mean scores, aggregated in the summary below.
f1_list = []
acc_list = []

# enumerate supplies the 1-based progress counter, so it advances exactly once
# per dataset regardless of whether the run succeeds or fails.
for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is derived from the path itself instead of indexing
        # a separately built list of file names.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction: POS-tag features joined column-wise with the mean
        # Word2Vec vector (hs=1 selects the 'hs' model, 300 dimensions).
        # NOTE(review): assumes run_postag and word2vec_sg yield the same rows
        # in the same order; the labels of the second call are the ones used.
        result_fe1, label = run_postag(i)
        result_fe2, label = word2vec_sg(i, 1, 0, 300)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): naiveBayes presumably returns per-fold accuracy and F1
        # scores; confirm against classifier.naiveBayes.
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when no dataset produced a score.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.87375 (0.02761)
F1-Score: 0.872295421973576 (0.02804)
FE Time: 0.8972256183624268 seconds
CL time: 0.327686071395874 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7765833333333332 (0.07030)
F1-Score: 0.7688085118726523 (0.07121)
FE Time: 0.23540496826171875 seconds
CL time: 0.07499527931213379 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7778947368421054 (0.12346)
F1-Score: 0.7707971958158053 (0.13263)
FE Time: 0.2182919979095459 seconds
CL time: 0.09039425849914551 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7225 (0.10059)
F1-Score: 0.7089020766023374 (0.09886)
FE Time: 0.23253607749938965 seconds
CL time: 0.10123920440673828 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6814285714285713 (0.12288)
F1-Score: 0.6656244486507644 (0.12500)
FE

### Random Forest

In [7]:
# Random Forest: evaluate POS-tag + Word2Vec features on every cleaned dataset,
# then print an overall summary.
fail = 0

# Per-dataset mean scores, aggregated in the summary below.
f1_list = []
acc_list = []

# enumerate supplies the 1-based progress counter, so it advances exactly once
# per dataset regardless of whether the run succeeds or fails.
for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is derived from the path itself instead of indexing
        # a separately built list of file names.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction: POS-tag features joined column-wise with the mean
        # Word2Vec vector (hs=1 selects the 'hs' model, 300 dimensions).
        # NOTE(review): assumes run_postag and word2vec_sg yield the same rows
        # in the same order; the labels of the second call are the ones used.
        result_fe1, label = run_postag(i)
        result_fe2, label = word2vec_sg(i, 1, 0, 300)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): random_forest presumably returns per-fold accuracy and
        # F1 scores; confirm against classifier.randomForest.
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when no dataset produced a score.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.94825 (0.01748)
F1-Score: 0.9480642527462461 (0.01751)
FE Time: 1.5139796733856201 seconds
CL time: 8.777558326721191 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8391666666666666 (0.07432)
F1-Score: 0.827412231563855 (0.07792)
FE Time: 0.22748637199401855 seconds
CL time: 1.9264943599700928 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8463157894736844 (0.08153)
F1-Score: 0.840541747838019 (0.08624)
FE Time: 0.19717836380004883 seconds
CL time: 1.7359108924865723 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8049999999999999 (0.07228)
F1-Score: 0.7959008183098067 (0.07465)
FE Time: 0.21666240692138672 seconds
CL time: 1.733647108078003 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7940476190476191 (0.09177)
F1-Score: 0.7812080560422362 (0.0

### SVM

In [5]:
# SVM: evaluate POS-tag + Word2Vec features on every cleaned dataset, then
# print an overall summary.
fail = 0

# Per-dataset mean scores, aggregated in the summary below.
f1_list = []
acc_list = []

# enumerate supplies the 1-based progress counter, so it advances exactly once
# per dataset regardless of whether the run succeeds or fails.
for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # The display name is derived from the path itself instead of indexing
        # a separately built list of file names.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction: POS-tag features joined column-wise with the mean
        # Word2Vec vector (hs=1 selects the 'hs' model, 300 dimensions).
        # NOTE(review): assumes run_postag and word2vec_sg yield the same rows
        # in the same order; the labels of the second call are the ones used.
        result_fe1, label = run_postag(i)
        result_fe2, label = word2vec_sg(i, 1, 0, 300)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # NOTE(review): svm presumably returns per-fold accuracy and F1
        # scores; confirm against classifier.svm.
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the next.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Avoid a ZeroDivisionError when no dataset produced a score.
    print("Overall Accuracy / F1-Score unavailable: no dataset completed successfully")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9629999999999999 (0.01600)
F1-Score: 0.962843824478027 (0.01604)
FE Time: 1.4144701957702637 seconds
CL time: 6.980316638946533 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8608333333333335 (0.06293)
F1-Score: 0.8494571185448114 (0.06621)
FE Time: 0.24542903900146484 seconds
CL time: 1.8815159797668457 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8638157894736842 (0.07176)
F1-Score: 0.8593127609442053 (0.07415)
FE Time: 0.22450780868530273 seconds
CL time: 0.6222209930419922 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8324999999999999 (0.08258)
F1-Score: 0.8233659386976624 (0.08945)
FE Time: 0.18703460693359375 seconds
CL time: 0.13859081268310547 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7902380952380953 (0.11856)
F1-Score: 0.7747308