In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np
from gensim.models import Word2Vec

from featureExtraction.lexicon.lexicon import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Discover the cleaned datasets with a single directory scan.
# `clean` holds the bare file names (used for progress messages) and
# `DATA_CLEAN` the matching relative paths. Later cells index both lists in
# parallel, so the paths are derived from the names instead of listing the
# directory a second time: the two lists can then never get out of step.
CLEAN_DIR = "../Dataset/Clean"

clean = sorted(x for x in os.listdir(CLEAN_DIR) if x.lower().endswith(".csv"))

DATA_CLEAN = [CLEAN_DIR + "/" + x for x in clean]
len_clean = len(DATA_CLEAN)

In [3]:
def word2vec_sg(filepath, hs, neg, vector_size):
    """Turn every text row of a dataset into the mean of its skip-gram word vectors.

    The CSV at ``filepath`` is read with ``;`` as the delimiter and a header
    row; rows containing any missing value are dropped. Each remaining value
    of the ``teks`` column is split on single spaces and represented by the
    element-wise mean of the vectors of its tokens.

    Tokens that are not in the model vocabulary (including the empty strings
    produced by leading, trailing or repeated spaces) are skipped instead of
    raising ``KeyError``. A row with no known token at all is represented by
    an all-zero vector of length ``vector_size``.

    Parameters
    ----------
    filepath : str
        Path of the ``;``-delimited CSV file; must provide ``teks`` and
        ``label`` columns.
    hs : int
        ``1`` selects the ``model_sg_hs_<vector_size>.model`` file, any other
        value selects ``model_sg_neg_<vector_size>.model``.
    neg : int
        Not used by this function; kept so existing callers keep working.
    vector_size : int
        Used to build the model file name and the zero-vector fallback.
        Presumably matches the dimensionality of the stored model -- confirm.

    Returns
    -------
    tuple
        ``(features, labels)``: a NumPy array with one mean vector per kept
        row, and the ``label`` column of the same rows as a list.
    """
    # NOTE(review): `pd` is not imported explicitly in this notebook; it
    # presumably leaks in through one of the wildcard imports -- confirm.
    df = pd.read_csv(filepath, delimiter=";", low_memory=False, header=0)
    # Re-bind instead of mutating in place so the raw frame is never half-modified.
    df = df.dropna(axis=0)

    model_path = './featureExtraction/word2vec/models_all/model_sg_{}_{}.model'.format(
        'hs' if hs == 1 else 'neg', vector_size)
    word_vectors = Word2Vec.load(model_path).wv

    word2vec_arr = []
    for row in df['teks'].tolist():
        # Keep only tokens the model knows; indexing an unknown token raises KeyError.
        known_vectors = [word_vectors[token] for token in row.split(" ") if token in word_vectors]
        if known_vectors:
            row_mean_vector = np.mean(known_vectors, axis=0).tolist()
        else:
            # Nothing to average: fall back to a zero vector of the requested size.
            row_mean_vector = [0.0] * vector_size
        word2vec_arr.append(row_mean_vector)

    return np.array(word2vec_arr), df['label'].tolist()

In [4]:
warnings.filterwarnings("ignore")

In [5]:
# Logistic Regression benchmark: 20-fold cross-validation on every cleaned
# dataset, using the mean skip-gram Word2Vec vectors concatenated with the
# combined lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are parallel sorted lists;
# enumerate keeps the progress counter correct whether a dataset succeeds or fails.
for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        # NOTE(review): `label` from the lexicon extractor overwrites the one from
        # word2vec_sg; this assumes both keep the same rows in the same order -- confirm.
        result_fe1, label = word2vec_sg(i, 1, 0, 300)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # prepare classifier and the cross-validation procedure
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Micro-averaged F1 is used as the accuracy figure (the two coincide for
        # single-label classification); macro-averaged F1 is the reported F1-Score.
        acc_score = cross_val_score(logRes, result_fe, label, scoring='f1_micro', cv=cv, n_jobs=-1)
        f1_score = cross_val_score(logRes, result_fe, label, scoring='f1_macro', cv=cv, n_jobs=-1)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed: averaging would divide by zero and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9540000000000001 (0.01420)
F1-Score: 0.9538081029701729 (0.01425)
FE Time: 3.7925074100494385 seconds
CL time: 5.2090229988098145 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8709999999999999 (0.07715)
F1-Score: 0.8613478610716788 (0.08249)
FE Time: 0.8371052742004395 seconds
CL time: 1.6382126808166504 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8667105263157895 (0.07777)
F1-Score: 0.8630364691469821 (0.08130)
FE Time: 0.8439080715179443 seconds
CL time: 0.20022964477539062 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.865 (0.05937)
F1-Score: 0.8572206795108096 (0.06353)
FE Time: 0.44586825370788574 seconds
CL time: 0.1783764362335205 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.786904761904762 (0.11729)
F1-Score: 0.7679301203873824 (0.1

### Decision Tree

In [5]:
# Decision Tree benchmark over every cleaned dataset, using the mean skip-gram
# Word2Vec vectors concatenated with the combined lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are parallel sorted lists;
# enumerate keeps the progress counter correct whether a dataset succeeds or fails.
for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        # NOTE(review): `label` from the lexicon extractor overwrites the one from
        # word2vec_sg; this assumes both keep the same rows in the same order -- confirm.
        result_fe1, label = word2vec_sg(i, 1, 0, 300)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # Presumably returns per-fold accuracy and F1 scores (they are averaged below) -- confirm.
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed: averaging would divide by zero and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.899 (0.02488)
F1-Score: 0.8985182100370821 (0.02508)
FE Time: 9.329880952835083 seconds
CL time: 15.27439022064209 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.76625 (0.07948)
F1-Score: 0.7540213722852298 (0.08133)
FE Time: 2.914085626602173 seconds
CL time: 6.583595514297485 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6978947368421052 (0.12101)
F1-Score: 0.6886975709270265 (0.12124)
FE Time: 2.133509874343872 seconds
CL time: 2.5511162281036377 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7500000000000002 (0.07583)
F1-Score: 0.7423146452777964 (0.07643)
FE Time: 1.2579758167266846 seconds
CL time: 0.9825408458709717 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7097619047619047 (0.14602)
F1-Score: 0.6951428351060704 (0.14822)
FE Time: 0.

### KNN

In [6]:
# KNN benchmark over every cleaned dataset, using the mean skip-gram
# Word2Vec vectors concatenated with the combined lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are parallel sorted lists;
# enumerate keeps the progress counter correct whether a dataset succeeds or fails.
for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        # NOTE(review): `label` from the lexicon extractor overwrites the one from
        # word2vec_sg; this assumes both keep the same rows in the same order -- confirm.
        result_fe1, label = word2vec_sg(i, 1, 0, 300)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # Presumably returns per-fold accuracy and F1 scores (they are averaged below) -- confirm.
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed: averaging would divide by zero and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.8095000000000001 (0.02179)
F1-Score: 0.808694345228473 (0.02203)
FE Time: 10.628687143325806 seconds
CL time: 3.826681137084961 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8083333333333333 (0.07527)
F1-Score: 0.7930198094522894 (0.08677)
FE Time: 2.3286123275756836 seconds
CL time: 1.127993106842041 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7334210526315789 (0.10566)
F1-Score: 0.723625249527373 (0.11182)
FE Time: 2.085383415222168 seconds
CL time: 1.1314027309417725 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7699999999999999 (0.09798)
F1-Score: 0.7593174443431713 (0.10550)
FE Time: 0.962153434753418 seconds
CL time: 1.0440325736999512 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7095238095238096 (0.13925)
F1-Score: 0.690895170762666

### Naive Bayes

In [7]:
# Naive Bayes benchmark over every cleaned dataset, using the mean skip-gram
# Word2Vec vectors concatenated with the combined lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are parallel sorted lists;
# enumerate keeps the progress counter correct whether a dataset succeeds or fails.
for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        # NOTE(review): `label` from the lexicon extractor overwrites the one from
        # word2vec_sg; this assumes both keep the same rows in the same order -- confirm.
        result_fe1, label = word2vec_sg(i, 1, 0, 300)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # Presumably returns per-fold accuracy and F1 scores (they are averaged below) -- confirm.
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed: averaging would divide by zero and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.8742500000000002 (0.02726)
F1-Score: 0.8728322683114097 (0.02762)
FE Time: 8.989170551300049 seconds
CL time: 0.870476245880127 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7984999999999999 (0.05770)
F1-Score: 0.7910829297844334 (0.05937)
FE Time: 2.4282147884368896 seconds
CL time: 0.2827003002166748 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7778947368421052 (0.12142)
F1-Score: 0.7695469287259906 (0.13099)
FE Time: 1.227238416671753 seconds
CL time: 0.31624293327331543 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.76 (0.08155)
F1-Score: 0.746204798019672 (0.08186)
FE Time: 0.8102066516876221 seconds
CL time: 0.6689190864562988 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7464285714285713 (0.12812)
F1-Score: 0.7349224142244737 (0.12427)

### Random Forest

In [8]:
# Random Forest benchmark over every cleaned dataset, using the mean skip-gram
# Word2Vec vectors concatenated with the combined lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are parallel sorted lists;
# enumerate keeps the progress counter correct whether a dataset succeeds or fails.
for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        # NOTE(review): `label` from the lexicon extractor overwrites the one from
        # word2vec_sg; this assumes both keep the same rows in the same order -- confirm.
        result_fe1, label = word2vec_sg(i, 1, 0, 300)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # Presumably returns per-fold accuracy and F1 scores (they are averaged below) -- confirm.
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed: averaging would divide by zero and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9465000000000001 (0.01636)
F1-Score: 0.9463259933708918 (0.01637)
FE Time: 9.052068948745728 seconds
CL time: 11.600598096847534 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8289166666666666 (0.06129)
F1-Score: 0.8167859570872347 (0.06265)
FE Time: 2.174938440322876 seconds
CL time: 3.848158121109009 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8439473684210526 (0.08514)
F1-Score: 0.8399003130867276 (0.08705)
FE Time: 1.8903110027313232 seconds
CL time: 3.571244478225708 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8200000000000001 (0.06782)
F1-Score: 0.8107146202171371 (0.06911)
FE Time: 0.935147762298584 seconds
CL time: 3.4320764541625977 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7971428571428573 (0.11674)
F1-Score: 0.78120009745455

### SVM

In [9]:
# SVM benchmark over every cleaned dataset, using the mean skip-gram
# Word2Vec vectors concatenated with the combined lexicon features.
fail = 0

f1_list = []
acc_list = []

# `clean` (file names) and `DATA_CLEAN` (paths) are parallel sorted lists;
# enumerate keeps the progress counter correct whether a dataset succeeds or fails.
for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        # NOTE(review): `label` from the lexicon extractor overwrites the one from
        # word2vec_sg; this assumes both keep the same rows in the same order -- confirm.
        result_fe1, label = word2vec_sg(i, 1, 0, 300)
        result_fe2, label, feat_name = run_lexiconCombined_tweet(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # classification
        # Presumably returns per-fold accuracy and F1 scores (they are averaged below) -- confirm.
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best-effort run: report the failing dataset and carry on with the next one.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # No dataset completed: averaging would divide by zero and hide the failure count.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9605000000000002 (0.01413)
F1-Score: 0.9603371462989074 (0.01416)
FE Time: 7.069952964782715 seconds
CL time: 6.828821420669556 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8874166666666667 (0.07249)
F1-Score: 0.8793861301591844 (0.07634)
FE Time: 1.8349125385284424 seconds
CL time: 1.5401268005371094 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8892105263157895 (0.05399)
F1-Score: 0.8859759902643534 (0.05590)
FE Time: 2.0140390396118164 seconds
CL time: 2.6846907138824463 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8574999999999999 (0.05974)
F1-Score: 0.8484094320543877 (0.06546)
FE Time: 2.754469871520996 seconds
CL time: 1.265071153640747 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7871428571428571 (0.11059)
F1-Score: 0.7678972324489