In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np
import pandas as pd
from nltk.util import usage
from gensim.models import Word2Vec

from classifier.knn.knn import *
from classifier.svm.svm import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Discover the cleaned datasets: every *.csv (case-insensitive extension)
# directly inside ../Dataset/Clean.
files_clean = os.listdir("../Dataset/Clean")

# Bare file names, sorted; used for display.
clean = sorted(name for name in files_clean if name.lower().endswith(".csv"))

# Matching relative paths. Prepending the same prefix to every name keeps the
# sort order, so DATA_CLEAN[k] is always the path of clean[k].
DATA_CLEAN = ["../Dataset/Clean/" + name for name in clean]
len_clean = len(DATA_CLEAN)

### Extract Features

In [3]:
def word2vec_sg(filepath, model_path='./featureExtraction/word2vec/models_all/model_sg_hs_300.model'):
    """Represent every text row of a dataset by the mean of its Word2Vec vectors.

    The CSV at ``filepath`` is read with ``;`` as delimiter and its first line as
    header; rows containing any missing value are dropped. Each remaining
    ``teks`` value is split on single spaces and encoded as the element-wise
    mean of the vectors of its tokens.

    Tokens the model has no vector for (including the empty strings produced by
    consecutive spaces) are skipped instead of raising ``KeyError``. A row
    without a single known token is encoded as an all-zero vector.

    Parameters
    ----------
    filepath : str
        Path to a ``;``-delimited CSV file with ``teks`` and ``label`` columns.
    model_path : str, optional
        Location of the saved gensim ``Word2Vec`` model. Defaults to the model
        path this function previously had hard-coded.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is a 2-D float ``numpy`` array
        with one row per kept CSV row and one column per embedding dimension,
        and ``labels`` is the list of the corresponding ``label`` values.
    """
    df = pd.read_csv(filepath, delimiter=";", low_memory=False, header=0)
    df = df.dropna(axis=0)

    model_sg = Word2Vec.load(model_path)
    word_vectors = model_sg.wv
    # Ask the model for its width instead of assuming 300 dimensions.
    dim = word_vectors.vector_size

    texts = df['teks'].tolist()
    # Rows start as zeros, which is also the encoding for "no known token".
    features = np.zeros((len(texts), dim), dtype=float)
    for row_idx, text in enumerate(texts):
        # str() guards against a column that pandas parsed as numeric.
        known = [word_vectors[token] for token in str(text).split(" ") if token in word_vectors]
        if known:
            features[row_idx] = np.mean(known, axis=0)

    return features, df['label'].tolist()

### Runner with Classification

In [4]:
# Silence every warning for the rest of the session so the per-dataset output
# below stays compact. NOTE(review): this also hides any warning raised by the
# classifiers or by numpy on empty inputs - consider narrowing the filter.
warnings.filterwarnings("ignore")

### Decision Tree

In [5]:
# Benchmark: Word2Vec features + Decision Tree on every cleaned dataset.
# A dataset that raises is reported and counted in `fail`; the loop carries on.
fail = 0

f1_list = []
acc_list = []

for count, dataset_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The displayed name is taken from the path itself, so it cannot drift
        # out of step with the dataset actually being processed.
        print("Running {}/{} ({})".format(count, len(DATA_CLEAN), os.path.basename(dataset_path)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(dataset_path)
        fe_time = time.time()

        # Classification. The classifier returns a collection of accuracy and
        # F1 scores (presumably one per cross-validation fold - confirm in the
        # classifier module); only their mean and spread are reported.
        acc_scores, f1_scores = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_scores), np.std(acc_scores)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_scores), np.std(f1_scores)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_scores))
        acc_list.append(np.mean(acc_scores))
    except Exception as e:
        # Best effort: one broken dataset must not abort the whole benchmark.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Without this guard the averages divide by zero when every dataset
    # failed, and the failure count below would never be printed.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len(DATA_CLEAN)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.8905000000000001 (0.02519)
F1-Score: 0.8901089562173492 (0.02528)
FE Time: 0.8395185470581055 seconds
CL time: 8.0062255859375 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.704 (0.08954)
F1-Score: 0.6869078043260968 (0.09860)
FE Time: 0.34045958518981934 seconds
CL time: 2.4895827770233154 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7502631578947367 (0.12598)
F1-Score: 0.7410591245780538 (0.12868)
FE Time: 0.19058775901794434 seconds
CL time: 0.31699538230895996 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7325 (0.08983)
F1-Score: 0.7215849141653926 (0.08822)
FE Time: 0.28033995628356934 seconds
CL time: 0.5363035202026367 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6038095238095238 (0.10520)
F1-Score: 0.5858286662747222 (0.10631)
FE Tim

### KNN

In [6]:
# Benchmark: Word2Vec features + k-nearest-neighbours on every cleaned dataset.
# A dataset that raises is reported and counted in `fail`; the loop carries on.
fail = 0

f1_list = []
acc_list = []

for count, dataset_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The displayed name is taken from the path itself, so it cannot drift
        # out of step with the dataset actually being processed.
        print("Running {}/{} ({})".format(count, len(DATA_CLEAN), os.path.basename(dataset_path)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(dataset_path)
        fe_time = time.time()

        # Classification. The classifier returns a collection of accuracy and
        # F1 scores (presumably one per cross-validation fold - confirm in the
        # classifier module); only their mean and spread are reported.
        acc_scores, f1_scores = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_scores), np.std(acc_scores)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_scores), np.std(f1_scores)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_scores))
        acc_list.append(np.mean(acc_scores))
    except Exception as e:
        # Best effort: one broken dataset must not abort the whole benchmark.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Without this guard the averages divide by zero when every dataset
    # failed, and the failure count below would never be printed.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len(DATA_CLEAN)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9465 (0.01797)
F1-Score: 0.9462886172499383 (0.01803)
FE Time: 0.5043258666992188 seconds
CL time: 0.9464209079742432 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7926666666666666 (0.05002)
F1-Score: 0.7783565958884018 (0.05094)
FE Time: 0.358112096786499 seconds
CL time: 0.5703897476196289 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7906578947368421 (0.07796)
F1-Score: 0.7802925236624139 (0.08657)
FE Time: 0.37718725204467773 seconds
CL time: 0.4048643112182617 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.785 (0.08958)
F1-Score: 0.7691123171549425 (0.10030)
FE Time: 0.14903879165649414 seconds
CL time: 0.4714641571044922 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7223809523809523 (0.12692)
F1-Score: 0.7116277943276865 (0.12620)
FE Tim

### Logistic Regression

In [7]:
# Benchmark: Word2Vec features + Logistic Regression on every cleaned dataset.
# A dataset that raises is reported and counted in `fail`; the loop carries on.
fail = 0

f1_list = []
acc_list = []

for count, dataset_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The displayed name is taken from the path itself, so it cannot drift
        # out of step with the dataset actually being processed.
        print("Running {}/{} ({})".format(count, len(DATA_CLEAN), os.path.basename(dataset_path)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(dataset_path)
        fe_time = time.time()

        # Classification. The classifier returns a collection of accuracy and
        # F1 scores (presumably one per cross-validation fold - confirm in the
        # classifier module); only their mean and spread are reported.
        acc_scores, f1_scores = logistic_regression(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_scores), np.std(acc_scores)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_scores), np.std(f1_scores)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_scores))
        acc_list.append(np.mean(acc_scores))
    except Exception as e:
        # Best effort: one broken dataset must not abort the whole benchmark.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Without this guard the averages divide by zero when every dataset
    # failed, and the failure count below would never be printed.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len(DATA_CLEAN)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.95325 (0.01645)
F1-Score: 0.9530704223359416 (0.01649)
FE Time: 0.47136592864990234 seconds
CL time: 1.0642828941345215 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8325833333333336 (0.07756)
F1-Score: 0.8216412491449396 (0.07888)
FE Time: 0.13386321067810059 seconds
CL time: 0.19283533096313477 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8636842105263158 (0.07736)
F1-Score: 0.8586123626582806 (0.08127)
FE Time: 0.12994837760925293 seconds
CL time: 0.14671778678894043 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8400000000000001 (0.07842)
F1-Score: 0.8328536537116932 (0.08244)
FE Time: 0.11929869651794434 seconds
CL time: 0.17492055892944336 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7704761904761905 (0.09563)
F1-Score: 0.7615486135699

### Naive Bayes

In [8]:
# Benchmark: Word2Vec features + Naive Bayes on every cleaned dataset.
# A dataset that raises is reported and counted in `fail`; the loop carries on.
fail = 0

f1_list = []
acc_list = []

for count, dataset_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The displayed name is taken from the path itself, so it cannot drift
        # out of step with the dataset actually being processed.
        print("Running {}/{} ({})".format(count, len(DATA_CLEAN), os.path.basename(dataset_path)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(dataset_path)
        fe_time = time.time()

        # Classification. The classifier returns a collection of accuracy and
        # F1 scores (presumably one per cross-validation fold - confirm in the
        # classifier module); only their mean and spread are reported.
        acc_scores, f1_scores = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_scores), np.std(acc_scores)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_scores), np.std(f1_scores)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_scores))
        acc_list.append(np.mean(acc_scores))
    except Exception as e:
        # Best effort: one broken dataset must not abort the whole benchmark.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Without this guard the averages divide by zero when every dataset
    # failed, and the failure count below would never be printed.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len(DATA_CLEAN)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.87375 (0.02792)
F1-Score: 0.8722996589243582 (0.02838)
FE Time: 0.9052329063415527 seconds
CL time: 0.6103222370147705 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7825 (0.06831)
F1-Score: 0.7745884235596104 (0.06950)
FE Time: 0.30610132217407227 seconds
CL time: 0.29018473625183105 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7753947368421052 (0.12042)
F1-Score: 0.7671185528960802 (0.12998)
FE Time: 0.3554072380065918 seconds
CL time: 0.1286299228668213 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7225 (0.10059)
F1-Score: 0.7089020766023374 (0.09886)
FE Time: 0.331571102142334 seconds
CL time: 0.12316131591796875 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7128571428571429 (0.12429)
F1-Score: 0.7004893013277386 (0.11961)
FE Time: 0.3082

### Random Forest

In [9]:
# Benchmark: Word2Vec features + Random Forest on every cleaned dataset.
# A dataset that raises is reported and counted in `fail`; the loop carries on.
fail = 0

f1_list = []
acc_list = []

for count, dataset_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The displayed name is taken from the path itself, so it cannot drift
        # out of step with the dataset actually being processed.
        print("Running {}/{} ({})".format(count, len(DATA_CLEAN), os.path.basename(dataset_path)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(dataset_path)
        fe_time = time.time()

        # Classification. The classifier returns a collection of accuracy and
        # F1 scores (presumably one per cross-validation fold - confirm in the
        # classifier module); only their mean and spread are reported.
        acc_scores, f1_scores = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_scores), np.std(acc_scores)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_scores), np.std(f1_scores)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_scores))
        acc_list.append(np.mean(acc_scores))
    except Exception as e:
        # Best effort: one broken dataset must not abort the whole benchmark.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Without this guard the averages divide by zero when every dataset
    # failed, and the failure count below would never be printed.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len(DATA_CLEAN)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.945 (0.01994)
F1-Score: 0.9448114979309448 (0.01996)
FE Time: 0.4872915744781494 seconds
CL time: 9.046802043914795 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8368333333333332 (0.05596)
F1-Score: 0.8273481262986551 (0.05551)
FE Time: 0.1692500114440918 seconds
CL time: 1.9334402084350586 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8338157894736844 (0.07972)
F1-Score: 0.8286826104677093 (0.08337)
FE Time: 0.16712093353271484 seconds
CL time: 1.6082782745361328 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8074999999999999 (0.07790)
F1-Score: 0.7977105507758622 (0.08156)
FE Time: 0.1547386646270752 seconds
CL time: 1.7851049900054932 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8004761904761905 (0.10579)
F1-Score: 0.7837607538174574 (0.10

### SVM

In [10]:
# Benchmark: Word2Vec features + SVM on every cleaned dataset.
# A dataset that raises is reported and counted in `fail`; the loop carries on.
fail = 0

f1_list = []
acc_list = []

for count, dataset_path in enumerate(DATA_CLEAN, start=1):
    try:
        # The displayed name is taken from the path itself, so it cannot drift
        # out of step with the dataset actually being processed.
        print("Running {}/{} ({})".format(count, len(DATA_CLEAN), os.path.basename(dataset_path)))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = word2vec_sg(dataset_path)
        fe_time = time.time()

        # Classification. The classifier returns a collection of accuracy and
        # F1 scores (presumably one per cross-validation fold - confirm in the
        # classifier module); only their mean and spread are reported.
        acc_scores, f1_scores = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_scores), np.std(acc_scores)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_scores), np.std(f1_scores)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_scores))
        acc_list.append(np.mean(acc_scores))
    except Exception as e:
        # Best effort: one broken dataset must not abort the whole benchmark.
        fail += 1
        print(dataset_path, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Without this guard the averages divide by zero when every dataset
    # failed, and the failure count below would never be printed.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len(DATA_CLEAN)))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9625 (0.01427)
F1-Score: 0.9623386932244132 (0.01432)
FE Time: 0.4560236930847168 seconds
CL time: 6.192979335784912 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8610000000000001 (0.07214)
F1-Score: 0.8539561278814582 (0.07296)
FE Time: 0.15474867820739746 seconds
CL time: 0.37716126441955566 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8739473684210525 (0.06490)
F1-Score: 0.8685184697091245 (0.06923)
FE Time: 0.16634750366210938 seconds
CL time: 0.18605327606201172 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8400000000000001 (0.07348)
F1-Score: 0.8311761492553152 (0.08013)
FE Time: 0.14746689796447754 seconds
CL time: 0.169142484664917 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8047619047619048 (0.11326)
F1-Score: 0.7954239910733887 (