In [1]:
import os
import h5py
import time
import warnings
import datetime
import numpy as np
from gensim.models import Word2Vec

from featureExtraction.tf.tf import *

from classifier.knn.knn import *
from classifier.svm.svm import *
from featureExtraction.tf.tf import *
from classifier.naiveBayes.naiveBayes import *
from classifier.randomForest.randomForest import *
from classifier.decisionTree.decisionTree import *
from classifier.logisticRegression.logisticRegression import *

In [2]:
# Directory holding the cleaned, ';'-delimited CSV datasets.
CLEAN_DIR = "../Dataset/Clean"

# Scan the directory exactly once so the bare file names (`clean`) and the
# full paths (`DATA_CLEAN`) are guaranteed to stay index-aligned; the
# experiment cells rely on both lists describing the same files in the same
# order.
clean = sorted(x for x in os.listdir(CLEAN_DIR) if x.lower().endswith(".csv"))

# Every path shares the same prefix, so deriving it from the sorted names
# yields the same ordering as sorting the full paths.
DATA_CLEAN = [CLEAN_DIR + "/" + x for x in clean]
len_clean = len(DATA_CLEAN)

In [3]:
def word2vec_sg(filepath, model_path='./featureExtraction/word2vec/models_all/model_sg_hs_300.model'):
    """Build one averaged Word2Vec vector per row of a ';'-delimited CSV.

    The CSV is read with its first line as header, rows containing any
    missing value are dropped, and each remaining ``teks`` value is split on
    single spaces. The vectors of the tokens known to the model are averaged
    into one row vector.

    Parameters
    ----------
    filepath : str
        Path of the CSV file; it must provide ``teks`` and ``label`` columns.
    model_path : str, optional
        Path of the saved gensim ``Word2Vec`` model. Defaults to the path
        that was previously hard-coded in this function.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is a 2-D ``numpy`` array
        with one row per kept CSV row and ``model.wv.vector_size`` columns,
        and ``labels`` is the ``label`` column as a list.
    """
    # NOTE(review): `pd` is not imported explicitly in this notebook; it is
    # presumably provided by one of the star-imports -- confirm.
    df = pd.read_csv(filepath, delimiter=";", low_memory=False, header=0)
    df.dropna(axis=0, inplace=True)

    model_sg = Word2Vec.load(model_path)
    keyed_vectors = model_sg.wv
    # Take the dimensionality from the model instead of assuming 300.
    vector_size = keyed_vectors.vector_size

    word2vec_arr = []
    for row in df['teks'].tolist():
        # Skip tokens the model does not know (including the empty strings
        # produced by consecutive spaces); indexing them would raise KeyError
        # and abort the whole dataset.
        token_vectors = [keyed_vectors[term] for term in row.split(" ") if term in keyed_vectors]
        if token_vectors:
            row_mean_vector = np.mean(token_vectors, axis=0).tolist()
        else:
            # No known token in this row: fall back to an all-zero vector.
            row_mean_vector = [float(0) for _ in range(vector_size)]
        word2vec_arr.append(row_mean_vector)

    # The reshape is a no-op for non-empty input and keeps an empty result
    # 2-D, so it can still be concatenated column-wise with other features.
    features = np.array(word2vec_arr, dtype=float).reshape(len(word2vec_arr), vector_size)
    return features, df['label'].tolist()

In [4]:
warnings.filterwarnings("ignore")

### Decision Tree

In [5]:
# Evaluate the decision-tree classifier on every cleaned dataset, using the
# term-frequency features concatenated column-wise with the averaged
# Word2Vec features.
fail = 0

f1_list = []
acc_list = []

for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # Take the display name from the path itself rather than from a
        # parallel list indexed by the running counter.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe1, label = term_freq(i)
        result_fe2, label = word2vec_sg(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification; the returned scores are summarised with mean/std
        # below (presumably one score per fold -- confirm in decision_tree).
        acc_score, f1_score = decision_tree(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found); averaging would divide by zero.
    print("Overall Accuracy = n/a (no dataset completed successfully)")
    print("Overall F1-Score = n/a (no dataset completed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.951 (0.01411)
F1-Score: 0.9507583702708583 (0.01421)
FE Time: 0.6008319854736328 seconds
CL time: 16.159909963607788 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7601666666666668 (0.05695)
F1-Score: 0.7469472084174777 (0.06716)
FE Time: 0.21167993545532227 seconds
CL time: 2.2036008834838867 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7482894736842105 (0.12131)
F1-Score: 0.73838858533942 (0.12222)
FE Time: 0.1730668544769287 seconds
CL time: 0.6029455661773682 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7150000000000001 (0.06344)
F1-Score: 0.7026486061717755 (0.06304)
FE Time: 0.16337800025939941 seconds
CL time: 0.4820544719696045 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6138095238095238 (0.12578)
F1-Score: 0.5929983503338766 (0.12

### KNN

In [6]:
# Evaluate the KNN classifier on every cleaned dataset, using the
# term-frequency features concatenated column-wise with the averaged
# Word2Vec features.
fail = 0

f1_list = []
acc_list = []

for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # Take the display name from the path itself rather than from a
        # parallel list indexed by the running counter.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe1, label = term_freq(i)
        result_fe2, label = word2vec_sg(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification; the returned scores are summarised with mean/std
        # below (presumably one score per fold -- confirm in knn).
        acc_score, f1_score = knn(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found); averaging would divide by zero.
    print("Overall Accuracy = n/a (no dataset completed successfully)")
    print("Overall F1-Score = n/a (no dataset completed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9282499999999999 (0.02135)
F1-Score: 0.9278181811988391 (0.02138)
FE Time: 0.6330082416534424 seconds
CL time: 2.2275707721710205 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7340833333333332 (0.07829)
F1-Score: 0.725188307662807 (0.08010)
FE Time: 0.18150043487548828 seconds
CL time: 0.32995080947875977 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7506578947368422 (0.09949)
F1-Score: 0.7385310911581102 (0.10239)
FE Time: 0.17551159858703613 seconds
CL time: 0.4224531650543213 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6075 (0.11432)
F1-Score: 0.5283583476078938 (0.12000)
FE Time: 0.1535799503326416 seconds
CL time: 0.38835716247558594 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7192857142857142 (0.09873)
F1-Score: 0.6803192052559396 (

### Logistic Regression

In [7]:
# Evaluate the logistic-regression classifier on every cleaned dataset, using
# the term-frequency features concatenated column-wise with the averaged
# Word2Vec features.
fail = 0

f1_list = []
acc_list = []

for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # Take the display name from the path itself rather than from a
        # parallel list indexed by the running counter.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe1, label = term_freq(i)
        result_fe2, label = word2vec_sg(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification; the returned scores are summarised with mean/std
        # below (presumably one score per fold -- confirm in
        # logistic_regression).
        acc_score, f1_score = logistic_regression(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found); averaging would divide by zero.
    print("Overall Accuracy = n/a (no dataset completed successfully)")
    print("Overall F1-Score = n/a (no dataset completed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9792500000000001 (0.00952)
F1-Score: 0.9791380276802085 (0.00960)
FE Time: 0.5945849418640137 seconds
CL time: 14.202066898345947 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.9090833333333332 (0.05549)
F1-Score: 0.9019297639210432 (0.05760)
FE Time: 0.19109010696411133 seconds
CL time: 0.6636083126068115 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.881842105263158 (0.06995)
F1-Score: 0.8766609479944385 (0.07421)
FE Time: 0.18593740463256836 seconds
CL time: 0.7149956226348877 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8674999999999999 (0.07462)
F1-Score: 0.8594817507608429 (0.08202)
FE Time: 0.15361690521240234 seconds
CL time: 0.38436317443847656 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8511904761904763 (0.09267)
F1-Score: 0.844008

### Naive Bayes

In [8]:
# Evaluate the naive Bayes classifier on every cleaned dataset, using the
# term-frequency features concatenated column-wise with the averaged
# Word2Vec features.
fail = 0

f1_list = []
acc_list = []

for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # Take the display name from the path itself rather than from a
        # parallel list indexed by the running counter.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe1, label = term_freq(i)
        result_fe2, label = word2vec_sg(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification; the returned scores are summarised with mean/std
        # below (presumably one score per fold -- confirm in naiveBayes).
        acc_score, f1_score = naiveBayes(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found); averaging would divide by zero.
    print("Overall Accuracy = n/a (no dataset completed successfully)")
    print("Overall F1-Score = n/a (no dataset completed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.83725 (0.02416)
F1-Score: 0.835925753165007 (0.02408)
FE Time: 0.6054279804229736 seconds
CL time: 4.0621490478515625 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8048333333333334 (0.10113)
F1-Score: 0.795642415113624 (0.10542)
FE Time: 0.1596546173095703 seconds
CL time: 0.35913705825805664 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8467105263157896 (0.09299)
F1-Score: 0.8413677211597523 (0.09681)
FE Time: 0.15720820426940918 seconds
CL time: 0.37266039848327637 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8200000000000001 (0.06964)
F1-Score: 0.8050340963990902 (0.07607)
FE Time: 0.17529940605163574 seconds
CL time: 0.30585741996765137 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.7607142857142858 (0.12158)
F1-Score: 0.7442117341554514 

### Random Forest

In [9]:
# Evaluate the random-forest classifier on every cleaned dataset, using the
# term-frequency features concatenated column-wise with the averaged
# Word2Vec features.
fail = 0

f1_list = []
acc_list = []

for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # Take the display name from the path itself rather than from a
        # parallel list indexed by the running counter.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe1, label = term_freq(i)
        result_fe2, label = word2vec_sg(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification; the returned scores are summarised with mean/std
        # below (presumably one score per fold -- confirm in random_forest).
        acc_score, f1_score = random_forest(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found); averaging would divide by zero.
    print("Overall Accuracy = n/a (no dataset completed successfully)")
    print("Overall F1-Score = n/a (no dataset completed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.95325 (0.01408)
F1-Score: 0.953062739977887 (0.01417)
FE Time: 0.6278419494628906 seconds
CL time: 9.417161464691162 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8531666666666666 (0.06931)
F1-Score: 0.843889944667275 (0.07115)
FE Time: 0.16655373573303223 seconds
CL time: 1.1130294799804688 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.8436842105263158 (0.08263)
F1-Score: 0.8394540528252502 (0.08637)
FE Time: 0.1607532501220703 seconds
CL time: 1.052717924118042 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8 (0.07416)
F1-Score: 0.7901920305882579 (0.07614)
FE Time: 0.22467756271362305 seconds
CL time: 1.1545062065124512 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.783809523809524 (0.11371)
F1-Score: 0.7711887363040782 (0.11336)
FE Time: 0.

### SVM

In [10]:
# Evaluate the SVM classifier on every cleaned dataset, using the
# term-frequency features concatenated column-wise with the averaged
# Word2Vec features.
fail = 0

f1_list = []
acc_list = []

for count, i in enumerate(DATA_CLEAN, start=1):
    try:
        # Take the display name from the path itself rather than from a
        # parallel list indexed by the running counter.
        print("Running {}/{} ({})".format(count, len_clean, os.path.basename(i)))
        start_time = time.time()

        # Feature Extraction
        result_fe1, label = term_freq(i)
        result_fe2, label = word2vec_sg(i)
        result_fe = np.concatenate((result_fe1, result_fe2), axis=1)
        fe_time = time.time()

        # Classification; the returned scores are summarised with mean/std
        # below (presumably one score per fold -- confirm in svm).
        acc_score, f1_score = svm(result_fe, label)
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found); averaging would divide by zero.
    print("Overall Accuracy = n/a (no dataset completed successfully)")
    print("Overall F1-Score = n/a (no dataset completed successfully)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9744999999999997 (0.01150)
F1-Score: 0.9743691143649308 (0.01156)
FE Time: 0.623779296875 seconds
CL time: 95.03697681427002 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.8869999999999999 (0.05523)
F1-Score: 0.8773509921732459 (0.05976)
FE Time: 0.21221494674682617 seconds
CL time: 1.864196538925171 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.876578947368421 (0.07250)
F1-Score: 0.8706449122673042 (0.07989)
FE Time: 0.18017148971557617 seconds
CL time: 1.980961799621582 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.8675 (0.04815)
F1-Score: 0.859258305065833 (0.05269)
FE Time: 0.14254426956176758 seconds
CL time: 0.8485441207885742 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.8311904761904764 (0.10334)
F1-Score: 0.8199000390734137 (0.10801)
