In [2]:
import datetime
import os
import time
import warnings

import h5py
import numpy as np
import pandas as pd
from gensim.models import FastText
from nltk.util import usage
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, cross_validate

In [3]:
# Discover the cleaned datasets: every entry of ../Dataset/Clean whose name
# ends in ".csv" (case-insensitive).
files_clean = os.listdir("../Dataset/Clean")

# Bare file names in alphabetical order (used for progress messages).
clean = sorted(name for name in files_clean if name.lower().endswith(".csv"))

# Relative paths to the same files, also in alphabetical order.
DATA_CLEAN = sorted(
    "../Dataset/Clean/" + name
    for name in files_clean
    if name.lower().endswith(".csv")
)
len_clean = len(DATA_CLEAN)

### GENERATE MODEL

In [3]:
# Build one training corpus from every cleaned dataset, then train and save a
# FastText model for each (architecture, vector size) combination.

# Directory the models are written to; fasttext() loads them from this path.
MODEL_DIR = './featureExtraction/fasttext/models_all'

# Corpus: one list of space-separated tokens per tweet, across all datasets.
data = []
for i in DATA_CLEAN:
    df = pd.read_csv(i, delimiter=";", low_memory=False, header=0)
    # Drop rows with a missing value in any column (same rule as fasttext()).
    df = df.dropna(axis=0)
    data.extend(tweet.split(" ") for tweet in df["teks"].tolist())

# model.save() does not create missing directories, so make sure it exists.
os.makedirs(MODEL_DIR, exist_ok=True)

# NOTE(review): no seed/workers are pinned, so retraining presumably does not
# reproduce the exact same vectors — confirm whether that matters here.
models = {}
for arch, sg in (("cbow", 0), ("sg", 1)):  # sg=0 is gensim's default (CBOW)
    for vector_size in (100, 200, 300):
        print("================================================")
        start_time = time.time()
        model = FastText(data, min_count=1, window=5, sg=sg, vector_size=vector_size)
        model.save('{}/model_{}_{}.model'.format(MODEL_DIR, arch, vector_size))
        end_time = time.time()
        print("{} {} Time: {} seconds".format(arch.upper(), vector_size, end_time - start_time))
        models[(arch, vector_size)] = model

# Keep the original per-model global names available to any other cell.
model_cbow_100 = models[("cbow", 100)]
model_cbow_200 = models[("cbow", 200)]
model_cbow_300 = models[("cbow", 300)]
model_sg_100 = models[("sg", 100)]
model_sg_200 = models[("sg", 200)]
model_sg_300 = models[("sg", 300)]

CBOW 100 Time: 13.78688097000122 seconds
CBOW 200 Time: 22.667280912399292 seconds
CBOW 300 Time: 31.47254180908203 seconds
SG 100 Time: 16.155059576034546 seconds
SG 200 Time: 25.172243118286133 seconds
SG 300 Time: 34.90894317626953 seconds


### Extract Features

In [4]:
def fasttext(filepath, cbow, vector_size):
    """Turn every tweet of a dataset into the mean of its FastText word vectors.

    Args:
        filepath: Path to a ';'-delimited CSV file with a header row that
            contains the columns 'teks' and 'label'.
        cbow: 1 selects the saved CBOW model; any other value selects the
            skip-gram model.
        vector_size: Dimensionality of the saved model to load (part of the
            model file name).

    Returns:
        A tuple (features, labels): a NumPy array with one mean vector per
        row that survives the missing-value filter, and the list of the
        matching 'label' values.
    """
    # Rows with a missing value in any column are discarded.
    frame = pd.read_csv(filepath, delimiter=";", low_memory=False, header=0).dropna(axis=0)

    architecture = 'cbow' if cbow == 1 else 'sg'
    model_path = './featureExtraction/fasttext/models_all/model_{}_{}.model'.format(architecture, vector_size)
    model = FastText.load(model_path)

    fasttext_arr = []
    for text in frame['teks'].tolist():
        # Same tokenisation as at training time: split on single spaces.
        tokens = text.split(" ")
        word_vectors = [model.wv[token] for token in tokens]
        row_mean_vector = np.mean(word_vectors, axis=0).tolist()
        # A non-list result means there was nothing to average; use zeros.
        if not isinstance(row_mean_vector, list):
            row_mean_vector = [float(0)] * vector_size
        fasttext_arr.append(row_mean_vector)

    return np.array(fasttext_arr), frame['label'].tolist()

### Runner with Classification

In [5]:
warnings.filterwarnings("ignore")

In [6]:
# Evaluate CBOW FastText features (100 dimensions) on every cleaned dataset:
# extract mean-vector features, score a logistic regression with 20-fold
# cross-validation, then report per-dataset and overall results.
count = 0  # 1-based index of the dataset currently being processed
fail = 0   # number of datasets that raised an exception

f1_list = []
acc_list = []

for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = fasttext(i, 1, 100)
        fe_time = time.time()

        # prepare classifier
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Run the cross-validation once and collect both metrics from the same
        # fitted folds. 'f1_micro' is reported as accuracy: for single-label
        # classification micro-averaged F1 coincides with accuracy.
        scores = cross_validate(logRes, result_fe, label, scoring=['f1_micro', 'f1_macro'], cv=cv, n_jobs=-1)
        acc_score = scores['test_f1_micro']
        f1_score = scores['test_f1_macro']
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.884 (0.02327)
F1-Score: 0.8836286766905289 (0.02341)
FE Time: 2.0284500122070312 seconds
CL time: 3.727189779281616 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7040833333333334 (0.09327)
F1-Score: 0.6791125235862782 (0.09369)
FE Time: 1.6839265823364258 seconds
CL time: 1.3252437114715576 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6698684210526317 (0.09775)
F1-Score: 0.6622029127283031 (0.09983)
FE Time: 1.5437488555908203 seconds
CL time: 0.11400008201599121 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.65 (0.08803)
F1-Score: 0.6343898799292023 (0.08883)
FE Time: 1.5251226425170898 seconds
CL time: 0.08194732666015625 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6385714285714285 (0.10138)
F1-Score: 0.6172372166876811 (0.10394)
FE Time:

In [7]:
# Evaluate CBOW FastText features (200 dimensions) on every cleaned dataset:
# extract mean-vector features, score a logistic regression with 20-fold
# cross-validation, then report per-dataset and overall results.
count = 0  # 1-based index of the dataset currently being processed
fail = 0   # number of datasets that raised an exception

f1_list = []
acc_list = []

for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = fasttext(i, 1, 200)
        fe_time = time.time()

        # prepare classifier
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Run the cross-validation once and collect both metrics from the same
        # fitted folds. 'f1_micro' is reported as accuracy: for single-label
        # classification micro-averaged F1 coincides with accuracy.
        scores = cross_validate(logRes, result_fe, label, scoring=['f1_micro', 'f1_macro'], cv=cv, n_jobs=-1)
        acc_score = scores['test_f1_micro']
        f1_score = scores['test_f1_macro']
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.8707499999999999 (0.02141)
F1-Score: 0.8702715761174629 (0.02160)
FE Time: 2.2749826908111572 seconds
CL time: 1.540588617324829 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6738333333333332 (0.10405)
F1-Score: 0.641370307405482 (0.10332)
FE Time: 1.898468017578125 seconds
CL time: 0.1630692481994629 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6648684210526317 (0.07680)
F1-Score: 0.6565203529208612 (0.07952)
FE Time: 1.9077026844024658 seconds
CL time: 0.0914311408996582 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6499999999999999 (0.08944)
F1-Score: 0.6357408869695826 (0.08983)
FE Time: 1.9288592338562012 seconds
CL time: 0.1035315990447998 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6485714285714285 (0.11454)
F1-Score: 0.626583264087

In [8]:
# Evaluate CBOW FastText features (300 dimensions) on every cleaned dataset:
# extract mean-vector features, score a logistic regression with 20-fold
# cross-validation, then report per-dataset and overall results.
count = 0  # 1-based index of the dataset currently being processed
fail = 0   # number of datasets that raised an exception

f1_list = []
acc_list = []

for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = fasttext(i, 1, 300)
        fe_time = time.time()

        # prepare classifier
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Run the cross-validation once and collect both metrics from the same
        # fitted folds. 'f1_micro' is reported as accuracy: for single-label
        # classification micro-averaged F1 coincides with accuracy.
        scores = cross_validate(logRes, result_fe, label, scoring=['f1_micro', 'f1_macro'], cv=cv, n_jobs=-1)
        acc_score = scores['test_f1_micro']
        f1_score = scores['test_f1_macro']
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.8652500000000002 (0.02112)
F1-Score: 0.8647581119438057 (0.02131)
FE Time: 2.5605292320251465 seconds
CL time: 1.877558946609497 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.6839166666666667 (0.09381)
F1-Score: 0.6537261160671803 (0.09206)
FE Time: 2.214425802230835 seconds
CL time: 0.332561731338501 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.6872368421052631 (0.08626)
F1-Score: 0.677888385333028 (0.08944)
FE Time: 2.3113155364990234 seconds
CL time: 0.12095904350280762 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6275000000000001 (0.10894)
F1-Score: 0.6128067641566095 (0.11116)
FE Time: 2.220977544784546 seconds
CL time: 0.1162419319152832 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6116666666666666 (0.10626)
F1-Score: 0.5896636838393

In [9]:
# Evaluate skip-gram FastText features (100 dimensions) on every cleaned
# dataset: extract mean-vector features, score a logistic regression with
# 20-fold cross-validation, then report per-dataset and overall results.
count = 0  # 1-based index of the dataset currently being processed
fail = 0   # number of datasets that raised an exception

f1_list = []
acc_list = []

for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = fasttext(i, 0, 100)
        fe_time = time.time()

        # prepare classifier
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Run the cross-validation once and collect both metrics from the same
        # fitted folds. 'f1_micro' is reported as accuracy: for single-label
        # classification micro-averaged F1 coincides with accuracy.
        scores = cross_validate(logRes, result_fe, label, scoring=['f1_micro', 'f1_macro'], cv=cv, n_jobs=-1)
        acc_score = scores['test_f1_micro']
        f1_score = scores['test_f1_macro']
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9215 (0.02068)
F1-Score: 0.9211872346590774 (0.02067)
FE Time: 4.88828182220459 seconds
CL time: 0.5541396141052246 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7785000000000001 (0.08871)
F1-Score: 0.7640348040393699 (0.08589)
FE Time: 1.5918693542480469 seconds
CL time: 0.10843610763549805 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7885526315789473 (0.09433)
F1-Score: 0.7821570502697673 (0.09920)
FE Time: 1.531951904296875 seconds
CL time: 0.08773493766784668 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7649999999999999 (0.09887)
F1-Score: 0.7484482360707052 (0.10656)
FE Time: 1.559178113937378 seconds
CL time: 0.08311772346496582 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6754761904761903 (0.10190)
F1-Score: 0.6573195438462467 (0.10

In [10]:
# Evaluate skip-gram FastText features (200 dimensions) on every cleaned
# dataset: extract mean-vector features, score a logistic regression with
# 20-fold cross-validation, then report per-dataset and overall results.
count = 0  # 1-based index of the dataset currently being processed
fail = 0   # number of datasets that raised an exception

f1_list = []
acc_list = []

for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = fasttext(i, 0, 200)
        fe_time = time.time()

        # prepare classifier
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Run the cross-validation once and collect both metrics from the same
        # fitted folds. 'f1_micro' is reported as accuracy: for single-label
        # classification micro-averaged F1 coincides with accuracy.
        scores = cross_validate(logRes, result_fe, label, scoring=['f1_micro', 'f1_macro'], cv=cv, n_jobs=-1)
        acc_score = scores['test_f1_micro']
        f1_score = scores['test_f1_macro']
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9247500000000001 (0.01984)
F1-Score: 0.9244556463330736 (0.01988)
FE Time: 7.846626043319702 seconds
CL time: 0.9717304706573486 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7863333333333334 (0.08502)
F1-Score: 0.7716697897281731 (0.08272)
FE Time: 1.9224433898925781 seconds
CL time: 0.14189815521240234 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7684210526315789 (0.09540)
F1-Score: 0.7633661758647119 (0.09944)
FE Time: 1.8662350177764893 seconds
CL time: 0.09313297271728516 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7625000000000001 (0.09203)
F1-Score: 0.7470691195269908 (0.09714)
FE Time: 1.9394652843475342 seconds
CL time: 0.10609221458435059 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6585714285714286 (0.12560)
F1-Score: 0.6367285

In [6]:
# Evaluate skip-gram FastText features (300 dimensions) on every cleaned
# dataset: extract mean-vector features, score a logistic regression with
# 20-fold cross-validation, then report per-dataset and overall results.
count = 0  # 1-based index of the dataset currently being processed
fail = 0   # number of datasets that raised an exception

f1_list = []
acc_list = []

for count, (dataset_name, i) in enumerate(zip(clean, DATA_CLEAN), start=1):
    try:
        print("Running {}/{} ({})".format(count, len_clean, dataset_name))
        start_time = time.time()

        # Feature Extraction
        result_fe, label = fasttext(i, 0, 300)
        fe_time = time.time()

        # prepare classifier
        cv = KFold(n_splits=20, random_state=1, shuffle=True)
        logRes = LogisticRegression(random_state=1, max_iter=10000)

        # Run the cross-validation once and collect both metrics from the same
        # fitted folds. 'f1_micro' is reported as accuracy: for single-label
        # classification micro-averaged F1 coincides with accuracy.
        scores = cross_validate(logRes, result_fe, label, scoring=['f1_micro', 'f1_macro'], cv=cv, n_jobs=-1)
        acc_score = scores['test_f1_micro']
        f1_score = scores['test_f1_macro']
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: report the failing dataset and continue with the rest.
        fail += 1
        print(i, ": ", e)
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Every dataset failed (or none was found): there is nothing to average.
    print("Overall Accuracy = n/a (no dataset completed)")
    print("Overall F1-Score = n/a (no dataset completed)")
print("Failed = {}/{}".format(fail, len_clean))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.9262499999999999 (0.01987)
F1-Score: 0.9259578155960153 (0.01991)
FE Time: 2.810903787612915 seconds
CL time: 3.25498104095459 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.7845 (0.07145)
F1-Score: 0.7675849666578944 (0.07294)
FE Time: 2.697056293487549 seconds
CL time: 1.526374101638794 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.7782894736842105 (0.10110)
F1-Score: 0.7727149923277093 (0.10540)
FE Time: 2.4916555881500244 seconds
CL time: 0.12821578979492188 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.7474999999999999 (0.09010)
F1-Score: 0.7304692941754829 (0.09352)
FE Time: 2.3809573650360107 seconds
CL time: 0.12139534950256348 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.6590476190476189 (0.10574)
F1-Score: 0.6398311906961288 (0.1012