In [1]:
import os
import h5py
import time
import datetime
import warnings
import numpy as np
from nltk.util import usage

from featureExtraction.orthography.orthography import *

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, cross_validate

In [2]:
# Every entry found in the raw dataset folder (unfiltered, in listing order).
files_templated = os.listdir("../Dataset/Raw")

# Names of the CSV files only (extension matched case-insensitively), sorted.
templated = sorted(name for name in files_templated if name.lower().endswith(".csv"))

# Relative path of each CSV file. Prefixing every name with the same folder
# keeps the sorted order, so DATA_TEMPLATED[k] is the path of templated[k].
DATA_TEMPLATED = ["../Dataset/Raw/" + name for name in templated]
len_templated = len(DATA_TEMPLATED)

In [3]:
# Silence every warning raised in this process from here on.
warnings.filterwarnings("ignore")

# Progress bookkeeping: `count` is the 1-based position of the dataset being
# processed, `fail` the number of datasets that raised an exception.
count, fail = 1, 0

# One mean cross-validation score per successfully evaluated dataset.
f1_list, acc_list = [], []

In [4]:
# Evaluate the orthography features of every dataset in DATA_TEMPLATED with a
# 20-fold cross-validated logistic regression, print per-dataset scores and
# timings, then print the scores averaged over all datasets that succeeded.

# Start from a clean slate so that re-running this cell on its own neither
# appends to results left over from a previous run nor continues an old count.
count = 1
fail = 0
f1_list = []
acc_list = []

# The splitter and the unfitted estimator are the same for every dataset, so
# they are built once. The fixed random_state keeps the folds reproducible.
cv = KFold(n_splits=20, random_state=1, shuffle=True)
logRes = LogisticRegression(random_state=1, max_iter=10000)

for i in DATA_TEMPLATED:
    # The display name is taken from the path itself instead of indexing a
    # parallel list with `count`, which breaks as soon as the two drift apart.
    print("Running {}/{} ({})".format(count, len_templated, os.path.basename(i)))
    try:
        start_time = time.time()

        # Feature Extraction
        result_fe, label, feat_name = run_ortografi(i)
        fe_time = time.time()

        # Score both metrics in a single cross-validation pass so each fold is
        # fitted once instead of twice. Micro-averaged F1 is what is reported
        # as "Accuracy"; for single-label targets the two coincide.
        cv_result = cross_validate(
            logRes,
            result_fe,
            label,
            scoring={'acc': 'f1_micro', 'f1': 'f1_macro'},
            cv=cv,
            n_jobs=-1,
        )
        acc_score = cv_result['test_acc']
        f1_score = cv_result['test_f1']
        cl_time = time.time()

        print('Accuracy: {} ({:.5f})'.format(np.mean(acc_score), np.std(acc_score)))
        print('F1-Score: {} ({:.5f})'.format(np.mean(f1_score), np.std(f1_score)))
        print('FE Time: {} seconds'.format(fe_time - start_time))
        print('CL time: {} seconds'.format(cl_time - fe_time))

        f1_list.append(np.mean(f1_score))
        acc_list.append(np.mean(acc_score))
    except Exception as e:
        # Best effort: one broken dataset must not abort the whole run. The
        # failure is reported and counted, then the loop moves on.
        fail += 1
        print(i, ": ", e)
    finally:
        # Advance the progress counter exactly once per dataset, whatever happened.
        count += 1
    print("================================================")

print("================================================")
if acc_list:
    print("Overall Accuracy = {} ({})".format(sum(acc_list)/len(acc_list), np.std(acc_list)))
    print("Overall F1-Score = {} ({})".format(sum(f1_list)/len(f1_list), np.std(f1_list)))
else:
    # Nothing succeeded (or no dataset was found): averaging would divide by zero.
    print("Overall Accuracy = n/a (no dataset was evaluated successfully)")
    print("Overall F1-Score = n/a (no dataset was evaluated successfully)")
print("Failed = {}/{}".format(fail, len_templated))

Running 1/13 (templated_posneg_Dataset_C_HT_4000.csv)
Accuracy: 0.7297499999999999 (0.02003)
F1-Score: 0.7286777767087915 (0.01980)
FE Time: 0.058710336685180664 seconds
CL time: 2.067321300506592 seconds
Running 2/13 (templated_posneg_KicauanBerlabel.csv)
Accuracy: 0.632 (0.07222)
F1-Score: 0.5845416892987776 (0.08826)
FE Time: 0.01641249656677246 seconds
CL time: 1.054654598236084 seconds
Running 3/13 (templated_posneg_dataset_komentar_instagram_cyberbullying.csv)
Accuracy: 0.5850000000000001 (0.13143)
F1-Score: 0.5733409171418662 (0.13033)
FE Time: 0.011767387390136719 seconds
CL time: 0.061042070388793945 seconds
Running 4/13 (templated_posneg_dataset_tweet_sentimen_tayangan_tv.csv)
Accuracy: 0.6 (0.09618)
F1-Score: 0.5887265639734849 (0.10007)
FE Time: 0.008398771286010742 seconds
CL time: 0.05661821365356445 seconds
Running 5/13 (templated_posneg_dataset_tweet_sentiment_cellular_service_provider.csv)
Accuracy: 0.5199999999999999 (0.12754)
F1-Score: 0.4069953581252208 (0.11412)
FE