In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import re
from itertools import compress
import time
import random
import math

# Split json file

In [None]:
# Load the preprocessed Debatepedia dump; orient="records" reads a JSON list of row objects.
df = pd.read_json(
    path_or_buf="../data/debatepedia/debatepedia-preprocessed.json",
    orient="records",
)
df

### Drop duplicates

In [None]:
# Keep only the first row seen for every distinct `content` value.
df = df.loc[~df.duplicated(subset=["content"], keep="first")]
df

### Split by topic

In [None]:
# Distinct topic values (pandas returns them in order of first appearance) and their count.
topics = df["topic"].unique()
n = len(topics)
n

In [None]:
# Number of topics that go into the training split: 80% of all topics, rounded down.
n_t = int(n * 0.8)
n_t

In [None]:
# Training topics: the first n_t entries of `topics`.
topics_tr = topics[:n_t]
topics_tr

In [None]:
# Test topics: every entry of `topics` from position n_t onward.
topics_te = topics[n_t:]
topics_te

In [None]:
# Rows whose topic is one of the training topics.
df_tr = df.loc[df["topic"].isin(topics_tr)]
df_tr

In [None]:
# Rows whose topic is one of the test topics.
df_te = df.loc[df["topic"].isin(topics_te)]
df_te

In [None]:
# Write the train and test frames as JSON record lists.
# DataFrame.to_json does not create missing directories, so make sure the
# target folder exists before writing (a fresh checkout may not have it).
out_dir = "../data/debatepedia/k-fold/set1"
os.makedirs(out_dir, exist_ok=True)
df_tr.to_json(os.path.join(out_dir, "debatepedia-preprocessed-train.json"), orient="records")
df_te.to_json(os.path.join(out_dir, "debatepedia-preprocessed-test.json"), orient="records")

# Split kfold

In [None]:
from pathlib import Path
import shutil
from datetime import datetime
import os

In [None]:
def split_kfold(num_fold, index, split_path, src_path=None):
    """
    Copy the sub-folders of a source directory into the train/test layout of one fold.

    The sub-folders are listed in sorted order and cut into ``num_fold``
    contiguous slices. Slice number ``index`` becomes the test set and all
    other folders the training set. The folders are copied (via
    ``copy_folder``) to ``<split_path>/set<index>/test`` and
    ``<split_path>/set<index>/train`` respectively.

    Parameters
    ----------
    num_fold : int
        Total number of folds (5 gives roughly an 80% / 20% split).
    index : int
        1-based number of the fold whose slice is used for testing.
    split_path : str
        Directory under which the ``set<index>`` folder is written.
    src_path : str, optional
        Directory whose sub-folders are split. When omitted, the
        notebook-level ``path`` variable is used, which is what this
        function read before the parameter existed.

    Raises
    ------
    ValueError
        If ``num_fold`` is smaller than 1 or ``index`` is outside
        ``1..num_fold``.
    """
    if num_fold < 1 or not 1 <= index <= num_fold:
        raise ValueError(
            "index must be within 1..num_fold (got index=%s, num_fold=%s)"
            % (index, num_fold)
        )
    if src_path is None:
        # Backward-compatible default: the notebook-level `path` variable.
        src_path = path

    # os.scandir yields entries in arbitrary order; sorting gives every call
    # (one per fold) the same sequence, so the folds cannot overlap.
    folders = sorted(
        entry.path for entry in os.scandir(src_path) if entry.is_dir()
    )
    for f in folders:
        print(f)

    # Integer fold boundaries: consecutive folds share a boundary, so every
    # folder lands in exactly one test slice even when the folder count is
    # not a multiple of num_fold, and no float rounding is involved.
    t1 = len(folders) * (index - 1) // num_fold
    t2 = len(folders) * index // num_fold
    train = folders[:t1] + folders[t2:]
    test = folders[t1:t2]
    parent = os.path.join(split_path, "set" + str(index))

    # create train path and test path
    train_path = os.path.join(parent, "train")
    test_path = os.path.join(parent, "test")
    print("train path: " + train_path)
    print("test path: " + test_path)

    print(len(train))
    print(len(test))

    # copy the folders to train path or test path according to the split
    for subset, target in ((train, train_path), (test, test_path)):
        for f in subset:
            # basename instead of split("/") so other path separators work too
            write_path = os.path.join(target, os.path.basename(f))
            copy_folder(f, write_path)

In [None]:
def copy_folder(src, des):
    """
    Recursively copy the directory ``src`` to ``des``.

    Both paths are printed first. A failed copy is reported on stdout and
    not re-raised, so the caller continues with the next folder.
    """
    for location in (src, des):
        print(location)
    try:
        shutil.copytree(src, des)
    except (shutil.Error, OSError) as err:
        # shutil.Error: problems collected while copying the tree;
        # OSError: e.g. the source is missing or the destination already exists.
        print('Directory not copied. Error: %s' % err)

In [None]:
# Directory whose sub-folders are distributed over the folds, and the number of folds.
path, num_fold = "../data/debatepedia/xmi", 5

#### Write to files
# Timestamp of this run; shown here as the cell output.
t = datetime.now()
t

In [None]:
# Name the output folder after the run timestamp (YYYY-MM-DD_HH:MM:SS) and
# place it next to the source directory.
dt = t.strftime("%Y-%m-%d_%H:%M:%S")
parent = str(Path(path).parent)
split_path = os.path.join(parent, dt)
if not os.path.exists(split_path):
    os.mkdir(split_path)
print(split_path)

# Produce set1 .. set<num_fold>, one call per fold.
for i in range(1, num_fold + 1):
    split_kfold(num_fold, i, split_path)

# Scripts to run the whole pipeline
The pipeline covers reading the JSON file, generating XMI, splitting, generating ARFF, and evaluation with Weka.

In [None]:
# scripts/adu_classification.sh
# The shell commands below are kept as comments for reference; this cell executes nothing.
# NOTE(review): `>&1` redirects stdout to itself (a no-op); `2>&1` was presumably
# intended so that stderr is also piped into the log file — confirm.

# ADU Classification
#./scripts/adu_classification.sh >&1 | tee  "output/$(date +"%Y-%m-%d_%T").log"

# ADU 5 fold Validation
#./scripts/adu_classification.sh kfold >&1 | tee  "output/$(date +"%Y-%m-%d_%T").log"

# ADU Random Split Classification
#./scripts/adu_classification.sh random >&1 | tee  "output/$(date +"%Y-%m-%d_%T").log"

# ADU classification

In [None]:
### output from output/2019-05-31_03:42:29.log

Step 1: Split File into Training & Testing
/home/ciso0478/wstud-visit-the-dome-ss19/data/debatepedia/2019-05-31_03:42:29
....ok

Step 2: Use UIMA to convert to XMI files
input directory: /home/ciso0478/wstud-visit-the-dome-ss19/data/debatepedia/2019-05-31_03:42:29
filename: debatepedia-preprocessed_train.json
output directory: /home/ciso0478/wstud-visit-the-dome-ss19/data/debatepedia/2019-05-31_03:42:29/xmi/debatepedia-preprocessed_train
.....................................................................................................................................................................................................................................................................................................................................................................................................filename: debatepedia-preprocessed_test.json
output directory: /home/ciso0478/wstud-visit-the-dome-ss19/data/debatepedia/2019-05-31_03:42:29/xmi/debatepedia-preprocessed_test
...........................................................................done
....ok

Step 3: Generate Feature Files
---------------------------------------------
Processing corpus in the directory 
/home/ciso0478/wstud-visit-the-dome-ss19/data/debatepedia/2019-05-31_03:42:29/xmi/debatepedia-preprocessed_train
---------------------------------------------

Compute feature values on /home/ciso0478/wstud-visit-the-dome-ss19/data/debatepedia/2019-05-31_03:42:29/xmi/debatepedia-preprocessed_train
finished in 21.751s

---------------------------------------------
Processing corpus in the directory 
/home/ciso0478/wstud-visit-the-dome-ss19/data/debatepedia/2019-05-31_03:42:29/xmi/debatepedia-preprocessed_test
---------------------------------------------

Compute feature values on /home/ciso0478/wstud-visit-the-dome-ss19/data/debatepedia/2019-05-31_03:42:29/xmi/debatepedia-preprocessed_test
finished in 4.205s

....ok

Step 4: Use Weka to train classifier
/home/ciso0478/wstud-visit-the-dome-ss19/data/debatepedia/2019-05-31_03:42:29

Time taken to test model on training data: 18.36 seconds

=== Error on training data ===

Correctly Classified Instances       58624               99.9966 %
Incorrectly Classified Instances         2                0.0034 %
Kappa statistic                          0.9999
Mean absolute error                      0.0274
Root mean squared error                  0.0576
Relative absolute error                  5.4715 %
Root relative squared error             11.5226 %
Total Number of Instances            58626     


=== Detailed Accuracy By Class ===

                 TP Rate  FP Rate  Precision  Recall   F-Measure  MCC      ROC Area  PRC Area  Class
                 1.000    0.000    1.000      1.000    1.000      1.000    1.000     1.000     conclusion
                 1.000    0.000    1.000      1.000    1.000      1.000    1.000     1.000     premise
Weighted Avg.    1.000    0.000    1.000      1.000    1.000      1.000    1.000     1.000     


=== Confusion Matrix ===

     a     b   <-- classified as
 29313     0 |     a = conclusion
     2 29311 |     b = premise

Time taken to test model on test data: 2.6 seconds

=== Error on test data ===

Correctly Classified Instances        6916               91.4815 %
Incorrectly Classified Instances       644                8.5185 %
Kappa statistic                          0.7704
Mean absolute error                      0.156 
Root mean squared error                  0.2552
Relative absolute error                 31.2096 %
Root relative squared error             51.036  %
Total Number of Instances             7560     


=== Detailed Accuracy By Class ===

                 TP Rate  FP Rate  Precision  Recall   F-Measure  MCC      ROC Area  PRC Area  Class
                 0.794    0.044    0.862      0.794    0.827      0.772    0.962     0.912     conclusion
                 0.956    0.206    0.931      0.956    0.944      0.772    0.962     0.984     premise
Weighted Avg.    0.915    0.164    0.913      0.915    0.914      0.772    0.962     0.966     


=== Confusion Matrix ===

    a    b   <-- classified as
 1537  398 |    a = conclusion
  246 5379 |    b = premise

....ok


# ADU 5 fold Validation

In [None]:
# output from output/2019-05-31_17:00:22.log

### summary of cross validation

In [None]:
# One value per cross-validation fold (set1 .. set5).
# NOTE(review): presumably the percentage of correctly classified test instances
# transcribed from the run log — confirm against the log file.
set1, set2, set3, set4, set5 = (
    90.5621,
    91.4556,
    90.6409,
    90.5261,
    90.8758,
)

### ADU Random Split Classification

In [None]:
# NOTE(review): presumably the accuracy (%) of the random-split run — confirm against the corresponding log.
91.4919