### **ENVIRONMENT SETUP**

In [2]:
# ! rm -r data*
# ! wget http://argumentation.bplaced.net/arguana-data/dagstuhl-15512-argquality-corpus-v2.zip
# ! unzip dagstuhl-15512-argquality-corpus-v2.zip
# ! rm *.zip
# ! rm -r __MACOSX
# ! mv dagstuhl-15512-argquality-corpus-v2 data

### **IMPORT LIBRARIES**

In [3]:
# Set random seed

import random
random.seed(14071)

In [5]:
import re
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer # Bag of Words
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF

from sklearn.preprocessing import LabelEncoder # Label Encoding
from sklearn.preprocessing import OneHotEncoder # One Hot Encoding

from sklearn.model_selection import train_test_split # Train Test Split

from sklearn.linear_model import LogisticRegression # LR Model
from sklearn.tree import DecisionTreeClassifier # DT Model
from sklearn.ensemble import RandomForestClassifier # RF Model

import tensorflow.keras as keras # Keras bindings
import tensorflow as tf # Tensorflow bindings

from sklearn.metrics import classification_report # Classification Report

from sklearn.model_selection import GridSearchCV # Grid Search

[nltk_data] Downloading package stopwords to /home/sri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/sri/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### **IMPORT DATA**

In [6]:
df = pd.read_csv("data/dagstuhl-15512-argquality-corpus-annotated.csv", sep='\t', encoding_errors="ignore")

In [7]:
df

Unnamed: 0,annotator,argumentative,overall quality,local acceptability,appropriateness,arrangement,clarity,cogency,effectiveness,global acceptability,...,global sufficiency,reasonableness,local relevance,credibility,emotional appeal,sufficiency,argument,#id,issue,stance
0,1,y,1 (Low),1 (Low),1 (Low),1 (Low),2 (Average),1 (Low),1 (Low),1 (Low),...,1 (Low),1 (Low),1 (Low),1 (Low),1 (Low),1 (Low),"it is true that bottled water is a waste, but ...",arg219250,ban-plastic-water-bottles,no-bad-for-the-economy
1,2,y,1 (Low),3 (High),2 (Average),2 (Average),3 (High),1 (Low),1 (Low),3 (High),...,1 (Low),2 (Average),2 (Average),2 (Average),2 (Average),1 (Low),"it is true that bottled water is a waste, but ...",arg219250,ban-plastic-water-bottles,no-bad-for-the-economy
2,3,y,2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),...,2 (Average),2 (Average),3 (High),2 (Average),1 (Low),2 (Average),"it is true that bottled water is a waste, but ...",arg219250,ban-plastic-water-bottles,no-bad-for-the-economy
3,1,y,2 (Average),3 (High),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),...,2 (Average),2 (Average),3 (High),3 (High),2 (Average),2 (Average),Most Americans on average recycle 86-88% of th...,arg219293,ban-plastic-water-bottles,no-bad-for-the-economy
4,2,y,1 (Low),2 (Average),1 (Low),2 (Average),2 (Average),1 (Low),1 (Low),2 (Average),...,1 (Low),1 (Low),2 (Average),2 (Average),2 (Average),1 (Low),Most Americans on average recycle 86-88% of th...,arg219293,ban-plastic-water-bottles,no-bad-for-the-economy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,2,y,2 (Average),2 (Average),3 (High),2 (Average),2 (Average),1 (Low),1 (Low),3 (High),...,1 (Low),2 (Average),2 (Average),2 (Average),2 (Average),1 (Low),Raffles neglected Singapore when he went aroun...,arg168822,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course
956,3,y,2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),...,2 (Average),2 (Average),3 (High),2 (Average),2 (Average),2 (Average),Raffles neglected Singapore when he went aroun...,arg168822,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course
957,1,y,2 (Average),2 (Average),2 (Average),2 (Average),1 (Low),2 (Average),2 (Average),2 (Average),...,2 (Average),2 (Average),3 (High),2 (Average),2 (Average),2 (Average),"Raffles doesn't care about the citizens, doesn...",arg168834,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course
958,2,y,2 (Average),2 (Average),3 (High),2 (Average),3 (High),1 (Low),2 (Average),2 (Average),...,1 (Low),2 (Average),2 (Average),2 (Average),3 (High),1 (Low),"Raffles doesn't care about the citizens, doesn...",arg168834,william-farquhar-ought-to-be-honoured-as-the-r...,yes-of-course


In [8]:
print(f"Number of annotations = {len(df['argument'])}")
print(f"Number of unique arguements = {len(np.unique(df['argument']))}") # Each argument was scored by 3 annotators
print(f"Number of unique issue = {len(np.unique(df['issue']))}")  # There are a total of 16 issues
print(f"Number of unique stance = {len(np.unique(df['stance']))}") # Each issue has on an avg 2 stance (positive and negative)

Number of annotations = 960
Number of unique arguements = 320
Number of unique issue = 16
Number of unique stance = 28


### **DATA CLEANING**

#### Remove statements are that are tagged as NOT argumentative

In [9]:
df[df["argumentative"] == "n"] # Statements that are tagged as NOT argumentative

Unnamed: 0,annotator,argumentative,overall quality,local acceptability,appropriateness,arrangement,clarity,cogency,effectiveness,global acceptability,...,global sufficiency,reasonableness,local relevance,credibility,emotional appeal,sufficiency,argument,#id,issue,stance
25,2,n,,,,,,,,,...,,,,,,,We will be able to ban water bottles until we ...,arg219242,ban-plastic-water-bottles,no-bad-for-the-economy
32,3,n,,,,,,,,,...,,,,,,,The high price of bottled water is not the wat...,arg219232,ban-plastic-water-bottles,yes-emergencies-only
37,2,n,,,,,,,,,...,,,,,,,A drop of water is worth more than a sack of g...,arg219210,ban-plastic-water-bottles,yes-emergencies-only
51,1,n,,,,,,,,,...,,,,,,,Yeah I have a bottle of water next to me its n...,arg219292,ban-plastic-water-bottles,yes-emergencies-only
52,2,n,,,,,,,,,...,,,,,,,Yeah I have a bottle of water next to me its n...,arg219292,ban-plastic-water-bottles,yes-emergencies-only
53,3,n,,,,,,,,,...,,,,,,,Yeah I have a bottle of water next to me its n...,arg219292,ban-plastic-water-bottles,yes-emergencies-only
97,2,n,,,,,,,,,...,,,,,,,This is just wrong we should not insult who we...,arg236317,christianity-or-atheism,christianity
104,3,n,,,,,,,,,...,,,,,,,I have a personal relationship with Christ. I ...,arg317490,christianity-or-atheism,christianity
105,1,n,,,,,,,,,...,,,,,,,God helps those who help themselves! So i will...,arg234318,christianity-or-atheism,christianity
106,2,n,,,,,,,,,...,,,,,,,God helps those who help themselves! So i will...,arg234318,christianity-or-atheism,christianity


In [10]:
statements = df[df["argumentative"] == "n"]["argument"].to_numpy() # Extract the statements
statements = np.unique(statements) # Extract the unique statements

for ele in statements: # Remove all occurrences of NOT argumentative statements
    df.drop(df[df['argument'] == ele].index, axis=0, inplace=True)

In [11]:
df[df["argumentative"] == "n"]

Unnamed: 0,annotator,argumentative,overall quality,local acceptability,appropriateness,arrangement,clarity,cogency,effectiveness,global acceptability,...,global sufficiency,reasonableness,local relevance,credibility,emotional appeal,sufficiency,argument,#id,issue,stance


In [12]:
print(f"Number of annotations = {len(df['argument'])}")
print(f"Number of unique arguements = {len(np.unique(df['argument']))}") # Each argument was scored by 3 annotators
print(f"Number of unique issue = {len(np.unique(df['issue']))}")  # There are a total of 16 issues
print(f"Number of unique stance = {len(np.unique(df['stance']))}") # Each issue has on an avg 2 stance (positive and negative)

Number of annotations = 912
Number of unique arguements = 304
Number of unique issue = 16
Number of unique stance = 28


#### Combine all Annotators' scores into a single score

In [13]:
argument = np.unique(df["argument"])

In [14]:
attributes = ["annotator", "overall quality", "cogency", "effectiveness", "reasonableness", "argument", "#id"]

cleaned_df = []

for arg in argument:

    new_df = df[df["argument"] == arg][attributes]
    flag = 0
    new_dict = {
        "#id": new_df["#id"].iloc[0],
        "argument": new_df["argument"].iloc[0],
    }

    for ele in ["overall quality", "cogency", "effectiveness", "reasonableness"]:
        if len(pd.value_counts(new_df[ele])) == 3:
            flag = 1
            break
        new_dict[ele] = pd.value_counts(new_df[ele]).index[0]
        
    if flag == 1:
        continue
    cleaned_df.append(new_dict)

cleaned_df = pd.DataFrame(cleaned_df)

In [15]:
df[df["#id"] == "28068"]

Unnamed: 0,annotator,argumentative,overall quality,local acceptability,appropriateness,arrangement,clarity,cogency,effectiveness,global acceptability,...,global sufficiency,reasonableness,local relevance,credibility,emotional appeal,sufficiency,argument,#id,issue,stance
144,1,y,2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),1 (Low),1 (Low),1 (Low),...,1 (Low),1 (Low),3 (High),1 (Low),2 (Average),1 (Low),"""Debates are based on convincing evidence. The...",28068,evolution-vs-creation,creation
145,2,y,1 (Low),1 (Low),3 (High),2 (Average),2 (Average),1 (Low),1 (Low),1 (Low),...,1 (Low),1 (Low),2 (Average),1 (Low),2 (Average),1 (Low),"""Debates are based on convincing evidence. The...",28068,evolution-vs-creation,creation
146,3,y,2 (Average),2 (Average),2 (Average),2 (Average),3 (High),2 (Average),1 (Low),2 (Average),...,1 (Low),1 (Low),2 (Average),1 (Low),2 (Average),2 (Average),"""Debates are based on convincing evidence. The...",28068,evolution-vs-creation,creation


In [16]:
cleaned_df

Unnamed: 0,#id,argument,overall quality,cogency,effectiveness,reasonableness
0,28068,"""Debates are based on convincing evidence. The...",2 (Average),1 (Low),1 (Low),1 (Low)
1,13270,"""If a women is raped"" is a good argument. Howe...",1 (Low),1 (Low),1 (Low),1 (Low)
2,13275,"""The government has no place to tell a woman w...",1 (Low),1 (Low),1 (Low),1 (Low)
3,12365,(I am writing this through Firefox) Emotions a...,2 (Average),1 (Low),1 (Low),2 (Average)
4,arg561672,1. It makes everyone equal - if children can w...,1 (Low),2 (Average),1 (Low),2 (Average)
...,...,...,...,...,...,...
256,arg334959,"yea, because even though there are many other ...",2 (Average),2 (Average),1 (Low),2 (Average)
257,arg335089,yes because if they fear getting hit than they...,1 (Low),1 (Low),1 (Low),1 (Low)
258,arg203922,"yes, i believe it's nice to have a school unif...",2 (Average),2 (Average),1 (Low),2 (Average)
259,arg596217,"yes,India has potential to lead the world.So, ...",1 (Low),1 (Low),1 (Low),1 (Low)


In [17]:
print(f"Number of arguements = {len(cleaned_df['argument'])}")

Number of arguements = 261


### **DATA PREPROCESSING**

In [18]:
text = cleaned_df["argument"]

In [19]:
stop_words = set(stopwords.words("english"))
english_stopwords = stopwords.words("english")
english_stemmer = SnowballStemmer("english")

In [20]:
def clean_text(text):
    text = text.replace('</br>', '') # Remove </br>
    text = re.sub(r'[^\w]', ' ', text) # Remove symbols
    text = re.sub(r'[ ]{2,}', ' ', text) # Remove extra spaces
    text = re.sub(r'[ \t]+$', '', text) # Remove trailing white spaces
    tokens = []
    for token in text.split():
        if token not in stop_words:
            token = english_stemmer.stem(token)
            tokens.append(token)
    return " ".join(tokens)
    #return token

In [21]:
cleaned_text = [clean_text(text) for text in text]
text = cleaned_text

### **VECTORIZE THE TEXT DATA**

In [22]:
# Using Bag of Words (BoW)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text)
X = X.toarray()

print(f"Shape of Vector = {X.shape}")

Shape of Vector = (261, 2001)


In [23]:
# Extracting the attributes

cogency = cleaned_df["cogency"].to_numpy()
effectiveness = cleaned_df["effectiveness"].to_numpy()
reasonableness = cleaned_df["reasonableness"].to_numpy()

In [24]:
attr_enc_map = {
    "1 (Low)": np.array([0, 0, 1]),
    "2 (Average)": np.array([0, 1, 0]),
    "3 (High)": np.array([1, 0, 0]),
}

def encode(array):
    temp_list = []
    for ele in array:
        temp_list.append(attr_enc_map[ele])
    return np.array(temp_list)

In [25]:
cogency = encode(cogency)
effectiveness = encode(effectiveness)
reasonableness = encode(reasonableness)

In [26]:
# Adding the attributes to text vector

X_new = []

for idx, x in enumerate(X):
    temp = np.concatenate((cogency[idx], effectiveness[idx], reasonableness[idx], x))
    X_new.append(temp)

X = np.array(X_new)

print(f"Shape of Vector = {X.shape}")

Shape of Vector = (261, 2010)


### **PREDICTING OVERALL QUALITY FROM TEXT+ATTRIBUTES**

In [27]:
y = cleaned_df["overall quality"].to_numpy()
y = y.reshape(-1, 1)

#### Label Encoding

In [28]:
# Label Encoding

encoder = LabelEncoder()
enc_y = encoder.fit_transform(y)

print(f"Size of Labels = {enc_y.shape}")

Size of Labels = (261,)


  y = column_or_1d(y, warn=True)


#### Train Test Split

In [30]:
X_train, X_test, y_train, y_test= train_test_split(X, enc_y, test_size=0.2, random_state=110)

print(f"Shape of Training Data: {X_train.shape}")
print(f"Shape of Training Labels: {y_train.shape}")
print(f"Shape of Testing Data: {X_test.shape}")
print(f"Shape of Testing Labels: {y_test.shape}")

Shape of Training Data: (208, 2010)
Shape of Training Labels: (208,)
Shape of Testing Data: (53, 2010)
Shape of Testing Labels: (53,)


#### Training Logistic Regression Model

In [31]:
model = LogisticRegression(C=0.1, dual=False, fit_intercept=True, penalty="l2", solver="newton-cg") # BoW
# model = LogisticRegression(C=0.01, dual=True, fit_intercept=False, penalty="l2", solver="liblinear") # TF-IDF
model.fit(X=X_train, y=y_train)

pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

print(f"Training Accuracy = {classification_report(y_train, pred_train, output_dict=True)['accuracy']}")
print(f"Testing Accuracy = {classification_report(y_test, pred_test, output_dict=True)['accuracy']}")

Training Accuracy = 1.0
Testing Accuracy = 0.9433962264150944


In [32]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94        26
           1       0.96      0.96      0.96        25
           2       1.00      0.50      0.67         2

    accuracy                           0.94        53
   macro avg       0.96      0.81      0.86        53
weighted avg       0.94      0.94      0.94        53



In [33]:
# # Grid Search

# parameters = {
#     "penalty": ["l1", "l2", "elasticnet", "none"],
#     "dual": [True, False],
#     "C": [1, 0.1, 0.01],
#     "fit_intercept": [True, False],
#     "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
# }

# model = LogisticRegression()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# grid_search.best_params_

#### Training Decision Tree Model

In [34]:
model = DecisionTreeClassifier(criterion="gini", max_features="sqrt", splitter="random") # BoW
# model = DecisionTreeClassifier(criterion="entropy", max_features="auto", splitter="random") # TF-IDF
model.fit(X=X_train, y=y_train)

pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

print(f"Training Accuracy = {classification_report(y_train, pred_train, output_dict=True)['accuracy']}")
print(f"Testing Accuracy = {classification_report(y_test, pred_test, output_dict=True)['accuracy']}")

Training Accuracy = 1.0
Testing Accuracy = 0.7735849056603774


In [35]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83        26
           1       0.78      0.72      0.75        25
           2       0.33      0.50      0.40         2

    accuracy                           0.77        53
   macro avg       0.64      0.69      0.66        53
weighted avg       0.78      0.77      0.78        53



In [36]:
# # Grid Search

# parameters = {
#     "criterion": ["gini", "entropy", "log_loss"],
#     "splitter": ["best", "random"],
#     "max_features": ["auto", "sqrt", "log2", None],
# }

# model = DecisionTreeClassifier()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# grid_search.best_params_

#### Training Random Forest Model

In [37]:
model = RandomForestClassifier() # BoW
# model = RandomForestClassifier(bootstrap=False, class_weight=None, 
#     criterion="entropy", max_features="sqrt", n_estimators=100, oob_score=False, warm_start=False) # TF-IDF
model.fit(X=X_train, y=y_train)

pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

print(f"Training Accuracy = {classification_report(y_train, pred_train, output_dict=True)['accuracy']}")
print(f"Testing Accuracy = {classification_report(y_test, pred_test, output_dict=True)['accuracy']}")

Training Accuracy = 1.0
Testing Accuracy = 0.9622641509433962


In [38]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96        26
           1       0.96      0.96      0.96        25
           2       1.00      1.00      1.00         2

    accuracy                           0.96        53
   macro avg       0.97      0.97      0.97        53
weighted avg       0.96      0.96      0.96        53



In [39]:
# # Grid Search

# parameters = {
#     "n_estimators": [100, 200, 300],
#     "criterion": ["gini", "entropy", "log_loss"],
#     "max_features": ["auto", "sqrt", "log2", None],
#     "bootstrap": [True, False],
#     "oob_score": [True, False],
#     "warm_start": [True, False],
#     "class_weight": ["balanced", "balanced_subsample", None],
# }

# model = RandomForestClassifier()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# grid_search.best_params_

#### Training Neural Network

In [40]:
# One Hot Encoding

encoder = OneHotEncoder()
enc_y = encoder.fit_transform(y)
enc_y = enc_y.toarray()

print(f"Size of Labels = {enc_y.shape}")
print(f"Label Sample: {enc_y[0]}")

Size of Labels = (261, 3)
Label Sample: [0. 1. 0.]


In [41]:
# Train Test Split

X_train, X_test, y_train, y_test= train_test_split(X, enc_y, test_size=0.2, random_state=110)

print(f"Shape of Training Data: {X_train.shape}")
print(f"Shape of Training Labels: {y_train.shape}")
print(f"Shape of Testing Data: {X_test.shape}")
print(f"Shape of Testing Labels: {y_test.shape}")

Shape of Training Data: (208, 2010)
Shape of Training Labels: (208, 3)
Shape of Testing Data: (53, 2010)
Shape of Testing Labels: (53, 3)


In [42]:
# Define Custom Callback

class MyThresholdCallback(tf.keras.callbacks.Callback):
    def __init__(self, threshold):
        super(MyThresholdCallback, self).__init__()
        self.threshold = threshold
 
    def on_epoch_end(self, epoch, logs=None): 
        val_acc = logs["val_accuracy"]
        if val_acc >= self.threshold:
            self.model.stop_training = True

In [47]:
# Define Model
model = keras.models.Sequential([
    keras.layers.Dense(32, input_dim=X_train.shape[1], activation="relu"),
    keras.layers.Dropout(0.6),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.6),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(3, activation="softmax"),
])

loss_function = keras.losses.CategoricalCrossentropy() # Define loss function
# loss_function = keras.losses.SparseCategoricalCrossentropy() # Define loss function
optimizer = keras.optimizers.SGD(learning_rate=0.005) # Define optimizer

callback = MyThresholdCallback(threshold=0.94) # Set callback at 94% accuracy

model.compile(optimizer=optimizer, loss=loss_function, metrics=["accuracy"]) # Compile the model

In [48]:
# Train the model

history = model.fit(X_train, y_train, epochs=100, batch_size=1, 
    validation_data=(X_test, y_test), callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


In [49]:
pred_train = [np.argmax(ele) for ele in model.predict(X_train)]
pred_test = [np.argmax(ele) for ele in model.predict(X_test)]

new_y_train = [np.argmax(ele) for ele in y_train]
new_y_test = [np.argmax(ele) for ele in y_test]

print(f"Training Accuracy = {classification_report(new_y_train, pred_train, output_dict=True)['accuracy']}")
print(f"Testing Accuracy = {classification_report(new_y_test, pred_test, output_dict=True)['accuracy']}")

Training Accuracy = 0.9951923076923077
Testing Accuracy = 0.9433962264150944


In [50]:
print(classification_report(new_y_test, pred_test))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94        26
           1       0.96      0.96      0.96        25
           2       1.00      0.50      0.67         2

    accuracy                           0.94        53
   macro avg       0.96      0.81      0.86        53
weighted avg       0.94      0.94      0.94        53

