## Environment Setup

In [26]:
# Download the dataset

!rm arg_quality_rank_30k.csv
!wget "https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip"
!unzip *.zip
!rm *.zip
!rm readme.txt

--2023-04-10 04:58:03--  https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip
Resolving www.research.ibm.com (www.research.ibm.com)... 52.116.220.135
Connecting to www.research.ibm.com (www.research.ibm.com)|52.116.220.135|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip [following]
--2023-04-10 04:58:03--  https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip
Resolving research.ibm.com (research.ibm.com)... 52.116.220.135
Connecting to research.ibm.com (research.ibm.com)|52.116.220.135|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1524714 (1.5M) [application/zip]
Saving to: ‘IBM_Debater_(R)_arg_quality_rank_30k.zip’


2023-04-10 04:58:05 (1.24 MB/s) - ‘IBM_Debater_(R)_arg_quality_rank_30k.zip’ saved [1524714/1524714]

Archive:  IBM_Debater_(R)_arg_quality_rank_

## Import Libraries

In [27]:
import pandas as pd
import numpy as np

import re
import spacy
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.cluster import KMeans

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Separating Using Avg

In [28]:
# Load the argument-quality dataset from the working directory
df = pd.read_csv("./arg_quality_rank_30k.csv")

# The mean of the WA column is the cut-off between the two classes
WA_mean = df["WA"].mean()
print(f"Average WA is: {WA_mean}")

# Binary label: 1 when WA is at or above the mean, 0 otherwise
df["class"] = (df["WA"] >= WA_mean).astype("int64")

Average WA is: 0.7913285945189035


In [29]:
# Split into train and test sets: every row whose "set" is not "test" is
# treated as training data; the "set" column is dropped afterwards.
is_test = df["set"] == "test"
df_train = df[~is_test].reset_index(drop=True).drop(columns=["set"])
df_test = df[is_test].reset_index(drop=True).drop(columns=["set"])

# Display dataset metrics: sizes, topic counts, then per-class counts
class_labels = np.unique(df["class"])

print(f"Length of dataset = {len(df)}")
print(f"Number of training data = {len(df_train)}")
print(f"Number of testing data = {len(df_test)}")
print()
print(f"Number of Topics = {len(np.unique(df.topic))}")
print(f"Number of Topics in training data = {len(np.unique(df_train.topic))}")
print(f"Number of Topics in testing data = {len(np.unique(df_test.topic))}")
print()
print(f"Number of Classes = {len(class_labels)}")
for label in class_labels:
    n_train = int((df_train["class"] == label).sum())
    n_test = int((df_test["class"] == label).sum())
    print(f"Number of Class {label} in training data = {n_train}")
    print(f"Number of Class {label} in testing data = {n_test}")

Length of dataset = 30497
Number of training data = 24182
Number of testing data = 6315

Number of Topics = 71
Number of Topics in training data = 56
Number of Topics in testing data = 15

Number of Classes = 2
Number of Class 0 in training data = 10009
Number of Class 0 in testing data = 2754
Number of Class 1 in training data = 14173
Number of Class 1 in testing data = 3561


In [30]:
# Text cleaning
stop_words = stopwords.words('english')
# clean_text only reads token.pos_ and token.lemma_, so the parser and NER
# pipeline components are disabled when loading the model.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Coarse part-of-speech tags whose lemmas are kept
allowed_postags=["NOUN", "ADJ", "VERB"]

def clean_text(text):
    """Normalise one argument string into a space-joined string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. turn ``</br>`` markers and newlines into spaces;
      3. delete single and double quotes;
      4. replace every other non-word character with a space and tidy spaces;
      5. tokenise with gensim's ``simple_preprocess(deacc=True)``;
      6. drop tokens found in the module-level ``stop_words``;
      7. run the remaining words through ``nlp`` and keep the lemma of each
         token whose ``pos_`` is listed in ``allowed_postags``.

    Args:
        text: the raw argument as a ``str``.

    Returns:
        The kept lemmas joined by single spaces (an empty string when nothing
        survives the filters).
    """
    text = text.lower()
    # Substitute a space (not an empty string) for line breaks so the words on
    # either side of a break are not fused into a single token.
    text = text.replace('</br>', ' ')
    text = text.replace('\n', ' ')
    # Quotes are removed outright, e.g. "don't" becomes "dont".
    text = re.sub(r"['\"]", "", text)
    # Any remaining non-word character becomes a space ...
    text = re.sub(r"\W", " ", text)
    # ... then runs of spaces are collapsed and trailing whitespace trimmed.
    text = re.sub(r'[ ]{2,}', ' ', text)
    text = re.sub(r'[ \t]+$', '', text)

    words = simple_preprocess(str(text), deacc=True)
    kept_words = [word for word in words if word not in stop_words]

    doc = nlp(" ".join(kept_words))
    lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    return " ".join(lemmas)

# Replace the raw argument text of both splits with its cleaned form
for frame in (df_train, df_test):
    frame["argument"] = frame["argument"].apply(clean_text)

In [31]:
# Collect the cleaned texts of each split
train_texts = df_train["argument"].tolist()
test_texts = df_test["argument"].tolist()

# Fit vectorizer on the training texts only, so the vocabulary is not derived
# from the held-out test set
vectorizer = CountVectorizer()
vectorizer = vectorizer.fit(train_texts)

# Transform the text into dense bag-of-words count matrices
X_train = vectorizer.transform(train_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()
# Full matrix: training rows first, then test rows
X = np.vstack([X_train, X_test])

# Target extraction
y_train = np.array(df_train["class"])
y_test = np.array(df_test["class"])
# Build y in the same train-then-test order as X; taking it straight from df
# would follow the original row order and misalign labels with rows of X.
y = np.concatenate([y_train, y_test])

In [32]:
# Fit a baseline logistic regression on the training split, then report
# per-class precision/recall/F1 on the test split
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
history = model  # fit() returns the fitted estimator itself
pred = model.predict(X_test)
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           0       0.50      0.64      0.56      2754
           1       0.65      0.51      0.57      3561

    accuracy                           0.57      6315
   macro avg       0.57      0.58      0.57      6315
weighted avg       0.58      0.57      0.57      6315



## Separating Using Clustering

In [33]:
# Import dataset
df = pd.read_csv("./arg_quality_rank_30k.csv")

# Convert into classes: split the WA scores into two k-means clusters
kmeans = KMeans(n_clusters=2, n_init="auto", max_iter=1000, random_state=431)
X = np.array(df["WA"]).reshape(-1, 1)  # KMeans expects a 2-D (n_samples, 1) array
cluster_ids = kmeans.fit_predict(X)

# K-means cluster ids are arbitrary, so map them explicitly: the cluster with
# the larger centre (higher WA) becomes class 1 and the other one class 0.
# This keeps "1 = higher WA" regardless of which id k-means happens to assign.
high_wa_cluster = int(np.argmax(kmeans.cluster_centers_.ravel()))
df["class"] = (cluster_ids == high_wa_cluster).astype("int64")

In [34]:
# Split into train and test sets: every row whose "set" is not "test" is
# treated as training data; the "set" column is dropped afterwards.
is_test = df["set"] == "test"
df_train = df[~is_test].reset_index(drop=True).drop(columns=["set"])
df_test = df[is_test].reset_index(drop=True).drop(columns=["set"])

# Display dataset metrics: sizes, topic counts, then per-class counts
class_labels = np.unique(df["class"])

print(f"Length of dataset = {len(df)}")
print(f"Number of training data = {len(df_train)}")
print(f"Number of testing data = {len(df_test)}")
print()
print(f"Number of Topics = {len(np.unique(df.topic))}")
print(f"Number of Topics in training data = {len(np.unique(df_train.topic))}")
print(f"Number of Topics in testing data = {len(np.unique(df_test.topic))}")
print()
print(f"Number of Classes = {len(class_labels)}")
for label in class_labels:
    n_train = int((df_train["class"] == label).sum())
    n_test = int((df_test["class"] == label).sum())
    print(f"Number of Class {label} in training data = {n_train}")
    print(f"Number of Class {label} in testing data = {n_test}")

Length of dataset = 30497
Number of training data = 24182
Number of testing data = 6315

Number of Topics = 71
Number of Topics in training data = 56
Number of Topics in testing data = 15

Number of Classes = 2
Number of Class 0 in training data = 6987
Number of Class 0 in testing data = 1979
Number of Class 1 in training data = 17195
Number of Class 1 in testing data = 4336


In [35]:
# Text cleaning
stop_words = stopwords.words('english')
# clean_text only reads token.pos_ and token.lemma_, so the parser and NER
# pipeline components are disabled when loading the model.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Coarse part-of-speech tags whose lemmas are kept
allowed_postags=["NOUN", "ADJ", "VERB"]

def clean_text(text):
    """Normalise one argument string into a space-joined string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. turn ``</br>`` markers and newlines into spaces;
      3. delete single and double quotes;
      4. replace every other non-word character with a space and tidy spaces;
      5. tokenise with gensim's ``simple_preprocess(deacc=True)``;
      6. drop tokens found in the module-level ``stop_words``;
      7. run the remaining words through ``nlp`` and keep the lemma of each
         token whose ``pos_`` is listed in ``allowed_postags``.

    Args:
        text: the raw argument as a ``str``.

    Returns:
        The kept lemmas joined by single spaces (an empty string when nothing
        survives the filters).
    """
    text = text.lower()
    # Substitute a space (not an empty string) for line breaks so the words on
    # either side of a break are not fused into a single token.
    text = text.replace('</br>', ' ')
    text = text.replace('\n', ' ')
    # Quotes are removed outright, e.g. "don't" becomes "dont".
    text = re.sub(r"['\"]", "", text)
    # Any remaining non-word character becomes a space ...
    text = re.sub(r"\W", " ", text)
    # ... then runs of spaces are collapsed and trailing whitespace trimmed.
    text = re.sub(r'[ ]{2,}', ' ', text)
    text = re.sub(r'[ \t]+$', '', text)

    words = simple_preprocess(str(text), deacc=True)
    kept_words = [word for word in words if word not in stop_words]

    doc = nlp(" ".join(kept_words))
    lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    return " ".join(lemmas)

# Replace the raw argument text of both splits with its cleaned form
for frame in (df_train, df_test):
    frame["argument"] = frame["argument"].apply(clean_text)

In [36]:
# Collect the cleaned texts of each split
train_texts = df_train["argument"].tolist()
test_texts = df_test["argument"].tolist()

# Fit vectorizer on the training texts only, so the vocabulary is not derived
# from the held-out test set
vectorizer = CountVectorizer()
vectorizer = vectorizer.fit(train_texts)

# Transform the text into dense bag-of-words count matrices
X_train = vectorizer.transform(train_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()
# Full matrix: training rows first, then test rows
X = np.vstack([X_train, X_test])

# Target extraction
y_train = np.array(df_train["class"])
y_test = np.array(df_test["class"])
# Build y in the same train-then-test order as X; taking it straight from df
# would follow the original row order and misalign labels with rows of X.
y = np.concatenate([y_train, y_test])

In [37]:
# Fit a baseline logistic regression on the training split, then report
# per-class precision/recall/F1 on the test split
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
history = model  # fit() returns the fitted estimator itself
pred = model.predict(X_test)
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           0       0.43      0.40      0.42      1979
           1       0.73      0.76      0.75      4336

    accuracy                           0.64      6315
   macro avg       0.58      0.58      0.58      6315
weighted avg       0.64      0.64      0.64      6315

