# **TF-IDF**

## **ENVIRONMENT SETUP**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Download the dataset

!rm -r data*
!wget http://argumentation.bplaced.net/arguana-data/dagstuhl-15512-argquality-corpus-v2.zip
!unzip dagstuhl-15512-argquality-corpus-v2.zip
!rm *.zip
!rm -r __MACOSX
!mv dagstuhl-15512-argquality-corpus-v2 data

## **IMPORT DATASET**

In [3]:
import pandas as pd

df = pd.read_csv("data/dagstuhl-15512-argquality-corpus-annotated.csv", sep='\t', encoding_errors="ignore")
df.head(3)

Unnamed: 0,annotator,argumentative,overall quality,local acceptability,appropriateness,arrangement,clarity,cogency,effectiveness,global acceptability,...,global sufficiency,reasonableness,local relevance,credibility,emotional appeal,sufficiency,argument,#id,issue,stance
0,1,y,1 (Low),1 (Low),1 (Low),1 (Low),2 (Average),1 (Low),1 (Low),1 (Low),...,1 (Low),1 (Low),1 (Low),1 (Low),1 (Low),1 (Low),"it is true that bottled water is a waste, but ...",arg219250,ban-plastic-water-bottles,no-bad-for-the-economy
1,2,y,1 (Low),3 (High),2 (Average),2 (Average),3 (High),1 (Low),1 (Low),3 (High),...,1 (Low),2 (Average),2 (Average),2 (Average),2 (Average),1 (Low),"it is true that bottled water is a waste, but ...",arg219250,ban-plastic-water-bottles,no-bad-for-the-economy
2,3,y,2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),2 (Average),...,2 (Average),2 (Average),3 (High),2 (Average),1 (Low),2 (Average),"it is true that bottled water is a waste, but ...",arg219250,ban-plastic-water-bottles,no-bad-for-the-economy


In [4]:
import numpy as np

print(f"Number of annotations = {len(df['argument'])}")
print(f"Number of unique arguements = {len(np.unique(df['argument']))}") # Each argument was scored by 3 annotators
print(f"Number of unique issue = {len(np.unique(df['issue']))}")  # There are a total of 16 issues
print(f"Number of unique stance = {len(np.unique(df['stance']))}") # Each issue has on an avg 2 stance (positive and negative)

Number of annotations = 960
Number of unique arguements = 320
Number of unique issue = 16
Number of unique stance = 28


### Remove non-arguments

In [5]:
df = df[df["argumentative"] == "y"]

In [6]:
print(f"Number of annotations = {len(df['argument'])}")
print(f"Number of unique arguements = {len(np.unique(df['argument']))}") # Each argument was scored by 3 annotators
print(f"Number of unique issue = {len(np.unique(df['issue']))}")  # There are a total of 16 issues
print(f"Number of unique stance = {len(np.unique(df['stance']))}") # Each issue has on an avg 2 stance (positive and negative)

Number of annotations = 935
Number of unique arguements = 316
Number of unique issue = 16
Number of unique stance = 28


### Consolidate annotator scores

In [7]:
argument = np.unique(df["argument"])

attributes = ["annotator", "overall quality", "cogency", "effectiveness", "reasonableness", "argument", "#id"]

cleaned_df = []

for arg in argument:

    new_df = df[df["argument"] == arg][attributes]
    flag = 0
    new_dict = {
        "#id": new_df["#id"].iloc[0],
        "argument": new_df["argument"].iloc[0],
    }

    for ele in ["overall quality", "cogency", "effectiveness", "reasonableness"]:
        if len(pd.value_counts(new_df[ele])) == 3:
            flag = 1
            break
        new_dict[ele] = pd.value_counts(new_df[ele]).index[0]
        
    if flag == 1:
        continue
    cleaned_df.append(new_dict)

In [8]:
df = pd.DataFrame(cleaned_df)

In [9]:
print(f"Number of annotations = {len(df['argument'])}")
print(f"Number of unique arguements = {len(np.unique(df['argument']))}") # Each argument was scored by 3 annotators

Number of annotations = 273
Number of unique arguements = 273


### Perform train-test split

In [10]:
df_train = df.sample(frac=0.8, random_state=101)
df_test = df.drop(df_train.index)

### Display Dataset Metrics

In [11]:
print(f"Length of dataset = {len(df)}")
print(f"Number of training data = {len(df_train)}")
print(f"Number of testing data = {len(df_test)}")

Length of dataset = 273
Number of training data = 218
Number of testing data = 55


In [12]:
print(f"Number of Classes = {len(np.unique(df['overall quality']))}")

for label in np.unique(df["overall quality"]):
    print(f"Number of Class {label} in training data = {len(df_train[df_train['overall quality']==label])}")
    print(f"Number of Class {label} in testing data = {len(df_test[df_test['overall quality']==label])}")

Number of Classes = 3
Number of Class 1 (Low) in training data = 128
Number of Class 1 (Low) in testing data = 31
Number of Class 2 (Average) in training data = 77
Number of Class 2 (Average) in testing data = 21
Number of Class 3 (High) in training data = 13
Number of Class 3 (High) in testing data = 3


## **TEXT CLEANING**

In [13]:
import re

import spacy

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from gensim.utils import simple_preprocess

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
stop_words = stopwords.words('english')
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
allowed_postags=["NOUN", "ADJ", "VERB"]

In [15]:
def clean_text(text):
    text = text.lower() # Convert the text into lowercase
    text = text.replace('</br>', '') # Remove </br>
    text = text.replace('\n', '') # Remove \n
    
    # Remove quotes
    text = re.sub(r"\'", "", text) 
    text = re.sub(r"\"", "", text) 
    
    text = re.sub(r"[^\w]", " ", text) # Remove all symbols

    text = re.sub(r'[ ]{2,}', ' ', text) # Remove extra spaces
    text = re.sub(r'[ \t]+$', '', text) # Remove trailing white spaces

    text = simple_preprocess(str(text), deacc=True) # Tokenize the texts

    # Remove stopwords
    tokens = []
    for token in text:
        if token not in stop_words:
            tokens.append(token)
    text = tokens

    # Lemmatize the tokens
    text = " ".join(text)
    text = nlp(text)
    lemmatized_tokens = []
    for token in text:
        if token.pos_ in allowed_postags:
            lemmatized_tokens.append(token.lemma_)
    text = lemmatized_tokens

    text = " ".join(text)
    return text

In [16]:
df_train["argument"] = df_train["argument"].apply(clean_text)
df_test["argument"] = df_test["argument"].apply(clean_text)

In [17]:
df_train["argument"]

33     potential do well job run way run make order c...
13     spend billion bottled water year ban sale hurt...
189    s good argument think reason see squirrel wind...
269                                         fear get hit
135    kid fat day help help relize important lame fu...
                             ...                        
97     think personal pursuit important order help ot...
207    grow accord research consulting do beverage ma...
203    s reason say well prove safe fast sleeker cust...
17     ambitious young person want become lawful succ...
140    murder circumstance right person commit know c...
Name: argument, Length: 218, dtype: object

## **FEATURE EXTRACTION**

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Build corpus using all texts
X = []
X.extend(df_train["argument"].tolist())
X.extend(df_test["argument"].tolist())

In [20]:
# Fit vectorizer
vectorizer = TfidfVectorizer()
vectorizer = vectorizer.fit(X)

In [21]:
# Transform the text
X = vectorizer.transform(X).toarray()
X_train = vectorizer.transform(df_train["argument"]).toarray()
X_test = vectorizer.transform(df_test["argument"]).toarray()

In [22]:
print(f"Shape of training feature vector: {X_train.shape}")

Shape of training feature vector: (218, 1690)


In [23]:
# Extract vocabulary
vocab = vectorizer.vocabulary_

### Save Vectorizer

In [24]:
import pickle

with open('/content/drive/MyDrive/tfidf_argqual_vectorizer', 'wb') as file:
    pickle.dump(vectorizer, file)

## **TARGET EXTRACTION**

In [25]:
y = np.array(df["overall quality"])
y_train = np.array(df_train["overall quality"])
y_test = np.array(df_test["overall quality"])

In [26]:
encoder = {
    "1 (Low)": 0,
    "2 (Average)": 1,
    "3 (High)": 2
}
y = np.array([encoder[ele] for ele in y])
y_train = np.array([encoder[ele] for ele in y_train])
y_test = np.array([encoder[ele] for ele in y_test])

In [27]:
print(f"Shape of training target vector: {y_train.shape}")

Shape of training target vector: (218,)


## **LOGISTIC REGRESSION**

### GridSearch

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [29]:
parameters = {
    "penalty": ("l1", "l2", "elasticnet", None),
    "dual": (True, False),
    "C": (1, 0.1, 0.01),
    "fit_intercept": (True, False),
    "solver": ("lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"),
    "max_iter": [1000],
}

model = LogisticRegression()

clf = GridSearchCV(
    model, parameters, n_jobs=-1,
    scoring="f1_weighted",
)

In [30]:
# Take a random sample

import random

size = len(X_train) // 1

idxs = random.sample(range(0, len(X_train)), size)

clf_X = np.array([X_train[idx] for idx in idxs])
clf_y = np.array([y_train[idx] for idx in idxs])

In [None]:
clf = clf.fit(clf_X, clf_y)

In [32]:
print("The best parameters are:")
print(clf.best_estimator_)

The best parameters are:
LogisticRegression(C=0.1, max_iter=1000, penalty=None, solver='sag')


### Train Model

In [33]:
# model = LogisticRegression(max_iter=1000)
# model = LogisticRegression(
#     C=0.01, dual=True, fit_intercept=False,
#     penalty="l2", solver="liblinear", max_iter=1000
# )
model = clf.best_estimator_

history = model.fit(X_train, y_train)

lr_pred = model.predict(X_test)



### Evaluate Model

In [34]:
from sklearn.metrics import classification_report

In [35]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.61      0.81      0.69        31
           1       0.50      0.33      0.40        21
           2       0.00      0.00      0.00         3

    accuracy                           0.58        55
   macro avg       0.37      0.38      0.36        55
weighted avg       0.53      0.58      0.54        55



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Save Model

In [36]:
import pickle

with open('/content/drive/MyDrive/tfidf_argqual_lr_model', 'wb') as file:
    pickle.dump(model, file)

## **SUPPORT VECTOR MACHINE**

### GridSearch

In [37]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [38]:
parameters = {
    "C": (1, 0.1, 0.01),
    "kernel": ("linear", "poly", "rbf", "sigmoid"),
    # "degree": (2, 3, 4),
    "shrinking": (True, False),
    # "decision_function_shape": ("ovo", "ovr"),
    # "break_ties": (True, False),
    "random_state": [101]
}

model = SVC()

clf = GridSearchCV(
    model, parameters, n_jobs=-1,
    scoring="f1_weighted",
)

In [39]:
# Take a random sample

import random

size = len(X_train) // 1

idxs = random.sample(range(0, len(X_train)), size)

clf_X = np.array([X_train[idx] for idx in idxs])
clf_y = np.array([y_train[idx] for idx in idxs])

In [40]:
clf = clf.fit(clf_X, clf_y)

In [41]:
print("The best parameters are:")
print(clf.best_estimator_)

The best parameters are:
SVC(C=1, kernel='linear', random_state=101)


### Train Model

In [42]:
# model = SVC(
#     C=1, kernel="linear", random_state=431,
#     shrinking=True
# )
model = clf.best_estimator_

history = model.fit(X_train, y_train)

svm_pred = model.predict(X_test)

### Evaluate Model

In [43]:
from sklearn.metrics import classification_report

In [44]:
print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

           0       0.58      0.97      0.72        31
           1       0.33      0.05      0.08        21
           2       0.00      0.00      0.00         3

    accuracy                           0.56        55
   macro avg       0.30      0.34      0.27        55
weighted avg       0.45      0.56      0.44        55



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Save Model

In [45]:
import pickle

with open('/content/drive/MyDrive/tfidf_argqual_svm_model', 'wb') as file:
    pickle.dump(model, file)