# **TF-IDF**

## **ENVIRONMENT SETUP**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Download the dataset

!rm arg_quality_rank_30k.csv
!wget "https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip"
!unzip *.zip
!rm *.zip
!rm readme.txt

rm: cannot remove 'arg_quality_rank_30k.csv': No such file or directory
--2023-04-08 11:42:11--  https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip
Resolving www.research.ibm.com (www.research.ibm.com)... 52.116.220.135
Connecting to www.research.ibm.com (www.research.ibm.com)|52.116.220.135|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip [following]
--2023-04-08 11:42:12--  https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip
Resolving research.ibm.com (research.ibm.com)... 52.116.220.135
Connecting to research.ibm.com (research.ibm.com)|52.116.220.135|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1524714 (1.5M) [application/zip]
Saving to: ‘IBM_Debater_(R)_arg_quality_rank_30k.zip’


2023-04-08 11:42:14 (1.01 MB/s) - ‘IBM_Debater_(R)_arg_quality_rank_30k.z

## **IMPORT DATASET**

In [3]:
import pandas as pd

df = pd.read_csv("./arg_quality_rank_30k.csv")
df.head()

Unnamed: 0,argument,topic,set,WA,MACE-P,stance_WA,stance_WA_conf
0,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,train,0.846165,0.297659,1,1.0
1,.a multi-party system would be too confusing a...,We should adopt a multi-party system,train,0.891271,0.726133,-1,1.0
2,\ero-tolerance policy in schools should not be...,We should adopt a zero-tolerance policy in sch...,dev,0.721192,0.396953,-1,1.0
3,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,train,0.730395,0.225212,-1,1.0
4,"100% agree, should they do that, it would be a...",We should abolish safe spaces,train,0.236686,0.004104,1,0.805517


### Convert into Classes

In [4]:
import numpy as np
from sklearn.cluster import KMeans

In [5]:
kmeans = KMeans(n_clusters=2, n_init="auto", max_iter=1000, random_state=431)
X = np.array(df["WA"]).reshape(-1, 1)
X = kmeans.fit_predict(X)
df["class"] = X

In [6]:
df.head()

Unnamed: 0,argument,topic,set,WA,MACE-P,stance_WA,stance_WA_conf,class
0,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,train,0.846165,0.297659,1,1.0,1
1,.a multi-party system would be too confusing a...,We should adopt a multi-party system,train,0.891271,0.726133,-1,1.0,1
2,\ero-tolerance policy in schools should not be...,We should adopt a zero-tolerance policy in sch...,dev,0.721192,0.396953,-1,1.0,1
3,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,train,0.730395,0.225212,-1,1.0,1
4,"100% agree, should they do that, it would be a...",We should abolish safe spaces,train,0.236686,0.004104,1,0.805517,0


### Split into Train and Test Sets

In [7]:
df_train = df[df["set"] != "test"].reset_index(drop=True) # Combine train and dev into train set
df_train = df_train.drop(["set"], axis=1)

df_test = df[df["set"] == "test"].reset_index(drop=True)
df_test = df_test.drop(["set"], axis=1)

### Display Dataset Metrics

In [8]:
print(f"Length of dataset = {len(df)}")
print(f"Number of training data = {len(df_train)}")
print(f"Number of testing data = {len(df_test)}")

Length of dataset = 30497
Number of training data = 24182
Number of testing data = 6315


In [9]:
print(f"Number of Topics = {len(np.unique(df.topic))}")
print(f"Number of Topics in training data = {len(np.unique(df_train.topic))}")
print(f"Number of Topics in testing data = {len(np.unique(df_test.topic))}")

Number of Topics = 71
Number of Topics in training data = 56
Number of Topics in testing data = 15


In [10]:
print(f"Number of Classes = {len(np.unique(df['class']))}")

for label in np.unique(df["class"]):
    print(f"Number of Class {label} in training data = {len(df_train[df_train['class']==label])}")
    print(f"Number of Class {label} in testing data = {len(df_test[df_test['class']==label])}")

Number of Classes = 2
Number of Class 0 in training data = 6987
Number of Class 0 in testing data = 1979
Number of Class 1 in training data = 17195
Number of Class 1 in testing data = 4336


## **TEXT CLEANING**

In [11]:
import re

import spacy

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from gensim.utils import simple_preprocess

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
stop_words = stopwords.words('english')
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
allowed_postags=["NOUN", "ADJ", "VERB"]

In [13]:
def clean_text(text):
    text = text.lower() # Convert the text into lowercase
    text = text.replace('</br>', '') # Remove </br>
    text = text.replace('\n', '') # Remove \n
    
    # Remove quotes
    text = re.sub(r"\'", "", text) 
    text = re.sub(r"\"", "", text) 
    
    text = re.sub(r"[^\w]", " ", text) # Remove all symbols

    text = re.sub(r'[ ]{2,}', ' ', text) # Remove extra spaces
    text = re.sub(r'[ \t]+$', '', text) # Remove trailing white spaces

    text = simple_preprocess(str(text), deacc=True) # Tokenize the texts

    # Remove stopwords
    tokens = []
    for token in text:
        if token not in stop_words:
            tokens.append(token)
    text = tokens

    # Lemmatize the tokens
    text = " ".join(text)
    text = nlp(text)
    lemmatized_tokens = []
    for token in text:
        if token.pos_ in allowed_postags:
            lemmatized_tokens.append(token.lemma_)
    text = lemmatized_tokens

    text = " ".join(text)
    return text

In [14]:
df_train["argument"] = df_train["argument"].apply(clean_text)
df_test["argument"] = df_test["argument"].apply(clean_text)

In [15]:
df_train["argument"]

0        marriage keep time abandon old thinking bring ...
1        system confuse get consensus general public di...
2        ero tolerance policy school adopt circumstance...
3        people reach limit come quality life able end ...
4                                         agree good thing
                               ...                        
24177    zoo trap animal meaningless life amuse human o...
24178                               zoo treat animal close
24179                       zoo imprison animal cause harm
24180    zoo work educational center cause extinction a...
24181                       zoo help breed endanger specie
Name: argument, Length: 24182, dtype: object

## **FEATURE EXTRACTION**

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Build corpus using all texts
X = []
X.extend(df_train["argument"].tolist())
X.extend(df_test["argument"].tolist())

In [18]:
# Fit vectorizer
vectorizer = TfidfVectorizer()
vectorizer = vectorizer.fit(X)

In [19]:
# Transform the text
X = vectorizer.transform(X).toarray()
X_train = vectorizer.transform(df_train["argument"]).toarray()
X_test = vectorizer.transform(df_test["argument"]).toarray()

In [20]:
print(f"Shape of training feature vector: {X_train.shape}")

Shape of training feature vector: (24182, 8794)


In [21]:
# Extract vocabulary
vocab = vectorizer.vocabulary_

### Save Vectorizer

In [22]:
import pickle

with open('/content/drive/MyDrive/tfidf_cls_vectorizer', 'ab') as file:
    pickle.dump(vectorizer, file)

## **TARGET EXTRACTION**

In [23]:
y = np.array(df["class"])
y_train = np.array(df_train["class"])
y_test = np.array(df_test["class"])

In [24]:
print(f"Shape of training target vector: {y_train.shape}")

Shape of training target vector: (24182,)


## **LOGISTIC REGRESSION**

### GridSearch

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [26]:
parameters = {
    "penalty": ("l1", "l2", "elasticnet", None),
    "dual": (True, False),
    "C": (1, 0.1, 0.01),
    "fit_intercept": (True, False),
    "solver": ("lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"),
}

model = LogisticRegression()

clf = GridSearchCV(
    model, parameters, n_jobs=-1,
    scoring="f1_weighted",
)

In [27]:
# Take a random sample

import random

size = len(X_train) // 10

idxs = random.sample(range(0, len(X_train)), size)

clf_X = np.array([X_train[idx] for idx in idxs])
clf_y = np.array([y_train[idx] for idx in idxs])

In [None]:
# clf = clf.fit(clf_X, clf_y)

In [29]:
# print("The best parameters are:")
# best_params = clf.best_params_
# for key in best_params:
#     print(f"{key}=>{best_params[key]}")

The best parameters are:
C=>1
dual=>False
fit_intercept=>False
penalty=>l1
solver=>liblinear


### Train Model

In [37]:
# model = LogisticRegression(max_iter=1000)
model = LogisticRegression(
    C=1, dual=False, fit_intercept=False,
    penalty="l1", solver="liblinear", max_iter=1000
)

history = model.fit(X_train, y_train)

lr_pred = model.predict(X_test)

### Evaluate Model

In [38]:
from sklearn.metrics import classification_report

In [39]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.50      0.16      0.25      1979
           1       0.71      0.93      0.80      4336

    accuracy                           0.69      6315
   macro avg       0.60      0.54      0.52      6315
weighted avg       0.64      0.69      0.63      6315



### Save Model

In [40]:
import pickle

with open('/content/drive/MyDrive/tfidf_cls_lr_model', 'ab') as file:
    pickle.dump(model, file)

## **SUPPORT VECTOR MACHINE**

### GridSearch

In [41]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [42]:
parameters = {
    "C": (1, 0.1, 0.01),
    "kernel": ("linear", "poly", "rbf", "sigmoid"),
    # "degree": (2, 3, 4),
    "shrinking": (True, False),
    # "decision_function_shape": ("ovo", "ovr"),
    # "break_ties": (True, False),
    "random_state": [431]
}

model = SVC()

clf = GridSearchCV(
    model, parameters, n_jobs=-1,
    scoring="f1_weighted",
)

In [43]:
# Take a random sample

import random

size = len(X_train) // 10

idxs = random.sample(range(0, len(X_train)), size)

clf_X = np.array([X_train[idx] for idx in idxs])
clf_y = np.array([y_train[idx] for idx in idxs])

In [44]:
# clf = clf.fit(clf_X, clf_y)

In [45]:
# print("The best parameters are:")
# best_params = clf.best_params_
# for key in best_params:
#     print(f"{key}=>{best_params[key]}")

The best parameters are:
C=>1
kernel=>linear
random_state=>431
shrinking=>True


### Train Model

In [46]:
model = SVC(
    C=1, kernel="linear", random_state=431,
    shrinking=True
)

history = model.fit(X_train, y_train)

svm_pred = model.predict(X_test)

### Evaluate Model

In [47]:
from sklearn.metrics import classification_report

In [48]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.50      0.16      0.25      1979
           1       0.71      0.93      0.80      4336

    accuracy                           0.69      6315
   macro avg       0.60      0.54      0.52      6315
weighted avg       0.64      0.69      0.63      6315



### Save Model

In [49]:
import pickle

with open('/content/drive/MyDrive/tfidf_cls_svm_model', 'ab') as file:
    pickle.dump(model, file)