# **TF-IDF**

## **ENVIRONMENT SETUP**

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; later cells write pickles below this mount point.
drive_root = '/content/drive'
drive.mount(drive_root)

Mounted at /content/drive


In [2]:
# Download the dataset

!rm arg_quality_rank_30k.csv
!wget "https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip"
!unzip *.zip
!rm *.zip
!rm readme.txt

rm: cannot remove 'arg_quality_rank_30k.csv': No such file or directory
--2023-04-03 07:23:12--  https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip
Resolving www.research.ibm.com (www.research.ibm.com)... 52.116.220.135
Connecting to www.research.ibm.com (www.research.ibm.com)|52.116.220.135|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip [following]
--2023-04-03 07:23:12--  https://research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_arg_quality_rank_30k.zip
Resolving research.ibm.com (research.ibm.com)... 52.116.220.135
Connecting to research.ibm.com (research.ibm.com)|52.116.220.135|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1524714 (1.5M) [application/zip]
Saving to: ‘IBM_Debater_(R)_arg_quality_rank_30k.zip’


2023-04-03 07:23:13 (4.24 MB/s) - ‘IBM_Debater_(R)_arg_quality_rank_30k.z

## **IMPORT DATASET**

In [3]:
import pandas as pd

# Read the extracted CSV from the working directory and preview its first rows.
csv_path = "./arg_quality_rank_30k.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,argument,topic,set,WA,MACE-P,stance_WA,stance_WA_conf
0,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,train,0.846165,0.297659,1,1.0
1,.a multi-party system would be too confusing a...,We should adopt a multi-party system,train,0.891271,0.726133,-1,1.0
2,\ero-tolerance policy in schools should not be...,We should adopt a zero-tolerance policy in sch...,dev,0.721192,0.396953,-1,1.0
3,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,train,0.730395,0.225212,-1,1.0
4,"100% agree, should they do that, it would be a...",We should abolish safe spaces,train,0.236686,0.004104,1,0.805517


### Split into Train and Test Sets

In [4]:
# Rows whose "set" is anything other than "test" (i.e. train and dev) are pooled
# into the training frame; the "set" column is dropped from both frames afterwards.
is_test_row = df["set"] == "test"

df_train = df[~is_test_row].reset_index(drop=True).drop(columns=["set"])
df_test = df[is_test_row].reset_index(drop=True).drop(columns=["set"])

### Display Dataset Metrics

In [5]:
import numpy as np

In [6]:
# Report the row count of the full frame and of each split.
n_all, n_train, n_test = len(df), len(df_train), len(df_test)

print(f"Length of dataset = {n_all}")
print(f"Number of training data = {n_train}")
print(f"Number of testing data = {n_test}")

Length of dataset = 30497
Number of training data = 24182
Number of testing data = 6315


In [7]:
# Count the distinct topics in the full frame and in each split.
topic_counts = [len(np.unique(frame.topic)) for frame in (df, df_train, df_test)]

print(f"Number of Topics = {topic_counts[0]}")
print(f"Number of Topics in training data = {topic_counts[1]}")
print(f"Number of Topics in testing data = {topic_counts[2]}")

Number of Topics = 71
Number of Topics in training data = 56
Number of Topics in testing data = 15


## **TEXT CLEANING**

In [8]:
import re

import spacy

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from gensim.utils import simple_preprocess

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# Stop words are kept in a set: clean_text tests every token with `in`, and set
# membership is a hash lookup instead of a scan through the whole list.
stop_words = set(stopwords.words('english'))

# Only tagging/lemmatization is used downstream, so the parser and NER pipes are switched off.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Coarse POS tags whose lemmas are kept by clean_text; every other token is discarded.
allowed_postags=["NOUN", "ADJ", "VERB"]

In [10]:
def clean_text(text):
    """Normalize one raw argument into a space-joined string of lemmas.

    Pipeline: lowercase -> strip ``</br>``/newlines/quotes/symbols -> tokenize
    with ``simple_preprocess`` (``deacc=True``) -> drop tokens found in
    ``stop_words`` -> run ``nlp`` on the re-joined tokens and keep ``lemma_``
    for tokens whose ``pos_`` is in ``allowed_postags``.

    Relies on the notebook-level names ``stop_words``, ``nlp`` and
    ``allowed_postags`` being defined before it is called.

    Args:
        text: Raw argument text. A non-string value (e.g. NaN from an empty
            CSV cell) is treated as an empty document instead of raising.

    Returns:
        str: The kept lemmas separated by single spaces; ``""`` when nothing
        survives the filtering.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower() # Convert the text into lowercase

    # Replace with a space instead of deleting: deleting would glue the words
    # on either side of the break together ("end.</br>next" -> "end.next").
    text = text.replace('</br>', ' ')
    text = text.replace('\n', ' ')

    # Quotes are deleted (not spaced) so contractions stay a single token
    text = re.sub(r"[\'\"]", "", text)

    text = re.sub(r"[^\w]", " ", text) # Remove all symbols

    text = re.sub(r'[ ]{2,}', ' ', text) # Remove extra spaces
    text = re.sub(r'[ \t]+$', '', text) # Remove trailing white spaces

    # Tokenize, then drop stop words
    tokens = [
        token
        for token in simple_preprocess(text, deacc=True)
        if token not in stop_words
    ]

    # Lemmatize: spaCy works on a string, so the surviving tokens are re-joined first
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [
        token.lemma_ for token in doc if token.pos_ in allowed_postags
    ]

    return " ".join(lemmatized_tokens)

In [11]:
# Overwrite the raw "argument" text of both splits with its cleaned form.
for frame in (df_train, df_test):
    frame["argument"] = frame["argument"].map(clean_text)

In [12]:
# Preview the cleaned training arguments (pandas truncates the display).
df_train.loc[:, "argument"]

0        marriage keep time abandon old thinking bring ...
1        system confuse get consensus general public di...
2        ero tolerance policy school adopt circumstance...
3        people reach limit come quality life able end ...
4                                         agree good thing
                               ...                        
24177    zoo trap animal meaningless life amuse human o...
24178                               zoo treat animal close
24179                       zoo imprison animal cause harm
24180    zoo work educational center cause extinction a...
24181                       zoo help breed endanger specie
Name: argument, Length: 24182, dtype: object

## **FEATURE EXTRACTION**

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Corpus = cleaned training arguments followed by cleaned test arguments, as one flat list
X = df_train["argument"].tolist() + df_test["argument"].tolist()

In [15]:
# Learn the TF-IDF vocabulary and IDF weights from the corpus list built above.
# NOTE(review): that corpus contains the test arguments as well as the training
# ones, so test text influences the features - confirm this is intended.
vectorizer = TfidfVectorizer().fit(X)

In [16]:
# Transform the text
# Each split is vectorized once, straight from its DataFrame column, and the
# combined matrix is the row-wise stack (training rows first, then test rows).
# The cell no longer reads the old value of `X`, so it can be re-run safely:
# previously `X` was consumed as a list of strings and rebound to an array,
# which made a second execution fail, and every document was vectorized twice.
X_train = vectorizer.transform(df_train["argument"]).toarray()
X_test = vectorizer.transform(df_test["argument"]).toarray()
X = np.vstack([X_train, X_test])

In [17]:
# (rows, features) of the dense training matrix
train_feature_shape = X_train.shape
print(f"Shape of training feature vector: {train_feature_shape}")

Shape of training feature vector: (24182, 8794)


In [18]:
# Extract vocabulary
# Per the scikit-learn docs, `vocabulary_` maps each learned term to its feature (column) index.
# The reference is taken here, before the save cell below deletes the `vectorizer` name.
vocab = vectorizer.vocabulary_

### Save Vectorizer

In [19]:
import pickle

# Persist the fitted vectorizer locally and on the mounted Drive.
# Mode 'wb' (truncate) replaces the previous 'ab' (append): with append, every
# re-run added another pickle to the end of the file while pickle.load() only
# returns the FIRST object, so a reload silently yielded the oldest vectorizer.
for path in ('./tfidf_vectorizer', '/content/drive/MyDrive/tfidf_vectorizer'):
    with open(path, 'wb') as file:
        pickle.dump(vectorizer, file)

# Drop the in-memory object so the following load cell really exercises the file.
del vectorizer

### Load Vectorizer

In [20]:
# Restore the vectorizer from the local pickle written by the save cell.
with open('./tfidf_vectorizer', 'rb') as pickled:
    vectorizer = pickle.load(pickled)

## **TARGET EXTRACTION**

In [21]:
# Regression target: the "WA" column of each split.
y_train = np.array(df_train["WA"])
y_test = np.array(df_test["WA"])

# The combined feature matrix X is laid out as training rows followed by test
# rows, so the combined target is built in that same order. Taking df["WA"]
# directly would follow the original CSV row order and only line up with X if
# the file happened to list every test row last.
y = np.concatenate([y_train, y_test])

In [22]:
# Shape of the training target array
train_target_shape = y_train.shape
print(f"Shape of training target vector: {train_target_shape}")

Shape of training target vector: (24182,)


## **LINEAR REGRESSION**

### Gridsearch

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

In [24]:
# Candidate LinearRegression settings; every combination is scored by negative RMSE.
# NOTE(review): copy_X looks like a memory-handling switch rather than a model
# hyper-parameter - confirm it belongs in the search space.
parameters = dict(
    fit_intercept=(True, False),
    copy_X=(True, False),
    positive=(True, False),
)

lm_model = LinearRegression()

clf = GridSearchCV(
    estimator=lm_model,
    param_grid=parameters,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

In [25]:
# Take a reproducible random sample (one tenth of the training rows) for the grid search

import random

size = len(X_train) // 10

# A dedicated seeded generator makes the subsample identical on every run
# without touching the state of the global `random` module.
idxs = random.Random(42).sample(range(len(X_train)), size)

# Integer-list indexing gathers the selected rows in one vectorized copy
# instead of building a Python list of row arrays first.
clf_X = X_train[idxs]
clf_y = y_train[idxs]

In [26]:
# clf = clf.fit(clf_X, clf_y)

In [27]:
# print("The best parameters are:")
# best_params = clf.best_params_
# for key in best_params:
#     print(f"{key}=>{best_params[key]}")

### Train Model

In [28]:
# Linear regression with an intercept and coefficients constrained to be non-negative.
lr_settings = {
    "copy_X": True,
    "fit_intercept": True,
    "positive": True,
    "n_jobs": -1,
}
model = LinearRegression(**lr_settings)

# `history` holds whatever fit() returns; it is not used again in this notebook.
history = model.fit(X_train, y_train)

# Predictions on the held-out test features
lr_pred = model.predict(X_test)

### Evaluate Model

In [29]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [30]:
# Score the linear-regression predictions on the test split.
r2_val = r2_score(y_test, lr_pred)
mae_val = mean_absolute_error(y_test, lr_pred)
mse_val = mean_squared_error(y_test, lr_pred)
# RMSE is derived from the MSE rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, while the square root works
# on every release and is the same number for a single-output target.
rmse_val = np.sqrt(mse_val)

print("METRICS\tSCORE")
for label, value in (
    ("R2:", r2_val),
    ("MAE:", mae_val),
    ("MSE:", mse_val),
    ("RMSE:", rmse_val),
):
    print(label, end="\t")
    print(f"{value:>.4f}")

METRICS	SCORE
R2:	-0.2347
MAE:	0.1820
MSE:	0.0489
RMSE:	0.2210


### Save Model

In [31]:
import pickle

# Persist the fitted linear model locally and on the mounted Drive.
# Mode 'wb' (truncate) replaces the previous 'ab' (append): appended files keep
# every earlier pickle, and pickle.load() returns only the first one, so a
# retrained model could never be read back.
for path in ('./tfidf_lr_model', '/content/drive/MyDrive/tfidf_lr_model'):
    with open(path, 'wb') as file:
        pickle.dump(model, file)

# Drop the in-memory object so the following load cell really exercises the file.
del model

### Load Model

In [32]:
# Restore the linear model from the local pickle written by the save cell.
with open('./tfidf_lr_model', 'rb') as pickled:
    model = pickle.load(pickled)

In [33]:
# Sanity check: the reloaded model should reproduce the RMSE reported above.
lr_pred = model.predict(X_test)
# sqrt(MSE) instead of `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6
rmse_val = np.sqrt(mean_squared_error(y_test, lr_pred))
print("RMSE:", end="\t")
print(f"{rmse_val:>.4f}")

RMSE:	0.2210


## **SUPPORT VECTOR MACHINE**

### Gridsearch

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

In [35]:
# Search space for SVR, scored by negative RMSE.
# `degree` is only meaningful for the polynomial kernel (scikit-learn documents
# it as ignored by all other kernels), so it is varied for "poly" alone. A
# single crossed grid would fit each linear/rbf/sigmoid candidate three times
# with identical results.
parameters = [
    {
        "kernel": ("linear", "rbf", "sigmoid"),
        "C": (0.1, 1),
    },
    {
        "kernel": ("poly",),
        "degree": (2, 3, 4),
        "C": (0.1, 1),
    },
]

svm_model = SVR()

clf = GridSearchCV(
    svm_model, parameters, n_jobs=-1,
    scoring="neg_root_mean_squared_error",
)

In [36]:
# Take a reproducible random sample (one tenth of the training rows) for the grid search

import random

size = len(X_train) // 10

# A dedicated seeded generator makes the subsample identical on every run
# without touching the state of the global `random` module.
idxs = random.Random(42).sample(range(len(X_train)), size)

# Integer-list indexing gathers the selected rows in one vectorized copy
# instead of building a Python list of row arrays first.
clf_X = X_train[idxs]
clf_y = y_train[idxs]

In [37]:
# clf = clf.fit(clf_X, clf_y)

In [38]:
# print("The best parameters are:")
# best_params = clf.best_params_
# for key in best_params:
#     print(f"{key}=>{best_params[key]}")

### Train Model

In [39]:
# Support vector regressor with an RBF kernel and C=0.1.
# NOTE(review): `degree` is documented by scikit-learn as used only by the
# "poly" kernel, so degree=2 presumably has no effect here - confirm.
svr_settings = {"C": 0.1, "degree": 2, "kernel": "rbf"}
model = SVR(**svr_settings)

# `history` holds whatever fit() returns; it is not used again in this notebook.
history = model.fit(X_train, y_train)

# Predictions on the held-out test features
svm_pred = model.predict(X_test)

### Evaluate Model

In [40]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [41]:
# Score the SVR predictions on the test split.
r2_val = r2_score(y_test, svm_pred)
mae_val = mean_absolute_error(y_test, svm_pred)
mse_val = mean_squared_error(y_test, svm_pred)
# RMSE is derived from the MSE rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, while the square root works
# on every release and is the same number for a single-output target.
rmse_val = np.sqrt(mse_val)

print("METRICS\tSCORE")
for label, value in (
    ("R2:", r2_val),
    ("MAE:", mae_val),
    ("MSE:", mse_val),
    ("RMSE:", rmse_val),
):
    print(label, end="\t")
    print(f"{value:>.4f}")

METRICS	SCORE
R2:	0.0823
MAE:	0.1523
MSE:	0.0363
RMSE:	0.1906


### Save Model

In [42]:
import pickle

# Persist the fitted SVR locally and on the mounted Drive.
# Mode 'wb' (truncate) replaces the previous 'ab' (append): appended files keep
# every earlier pickle, and pickle.load() returns only the first one, so a
# retrained model could never be read back.
for path in ('./tfidf_svm_model', '/content/drive/MyDrive/tfidf_svm_model'):
    with open(path, 'wb') as file:
        pickle.dump(model, file)

# Drop the in-memory object so the following load cell really exercises the file.
del model

### Load Model

In [43]:
# Restore the SVR from the local pickle written by the save cell.
with open('./tfidf_svm_model', 'rb') as pickled:
    model = pickle.load(pickled)

In [44]:
# Sanity check: the reloaded model should reproduce the RMSE reported above.
svm_pred = model.predict(X_test)
# sqrt(MSE) instead of `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6
rmse_val = np.sqrt(mean_squared_error(y_test, svm_pred))
print("RMSE:", end="\t")
print(f"{rmse_val:>.4f}")

RMSE:	0.1906
