# **MODEL GENERALISABILITY**

## **Import Libraries**

In [1]:
import re
import pickle
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from tqdm import tqdm
from sklearn.metrics import classification_report

from model import CustomModel
from preprocess import preprocess_pipeline, vectorizer

## **Load Data**

In [2]:
# Read the multilingual arguments dataset (comma-separated) into a DataFrame.
df = pd.read_csv("data/Arguments_6L_MT.csv", sep=",")

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,set,argument_EN,topic_EN,quality_score_EN,stance_label_EN,stance_conf_EN,argument_ES,topic_ES,argument_FR,topic_FR,argument_IT,topic_IT,argument_DE,topic_DE,argument_NL,topic_NL
0,train,"""marriage"" isn't keeping up with the times. a...",We should abandon marriage,0.846165,1,1.0,"""matrimonio"" no se mantiene con los tiempos. ...",Deberíamos abandonar el matrimonio.,"""Mariage"" n'est pas en accord avec les temps. ...",Nous devrions abandonner le mariage,"Il ""matrimonio"" non si tiene con i tempi. abb...",Dovremmo abbandonare il matrimonio,"""Ehe"" hält sich nicht mit den Zeiten auf. das...",Wir sollten die Ehe aufgeben.,'huwelijk' houdt de tijden niet in. het oude ...,We moeten het huwelijk verlaten.
1,train,.a multi-party system would be too confusing a...,We should adopt a multi-party system,0.891271,-1,1.0,.un sistema multipartidista sería demasiado co...,Debemos adoptar un sistema pluripartidista,Un système multipartite serait trop déroutant ...,Nous devrions adopter un système multipartite,.un sistema multi - partitico sarebbe troppo c...,Dovremmo adottare un sistema multi - partitico,.Ein Mehrparteiensystem wäre zu verwirrend und...,"Wir sollten ein Mehrparteiensystem einführen,",Een meerpartijensysteem zou te verwarrend zijn...,We moeten een meerpartijenstelsel aannemen
2,train,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,0.730395,-1,1.0,""" Las personas alcanzan su límite cuando se tr...",El suicidio asistido debe ser un delito penal,""" Les gens atteignent leur limite lorsqu'il s'...",Le suicide assisté devrait être une infraction...,§ le persone raggiungono il loro limite quando...,Il suicidio assistito dovrebbe essere un reato,""" Menschen erreichen ihre Grenze, wenn es um i...",Assistierter Suizid sollte ein Straftatbestand...,¶ mensen bereiken hun grens als het gaat om hu...,Geassisteerde zelfmoord moet een strafbaar fei...
3,train,"100% agree, should they do that, it would be a...",We should abolish safe spaces,0.236686,1,0.805517,"100% de acuerdo, si lo hacen, sería una buena ...",Deberíamos abolir los espacios seguros.,"100% d'accord, s'ils le font, ce serait une bo...",Nous devrions abolir les espaces sûrs,"100% d'accordo, dovrebbero farlo, sarebbe una ...",Dovremmo abolire gli spazi sicuri,"100% stimmen zu, sollten sie das tun, wäre es ...",Wir sollten sichere Räume abschaffen,"100% akkoord, als ze dat doen, zou het een goe...",We moeten veilige ruimtes afschaffen.
4,train,A ban on naturopathy creates a cohesive front ...,We should ban naturopathy,0.753805,1,1.0,Una prohibición de la naturopathy crea un fren...,Deberíamos prohibir la naturopatía.,L'interdiction de la naturopathie crée un fron...,Nous devrions interdire la naturopathie,Un divieto di naturopatia crea un fronte coeso...,Dovremmo vietare la naturopatia,Ein Verbot der Naturheilkunde schafft eine zus...,Wir sollten Naturopathie verbieten,Een verbod op natuurgeneeskunde creëert een sa...,We moeten naturopathie verbieden.


## **Data Cleaning**

In [4]:
# Extract the necessary columns.
# .copy() gives an independent frame, so adding the "quality" column later
# does not raise SettingWithCopyWarning or write into a temporary slice.
df = df[["argument_EN", "quality_score_EN"]].copy()

In [5]:
def set_quality(score, low_max=0.33, average_max=0.66):
    """Bucket a numeric quality score into a categorical label.

    Args:
        score: Numeric quality score.
        low_max: Inclusive upper bound of the "Low" bucket.
        average_max: Inclusive upper bound of the "Average" bucket.

    Returns:
        "Low" if ``score <= low_max``, "Average" if
        ``low_max < score <= average_max``, otherwise "High".

    Note:
        A NaN score fails both comparisons and therefore falls through to
        "High"; filter out missing scores beforehand if that is not wanted.
    """
    if score <= low_max:
        return "Low"
    # The first branch already returned for score <= low_max, so only the
    # upper bound needs checking here.
    if score <= average_max:
        return "Average"
    return "High"

In [6]:
# Label each argument by bucketing its numeric quality score element-wise.
df["quality"] = df["quality_score_EN"].map(set_quality)

In [7]:
# Preview the first five rows with the new "quality" column.
df.head(n=5)

Unnamed: 0,argument_EN,quality_score_EN,quality
0,"""marriage"" isn't keeping up with the times. a...",0.846165,High
1,.a multi-party system would be too confusing a...,0.891271,High
2,`people reach their limit when it comes to the...,0.730395,High
3,"100% agree, should they do that, it would be a...",0.236686,Low
4,A ban on naturopathy creates a cohesive front ...,0.753805,High


## **Data Preprocessing**

In [8]:
# English stopword list, plus a set view of it for fast membership tests.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

# Stemmer applied to every token that survives stopword removal.
english_stemmer = SnowballStemmer("english")

In [9]:
def clean_text(text, stopword_set=None, stemmer=None):
    """Normalise one argument string for vectorisation.

    Every non-word character is replaced by a space, the text is split on
    whitespace, tokens found in the stopword set are dropped, and the
    remaining tokens are stemmed and re-joined with single spaces.

    Args:
        text: Raw text (must be a ``str``).
        stopword_set: Collection of tokens to drop. Defaults to the
            module-level ``stop_words``.
        stemmer: Object exposing ``stem(token) -> str``. Defaults to the
            module-level ``english_stemmer``.

    Returns:
        The cleaned, space-joined string ("" when no token survives).
    """
    if stopword_set is None:
        stopword_set = stop_words
    if stemmer is None:
        stemmer = english_stemmer

    # After the substitution the only whitespace left is plain spaces, and
    # str.split() already collapses runs and ignores leading/trailing blanks,
    # so no separate "squeeze spaces" / "strip trailing" passes are needed.
    # (The former `text.replace('', '')` step was a no-op and is dropped.)
    tokens = re.sub(r'[^\w]', ' ', text).split()

    # NOTE(review): the stopword test runs on the token as written, before
    # stemming; if the stopword list is all lower-case, capitalised stopwords
    # ("The") survive. Left unchanged because this presumably has to match
    # the preprocessing used when the vectoriser was fitted - confirm.
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in stopword_set
    )

In [10]:
# Clean every English argument. `text` ends up bound to the list of cleaned
# strings, which is what the vectoriser consumes further down.
cleaned_text = [clean_text(argument) for argument in df["argument_EN"]]
text = cleaned_text

## **Load Model**

In [11]:
# Instantiate the project-local CustomModel (imported from `model`).
# NOTE(review): the cell output shows a Keras model archive being read here,
# so the constructor presumably loads trained weights from disk - confirm in
# model.py.
model = CustomModel()

2022-12-19 08:36:32.370316: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-19 08:36:32.853841: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-19 08:36:32.853874: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-19 08:36:34.326026: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

Keras model archive loading:
File Name                                             Modified             Size
variables.h5                                   2022-12-16 09:33:44       644152
config.json                                    2022-12-16 09:33:44         3320
metadata.json                                  2022-12-16 09:33:44           64
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......dropout_2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........2
.........3
.........4
.........5
.........6
.........7
.........8
...vars
Ke

2022-12-19 08:36:35.877501: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-19 08:36:35.877897: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-19 08:36:35.878019: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-12-19 08:36:35.878090: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-12-19 08:36:35.878156: W tensorflow/c

config.json                                    2022-12-16 09:33:44         3328
metadata.json                                  2022-12-16 09:33:44           64
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dense_2
.........vars
............0
............1
......dense_3
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......dropout_2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........2
.........3
.........4
.........5
.........6
.........7
.........8
...vars
Keras model archive loading:
File Name                                             Modified             Size
variables.h5                                   2022-12-16 09:33:44       644152
co

## **Get Prediction**

In [12]:
# Load label encoder.
# The context manager closes the file handle once unpickling finishes.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced
# by this project's own training run.
with open("encoders/label_encoder.pkl", "rb") as encoder_file:
    label_encoder = pickle.load(encoder_file)

In [13]:
# Load vectorizer.
# The context manager closes the file handle once unpickling finishes.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced
# by this project's own training run.
with open("vectorizers/count_vectorizer.pkl", "rb") as vectorizer_file:
    count_vectorizer = pickle.load(vectorizer_file)

In [14]:
# Vectorise the cleaned arguments, then convert to a dense array for the model.
# NOTE(review): the prediction cell logs ~488 MB allocations; passing the
# un-densified matrix would save memory if CustomModel.predict accepts it -
# confirm before changing.
term_counts = count_vectorizer.transform(text)
vector = term_counts.toarray()

In [15]:
# Run the loaded model on the vectorised arguments.
# NOTE(review): the next cell passes `pred` to label_encoder.inverse_transform,
# so predict() presumably returns encoded class labels rather than
# probabilities - confirm in model.py.
pred = model.predict(vector)

2022-12-19 08:36:37.113034: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 488195976 exceeds 10% of free system memory.
2022-12-19 08:36:39.888836: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 488195976 exceeds 10% of free system memory.
2022-12-19 08:36:42.274647: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 488195976 exceeds 10% of free system memory.


In [16]:
def _label_in_parentheses(label):
    """Return the text between the first "(" and the next ")" in ``label``.

    A label with no parenthesised part is returned unchanged. The previous
    ``str.find``-based slice got -1 for a missing parenthesis and silently
    chopped the last character off such labels.
    """
    match = re.search(r"\(([^)]*)\)", label)
    return match.group(1) if match else label


# Map the encoded predictions back to the encoder's class names, then keep
# only the part inside parentheses so the values can be compared with the
# labels stored in df["quality"].
# NOTE: this cell rebinds `pred`; re-run the prediction cell before re-running
# this one.
decoded_labels = label_encoder.inverse_transform(pred)
pred = np.array([_label_in_parentheses(label) for label in decoded_labels])

## **Evaluate Prediction**

In [17]:
# Per-class precision/recall/F1 of the predictions against the derived labels.
# zero_division=0 states explicitly that an undefined metric is reported as
# 0.0, which is the value printed before, without the repeated
# UndefinedMetricWarning (`_warn_prf`) noise.
print(classification_report(df["quality"], pred, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     Average       0.13      0.02      0.04      5942
        High       0.00      0.00      0.00     23700
         Low       0.03      0.99      0.06       855

    accuracy                           0.03     30497
   macro avg       0.05      0.34      0.03     30497
weighted avg       0.03      0.03      0.01     30497



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
