## TF-IDF + Classifiers

In [1]:
import pandas as pd
df = pd.read_json("./data/subtaskC_train_dev.jsonl", lines=True)
print(len(df))

19527


In [2]:
df.columns

Index(['uuid', 'id', 'source', 'title', 'abstract', 'prompt',
       'full_human_review', 'human_end_boundary', 'cut_at_sentence',
       'truncated_human_review', 'machine_review', 'mixed_review',
       'domain_model', 'split'],
      dtype='object')

In [3]:
df.shape

(19527, 14)

In [4]:
df.full_human_review.isna().sum()

0

In [5]:
df.mixed_review.isna().sum()

0

In [6]:
df.machine_review.isna().sum()

62

In [7]:
df = df.dropna(subset=['machine_review'])

In [8]:
#shuffle rows
df = df.sample(frac=1).reset_index(drop=True)

In [9]:
df.shape

(19465, 14)

In [10]:
19465/3

6488.333333333333

In [21]:
df.iloc[0:6488]['full_human_review']

0       The paper provides an exposition of multiple w...
1       Dear reviewers, we made the following changes ...
2       The authors construct a new dataset of 1200 Si...
3       After discussion, the reviewers unanimously re...
4       The manuscript is a bit scattered and hard to ...
                              ...                        
6483    The paper describes a method to evaluate gener...
6484    In light of the detailed author responses and ...
6485    The manuscript is a bit scattered and hard to ...
6486    The paper looks at the problem of transferring...
6487    My main objection with this work is that it op...
Name: full_human_review, Length: 6488, dtype: object

In [24]:
train_human.values

array(['The paper provides an exposition of multiple ways of learning in implicit generative models, of which generative adversarial networks are an example. The paper is very clear, the exposition is insightful, and the presented material is clearly important.\n\nIt is hard to assess "novelty" of this work, as the individual pieces are not novel, and yet the exposition of all of them in the same space with clear outline of the connections between them is novel.\n\nI believe this work is significant - it provides a bridge for language and methods used in multiple parts of statistics and machine learning. This has the potential to accelerate progress.\n\nI recommend publishing this paper at ICLR, even though it is not the "typical" paper that get published at this conference (in that it doesn\'t offer empirical validation, nor makes a particular claim about relative merits of different methods).',
       'Dear reviewers, we made the following changes to our paper:\n\n– added direct comp

In [29]:
train_human = list(df.iloc[0:6488]['full_human_review'].values)
train_mixed = list(df.iloc[6488:6488*2]['mixed_review'].values)
train_machine = list(df.iloc[6488*2:]['machine_review'].values)

In [30]:
train_set = train_human + train_mixed + train_machine

In [31]:
len(train_set)

19465

In [32]:
labels = [0]*6488 + [1]*6488 + [2]*6489
len(labels)

19465

In [33]:
# From the notebook of the authors
label_map = {
    0: "human",
    1: "mixed",
    2: "machine",
}

In [56]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_set, labels, test_size=0.3, random_state=38, shuffle=True)

In [57]:
X_train_text = X_train
X_test_text = X_test

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train)

In [59]:
X_test = vectorizer.transform(X_test)

## Logreg

In [60]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=38)

clf.fit(X_train, y_train)

In [61]:
y_pred = clf.predict(X_test)

In [62]:
label_map = list(dict(sorted(label_map.items())).values())
label_map

AttributeError: 'list' object has no attribute 'items'

In [63]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.87      0.94      0.90      1931
       mixed       0.74      0.65      0.69      1961
     machine       0.77      0.79      0.78      1948

    accuracy                           0.80      5840
   macro avg       0.79      0.80      0.79      5840
weighted avg       0.79      0.80      0.79      5840



## Boosting

In [64]:
from sklearn.ensemble import GradientBoostingClassifier

In [65]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0, verbose=True).fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.9638           54.55s
         2           0.9002           52.95s
         3           0.8559           52.10s
         4           0.8241           51.46s
         5           0.8009           50.83s
         6           0.7821           50.24s
         7           0.7629           49.65s
         8           0.7496           49.10s
         9           0.7360           48.58s
        10           0.7224           48.03s
        20           0.6429           42.54s
        30           0.5980           37.18s
        40           0.5672           31.91s
        50           0.5417           26.57s
        60           0.5204           21.24s
        70           0.5003           15.95s
        80           0.4822           10.65s
        90           0.4657            5.33s
       100           0.4511            0.00s


In [66]:
y_pred = clf.predict(X_test)

In [67]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.85      0.90      0.88      1931
       mixed       0.71      0.63      0.67      1961
     machine       0.74      0.78      0.76      1948

    accuracy                           0.77      5840
   macro avg       0.77      0.77      0.77      5840
weighted avg       0.77      0.77      0.77      5840



## SVC

In [68]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
clf = SVC(gamma='auto', verbose=True)

In [69]:
X_train.shape

(13625, 5000)

In [70]:
clf.fit(X_train[0:3000], y_train[0:3000])

[LibSVM]*.
*
optimization finished, #iter = 1008
obj = -1989.458995, rho = 0.991063
nSV = 1992, nBSV = 1992
*
optimization finished, #iter = 996
obj = -1987.071318, rho = 0.006730
nSV = 1992, nBSV = 1992
*
optimization finished, #iter = 996
obj = -1990.548245, rho = -0.987223
nSV = 1992, nBSV = 1992
Total nSV = 3000


In [71]:
y_pred = clf.predict(X_test)

In [72]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.00      0.00      0.00      1931
       mixed       0.34      1.00      0.50      1961
     machine       0.00      0.00      0.00      1948

    accuracy                           0.34      5840
   macro avg       0.11      0.33      0.17      5840
weighted avg       0.11      0.34      0.17      5840



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Word2Vec

In [64]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m65.6 MB/s[0

In [73]:
# from gensim.models import Word2Vec

# sentences = [sentence.split() for sentence in X_train_text]
# w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

In [74]:
# w2v_model.save("word2vec_mixed.model")

In [75]:
from gensim.models import Word2Vec

model = Word2Vec.load("word2vec_mixed.model")

In [76]:
import numpy as np

def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_train_vector = np.array([vectorize(sentence) for sentence in X_train_text])
X_test_vector = np.array([vectorize(sentence) for sentence in X_test_text])

## Logreg

In [77]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_vector, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
y_pred = clf.predict(X_test_vector)

In [79]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.81      0.90      0.85      1931
       mixed       0.60      0.52      0.56      1961
     machine       0.68      0.69      0.69      1948

    accuracy                           0.70      5840
   macro avg       0.69      0.70      0.70      5840
weighted avg       0.69      0.70      0.70      5840



## Boosting

In [80]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0, verbose=True).fit(X_train_vector, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.9398           57.73s
         2           0.8733           57.46s
         3           0.8289           57.50s
         4           0.7959           56.69s
         5           0.7711           56.06s
         6           0.7475           55.37s
         7           0.7285           54.67s
         8           0.7129           54.01s
         9           0.6995           53.36s
        10           0.6875           52.74s
        20           0.6237           46.78s
        30           0.5896           40.90s
        40           0.5671           35.07s
        50           0.5501           29.22s
        60           0.5362           23.37s
        70           0.5243           17.53s
        80           0.5136           11.68s
        90           0.5039            5.84s
       100           0.4951            0.00s


In [81]:
y_pred = clf.predict(X_test_vector)

In [82]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.84      0.90      0.87      1931
       mixed       0.66      0.62      0.64      1961
     machine       0.75      0.74      0.74      1948

    accuracy                           0.75      5840
   macro avg       0.75      0.75      0.75      5840
weighted avg       0.75      0.75      0.75      5840



## KNN

In [83]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)

In [84]:
clf.fit(X_train_vector, y_train)

In [85]:
y_pred = clf.predict(X_test_vector)

In [86]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.76      0.93      0.84      1931
       mixed       0.56      0.59      0.58      1961
     machine       0.75      0.54      0.63      1948

    accuracy                           0.69      5840
   macro avg       0.69      0.69      0.68      5840
weighted avg       0.69      0.69      0.68      5840



## SVC

In [87]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
clf = SVC(gamma='auto', verbose=True)

In [88]:
clf.fit(X_train_vector[0:2000], y_train[0:2000])

[LibSVM]*
optimization finished, #iter = 456
obj = -670.312878, rho = 0.781101
nSV = 796, nBSV = 778
*
optimization finished, #iter = 347
obj = -417.596388, rho = 5.521184
nSV = 553, nBSV = 533
*
optimization finished, #iter = 606
obj = -970.515764, rho = 8.208092
nSV = 1074, nBSV = 1060
Total nSV = 1626


In [89]:
y_pred = clf.predict(X_test_vector)

In [92]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.74      0.89      0.81      1931
       mixed       0.59      0.58      0.59      1961
     machine       0.74      0.61      0.67      1948

    accuracy                           0.69      5840
   macro avg       0.69      0.69      0.69      5840
weighted avg       0.69      0.69      0.69      5840



## Glove

In [93]:
# First we will prepare our train dataset

In [97]:
corpus = pd.Series(X_train_text).apply(lambda x: x.replace('\n', ' '))
corpus = corpus.apply(lambda x: x.replace('<unk>', '')).values

In [98]:
corpus = '\n'.join(list(corpus))

In [99]:
len(corpus)

16506014

In [100]:
with open('train_corpus_mixed.txt', "w+") as f:
    f.write(corpus)

In [101]:
# I performed the training
result_path = "GloVe/vectors_mixed.txt"

In [102]:
# Load GloVe embeddings into a dictionary
def load_embeddings(file_path):
    embeddings = {}
    bad_values = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector
            except ValueError:
                bad_values += 1
    print(bad_values)
    return embeddings

glove_embeddings_path = result_path  # Adjust the path to your downloaded GloVe file
glove_embeddings = load_embeddings(glove_embeddings_path)

1


In [103]:
len(glove_embeddings)

20134

In [104]:
# for some reason the dimensions for those values are incorrect, lets just remove them
counter  = 0
for i in m:
    if len(i) != 50:
        counter +=1
print(counter)

NameError: name 'm' is not defined

In [105]:
from copy import deepcopy

ite = deepcopy(glove_embeddings)

In [106]:
from copy import copy
for name, i in ite.items():
    if len(i) != 50:
        glove_embeddings.pop(name)

In [107]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [glove_embeddings[word] for word in words if word in glove_embeddings]
    if len(words_vecs) == 0:
        return np.zeros(50)
    words_vecs = np.array(words_vecs, dtype='object')
    return words_vecs.mean(axis=0)

X_train_vector = np.array([vectorize(sentence) for sentence in X_train_text])
X_test_vector = np.array([vectorize(sentence) for sentence in X_test_text])

In [108]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_vector, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [109]:
y_pred = clf.predict(X_test_vector)

In [110]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.77      0.87      0.82      1931
       mixed       0.57      0.49      0.52      1961
     machine       0.67      0.67      0.67      1948

    accuracy                           0.68      5840
   macro avg       0.67      0.68      0.67      5840
weighted avg       0.67      0.68      0.67      5840



## Boosting

In [111]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0, verbose=True).fit(X_train_vector, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.9201           29.15s
         2           0.8596           28.76s
         3           0.8251           28.41s
         4           0.7972           28.10s
         5           0.7723           27.81s
         6           0.7535           27.51s
         7           0.7358           27.21s
         8           0.7238           26.95s
         9           0.7126           26.69s
        10           0.7021           26.42s
        20           0.6448           23.50s
        30           0.6134           20.55s
        40           0.5933           17.60s
        50           0.5781           14.66s
        60           0.5644           11.74s
        70           0.5539            8.80s
        80           0.5454            5.87s
        90           0.5374            2.93s
       100           0.5312            0.00s


In [112]:
y_pred = clf.predict(X_test_vector)

In [113]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.81      0.87      0.84      1931
       mixed       0.63      0.61      0.62      1961
     machine       0.75      0.71      0.73      1948

    accuracy                           0.73      5840
   macro avg       0.73      0.73      0.73      5840
weighted avg       0.73      0.73      0.73      5840



## KNN

In [114]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)

In [115]:
clf.fit(X_train_vector, y_train)

In [116]:
y_pred = clf.predict(X_test_vector)

In [117]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.75      0.93      0.83      1931
       mixed       0.57      0.54      0.55      1961
     machine       0.73      0.60      0.66      1948

    accuracy                           0.69      5840
   macro avg       0.68      0.69      0.68      5840
weighted avg       0.68      0.69      0.68      5840



## SVC

In [118]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
clf = SVC(gamma='auto', verbose=True)

In [119]:
clf.fit(X_train_vector, y_train)

[LibSVM]..
*
optimization finished, #iter = 2783
obj = -4494.852749, rho = 3.401376
nSV = 5073, nBSV = 5050
.
*
optimization finished, #iter = 1856
obj = -2544.898259, rho = 14.171991
nSV = 3202, nBSV = 3167
...
*.
*
optimization finished, #iter = 3852
obj = -6459.739614, rho = 17.994699
nSV = 6925, nBSV = 6897
Total nSV = 10558


In [120]:
y_pred = clf.predict(X_test_vector)

In [121]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

       human       0.75      0.90      0.82      1931
       mixed       0.60      0.53      0.56      1961
     machine       0.72      0.66      0.69      1948

    accuracy                           0.69      5840
   macro avg       0.69      0.69      0.69      5840
weighted avg       0.69      0.69      0.69      5840

