## TF-IDF + Classifiers

In [None]:
"./data/SubtaskC_train_dev.jsonl"

In [20]:
import pandas as pd
df = pd.read_json("./data/SubtaskB.jsonl", lines=True)
print(len(df))

122811


In [23]:
df.label.value_counts()

0    22892
3    17340
4    17332
1    17179
5    17046
2    16678
6    14344
Name: label, dtype: int64

In [24]:
df.model.value_counts()

bloomz           17332
human            17179
chatGPT          16892
cohere           16678
gpt4             14344
davinci          14340
dolly            14046
gpt-3.5-turbo     6000
davinci-003       3000
dolly-v2-12b      3000
Name: model, dtype: int64

In [39]:
# From the notebook of the authors
label_map = {
    0: "chatGPT",
    1: "human",
    2: "cohere",
    3: "davinci",
    4: "bloomz",
    5: "dolly",
    6: "gpt-4",
}

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.text, df.label, test_size=0.3, random_state=38)

In [31]:
X_train_text = X_train
X_test_text = X_test

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train)

In [33]:
X_test = vectorizer.transform(X_test)

## Logreg

In [34]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=38)

clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
y_pred = clf.predict(X_test)

In [49]:
label_map = list(dict(sorted(label_map.items())).values())
label_map

['chatGPT', 'human', 'cohere', 'davinci', 'bloomz', 'dolly', 'gpt-4']

In [50]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

     chatGPT       0.78      0.83      0.81      6870
       human       0.78      0.79      0.78      5193
      cohere       0.73      0.73      0.73      4971
     davinci       0.80      0.75      0.77      5237
      bloomz       0.95      0.97      0.96      5264
       dolly       0.69      0.67      0.68      5019
       gpt-4       0.83      0.81      0.82      4290

    accuracy                           0.80     36844
   macro avg       0.79      0.79      0.79     36844
weighted avg       0.79      0.80      0.79     36844



## Boosting

In [51]:
from sklearn.ensemble import GradientBoostingClassifier

In [54]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0, verbose=True).fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.7466           21.05m
         2           1.6303           20.71m
         3           1.5670           20.47m
         4           1.5129           20.25m
         5           1.4653           20.03m
         6           1.4291           19.80m
         7           1.3945           19.59m
         8           1.3608           19.38m
         9           1.3296           19.17m
        10           1.3018           18.95m
        20           1.1154           16.84m
        30           1.0102           14.72m
        40           0.9341           12.61m
        50           0.8779           10.51m
        60           0.8303            8.40m
        70           0.7953            6.30m
        80           0.7658            4.20m
        90           0.7397            2.10m
       100           0.7189            0.00s


In [55]:
y_pred = clf.predict(X_test)

In [56]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

     chatGPT       0.75      0.77      0.76      6870
       human       0.70      0.68      0.69      5193
      cohere       0.63      0.64      0.63      4971
     davinci       0.67      0.64      0.65      5237
      bloomz       0.92      0.94      0.93      5264
       dolly       0.57      0.58      0.57      5019
       gpt-4       0.78      0.76      0.77      4290

    accuracy                           0.72     36844
   macro avg       0.72      0.72      0.72     36844
weighted avg       0.72      0.72      0.72     36844



## SVC

In [57]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
clf = SVC(gamma='auto', verbose=True)

In [58]:
X_train.shape

(85967, 5000)

In [60]:
clf.fit(X_train[0:10000], y_train[0:10000])

  clf.fit(X_train[0:10000], y_train[0:10000])


[LibSVM].
*
optimization finished, #iter = 1397
obj = -2789.011311, rho = -0.998392
nSV = 2794, nBSV = 2794
.
*
optimization finished, #iter = 1363
obj = -2720.086665, rho = -0.993038
nSV = 2722, nBSV = 2722
.
*
optimization finished, #iter = 1444
obj = -2874.452305, rho = -0.996864
nSV = 2876, nBSV = 2876
.
*
optimization finished, #iter = 1425
obj = -2835.178902, rho = -0.972090
nSV = 2850, nBSV = 2850
.
*
optimization finished, #iter = 1387
obj = -2763.636164, rho = -0.992364
nSV = 2766, nBSV = 2766
.
*
optimization finished, #iter = 1156
obj = -2306.816967, rho = -0.997888
nSV = 2308, nBSV = 2308
.
*
optimization finished, #iter = 1361
obj = -2717.156712, rho = -0.973957
nSV = 2722, nBSV = 2722
.
*
optimization finished, #iter = 1433
obj = -2789.087047, rho = 0.994506
nSV = 2794, nBSV = 2794
.
*
optimization finished, #iter = 1405
obj = -2779.026397, rho = 0.996868
nSV = 2794, nBSV = 2794
.
*
optimization finished, #iter = 1383
obj = -2762.221302, rho = -0.970904
nSV = 2766, nBSV =

In [61]:
y_pred = clf.predict(X_test)

In [62]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred), target_names = label_map)

              precision    recall  f1-score   support

           0       0.19      1.00      0.31      6870
           1       0.00      0.00      0.00      5193
           2       0.00      0.00      0.00      4971
           3       0.00      0.00      0.00      5237
           4       0.00      0.00      0.00      5264
           5       0.00      0.00      0.00      5019
           6       0.00      0.00      0.00      4290

    accuracy                           0.19     36844
   macro avg       0.03      0.14      0.04     36844
weighted avg       0.03      0.19      0.06     36844



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Word2Vec

In [64]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m65.6 MB/s[0

In [65]:
# from gensim.models import Word2Vec

# sentences = [sentence.split() for sentence in X_train_text]
# w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

In [66]:
# w2v_model.save("word2vec_multiclass.model")

In [67]:
from gensim.models import Word2Vec

model = Word2Vec.load("word2vec_multiclass.model")

In [68]:
import numpy as np

def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_train_vector = np.array([vectorize(sentence) for sentence in X_train_text])
X_test_vector = np.array([vectorize(sentence) for sentence in X_test_text])

## Logreg

In [69]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_vector, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [70]:
y_pred = clf.predict(X_test_vector)

In [72]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

     chatGPT       0.57      0.64      0.60      6870
       human       0.61      0.66      0.63      5193
      cohere       0.48      0.45      0.46      4971
     davinci       0.51      0.47      0.49      5237
      bloomz       0.91      0.94      0.92      5264
       dolly       0.43      0.39      0.41      5019
       gpt-4       0.61      0.60      0.61      4290

    accuracy                           0.60     36844
   macro avg       0.59      0.59      0.59     36844
weighted avg       0.59      0.60      0.59     36844



## Boosting

In [73]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0, verbose=True).fit(X_train_vector, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.8152           17.76m
         2           1.7392           17.55m
         3           1.6702           17.35m
         4           1.6203           17.18m
         5           1.5819           17.00m
         6           1.5503           16.81m
         7           1.5233           16.64m
         8           1.4983           16.47m
         9           1.4739           16.29m
        10           1.4556           16.11m
        20           1.3310           14.32m
        30           1.2555           12.53m
        40           1.2036           10.73m
        50           1.1647            8.94m
        60           1.1342            7.15m
        70           1.1076            5.36m
        80           1.0854            3.58m
        90           1.0671            1.79m
       100           1.0486            0.00s


In [74]:
y_pred = clf.predict(X_test_vector)

In [75]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

     chatGPT       0.56      0.62      0.59      6870
       human       0.60      0.67      0.63      5193
      cohere       0.53      0.51      0.52      4971
     davinci       0.50      0.43      0.46      5237
      bloomz       0.86      0.90      0.88      5264
       dolly       0.43      0.39      0.41      5019
       gpt-4       0.60      0.54      0.57      4290

    accuracy                           0.59     36844
   macro avg       0.58      0.58      0.58     36844
weighted avg       0.58      0.59      0.58     36844



## KNN

In [76]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)

In [77]:
clf.fit(X_train_vector, y_train)

In [78]:
y_pred = clf.predict(X_test_vector)

In [79]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

     chatGPT       0.51      0.71      0.59      6870
       human       0.52      0.80      0.63      5193
      cohere       0.70      0.46      0.55      4971
     davinci       0.44      0.40      0.42      5237
      bloomz       0.91      0.84      0.87      5264
       dolly       0.55      0.32      0.40      5019
       gpt-4       0.64      0.51      0.57      4290

    accuracy                           0.59     36844
   macro avg       0.61      0.58      0.58     36844
weighted avg       0.61      0.59      0.58     36844



## SVC

In [93]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
clf = SVC(gamma='auto', verbose=True)

In [95]:
clf.fit(X_train_vector[0:2000], y_train[0:2000])

[LibSVM]*
optimization finished, #iter = 955
obj = -1227.250966, rho = 6.655179
nSV = 1446, nBSV = 1402
Total nSV = 1446


  clf.fit(X_train_vector[0:2000], y_train[0:2000])


In [96]:
y_pred = clf.predict(X_test_vector)

In [97]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.82      0.77     19574
           1       0.85      0.77      0.81     26269

    accuracy                           0.79     45843
   macro avg       0.79      0.79      0.79     45843
weighted avg       0.80      0.79      0.79     45843



## Glove

In [98]:
# First we will prepare our train dataset

In [81]:
corpus = X_train_text.apply(lambda x: x.replace('\n', ' '))
corpus = X_train_text.apply(lambda x: x.replace('<unk>', '')).values

In [82]:
corpus = '\n'.join(list(corpus))

In [83]:
len(corpus)

178074286

In [84]:
with open('train_corpus_multiclass.txt', "w+") as f:
    f.write(corpus)

In [85]:
# I performed the training
result_path = "GloVe/vectors_multiclass.txt"

In [86]:
# Load GloVe embeddings into a dictionary
def load_embeddings(file_path):
    embeddings = {}
    bad_values = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector
            except ValueError:
                bad_values += 1
    print(bad_values)
    return embeddings

glove_embeddings_path = result_path  # Adjust the path to your downloaded GloVe file
glove_embeddings = load_embeddings(glove_embeddings_path)

33


In [87]:
len(glove_embeddings)

133459

In [88]:
# for some reason the dimensions for those values are incorrect, lets just remove them
counter  = 0
for i in m:
    if len(i) != 50:
        counter +=1
print(counter)

NameError: name 'm' is not defined

In [89]:
from copy import deepcopy

ite = deepcopy(glove_embeddings)

In [90]:
from copy import copy
for name, i in ite.items():
    if len(i) != 50:
        glove_embeddings.pop(name)

In [91]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [glove_embeddings[word] for word in words if word in glove_embeddings]
    if len(words_vecs) == 0:
        return np.zeros(50)
    words_vecs = np.array(words_vecs, dtype='object')
    return words_vecs.mean(axis=0)

X_train_vector = np.array([vectorize(sentence) for sentence in X_train_text])
X_test_vector = np.array([vectorize(sentence) for sentence in X_test_text])

In [92]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_vector, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [95]:
y_pred = clf.predict(X_test_vector)

In [96]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

     chatGPT       0.45      0.58      0.51      6870
       human       0.51      0.60      0.55      5193
      cohere       0.35      0.27      0.30      4971
     davinci       0.36      0.28      0.32      5237
      bloomz       0.83      0.89      0.86      5264
       dolly       0.36      0.30      0.33      5019
       gpt-4       0.46      0.44      0.45      4290

    accuracy                           0.49     36844
   macro avg       0.47      0.48      0.47     36844
weighted avg       0.48      0.49      0.48     36844



## Boosting

In [97]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0, verbose=True).fit(X_train_vector, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.8388            8.86m
         2           1.7704            8.78m
         3           1.7236            8.68m
         4           1.6827            8.58m
         5           1.6473            8.48m
         6           1.6190            8.39m
         7           1.5914            8.30m
         8           1.5682            8.21m
         9           1.5505            8.12m
        10           1.5342            8.04m
        20           1.4198            7.14m
        30           1.3573            6.26m
        40           1.3166            5.37m
        50           1.2855            4.47m
        60           1.2614            3.58m
        70           1.2422            2.69m
        80           1.2241            1.79m
        90           1.2095           53.70s
       100           1.1959            0.00s


In [100]:
y_pred = clf.predict(X_test_vector)

In [101]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

     chatGPT       0.51      0.59      0.54      6870
       human       0.54      0.64      0.59      5193
      cohere       0.45      0.39      0.42      4971
     davinci       0.40      0.34      0.37      5237
      bloomz       0.80      0.86      0.83      5264
       dolly       0.38      0.33      0.36      5019
       gpt-4       0.52      0.47      0.50      4290

    accuracy                           0.52     36844
   macro avg       0.51      0.52      0.51     36844
weighted avg       0.52      0.52      0.52     36844



## KNN

In [102]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)

In [103]:
clf.fit(X_train_vector, y_train)

In [104]:
y_pred = clf.predict(X_test_vector)

Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7f23de71be20>
Traceback (most recent call last):
  File "/home/user/conda/lib/python3.11/site-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/home/user/conda/lib/python3.11/site-packages/threadpoolctl.py", line 1175, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/home/user/conda/lib/python3.11/site-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/conda/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: /home/user/conda/lib/python3.11/site-packages/sc

In [106]:
print(classification_report(y_test, y_pred, target_names = label_map))

              precision    recall  f1-score   support

     chatGPT       0.48      0.68      0.56      6870
       human       0.48      0.80      0.60      5193
      cohere       0.61      0.38      0.47      4971
     davinci       0.39      0.36      0.38      5237
      bloomz       0.91      0.80      0.85      5264
       dolly       0.52      0.28      0.36      5019
       gpt-4       0.58      0.43      0.50      4290

    accuracy                           0.54     36844
   macro avg       0.57      0.53      0.53     36844
weighted avg       0.56      0.54      0.54     36844



## SVC

In [202]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
clf = SVC(gamma='auto', verbose=True)

In [203]:
clf.fit(X_train_vector, y_train)

[LibSVM]...................................
*.
*
optimization finished, #iter = 36232
obj = -59137.966641, rho = 49.734867
nSV = 64282, nBSV = 64211
Total nSV = 64282


KeyboardInterrupt: 

In [None]:
y_pred = clf.predict(X_test_vector)

In [None]:
print(classification_report(y_test, y_pred))