# Deep Learning Project
## Istanbul Technical University
### Computer Engineering Department
### BLG 527E - Deep Learning
### Spring 2023

<b>Student Name: Muhammed Rüşen Birben</b><br>
<b>Student ID: 150220755</b><br>
<b>Student Email: birben20@itu.edu.tr</b><br>

<b>Student Name: Ahmed Burak Ercan</b><br>
<b>Student ID: 150220749</b><br>
<b>Student Email: ercana20@itu.edu.tr</b><br>

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from os import makedirs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from utils import get_model, model_predict, get_label_output
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [14]:
import warnings
warnings.filterwarnings("ignore")

### Loading the Data

In [15]:
# Reading the data.
df = pd.read_csv('datasetV2.csv')
df.head()
df = df[:100]

In [16]:
X = df[:100].Text
y = df[:100].IsAI

### Model Development

In [17]:
def predict(model_name, X):
    # load model
    model, tokenizer = get_model(model_name)

    # sentences
    text = X.values.tolist()

    # predict
    predictions = model_predict(model, tokenizer, text)

    return predictions, model, tokenizer

In [18]:
def eval_model(y_predictons, y_true):
    #Evaluate the model by computing precision, recall and F1-score.
    pred_labels = get_label_output(y_predictons)

    accuracy = accuracy_score(y_true, pred_labels)
    recall = recall_score(y_true, pred_labels, average='macro')
    precision = precision_score(y_true, pred_labels, average='macro')
    f1 = f1_score(y_true, pred_labels, average='macro')
    
    conf_matrix = confusion_matrix(y_true, pred_labels)
    
    print('Accuracy: {:.2f}%'.format(accuracy*100))
    print('Recall: {:.2f}%'.format(recall*100))
    print('Precision: {:.2f}%'.format(precision*100))
    print('F1-score: {:.2f}%'.format(f1*100))

    return conf_matrix

In [19]:
predictions, model, tokenizer = predict("chatgpt-detector-lli-hc3", X)

Model already exists, loading...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [20]:
eval_model(predictions, y)

Accuracy: 59.00%
Recall: 54.68%
Precision: 54.62%
F1-score: 54.64%


array([[14, 20],
       [21, 45]], dtype=int64)

### Fine-Tuning

In [21]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [22]:
# Splitting the DataFrame into train and remaining datasets
train_df, remaining_df = train_test_split(df, test_size=0.30, random_state=42)

# Splitting the remaining dataset into validation and test datasets
val_df, test_df = train_test_split(remaining_df, test_size=0.4, random_state=42)


train_encodings = tokenizer(train_df['Text'].tolist(), padding=True, truncation=True, max_length=512)
val_encodings = tokenizer(val_df['Text'].tolist(), padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(test_df['Text'].tolist(), padding=True, truncation=True, max_length=512)

train_dataset = TextDataset(train_encodings, train_df['IsAI'].tolist())
val_dataset = TextDataset(val_encodings, val_df['IsAI'].tolist())
test_dataset = TextDataset(test_encodings, test_df['IsAI'].tolist())

In [23]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch"      # evaluate each `logging_steps`
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Train and evaluate
trainer.train()

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.3414738178253174, 'eval_runtime': 2.2329, 'eval_samples_per_second': 8.061, 'eval_steps_per_second': 1.344, 'epoch': 1.0}
{'loss': 1.5254, 'learning_rate': 1.0000000000000002e-06, 'epoch': 1.11}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.273801326751709, 'eval_runtime': 2.2422, 'eval_samples_per_second': 8.028, 'eval_steps_per_second': 1.338, 'epoch': 2.0}
{'loss': 1.0252, 'learning_rate': 2.0000000000000003e-06, 'epoch': 2.22}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 1.1937499046325684, 'eval_runtime': 2.269, 'eval_samples_per_second': 7.933, 'eval_steps_per_second': 1.322, 'epoch': 3.0}
{'train_runtime': 117.3427, 'train_samples_per_second': 1.79, 'train_steps_per_second': 0.23, 'train_loss': 1.1908508936564128, 'epoch': 3.0}


TrainOutput(global_step=27, training_loss=1.1908508936564128, metrics={'train_runtime': 117.3427, 'train_samples_per_second': 1.79, 'train_steps_per_second': 0.23, 'train_loss': 1.1908508936564128, 'epoch': 3.0})

In [24]:
predictions = trainer.predict(test_dataset)

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# softmax
predictons = F.softmax(torch.from_numpy(predictions.predictions), dim=1)
predictons = predictons.detach().numpy()


In [29]:
# argmax label
pred_labels = get_label_output(predictons)
y = test_df['IsAI'].tolist()

In [33]:
#eval_model(pred_labels, y) # y düzenle

accuracy = accuracy_score(y, pred_labels)
recall = recall_score(y, pred_labels, average='macro')
precision = precision_score(y, pred_labels, average='macro')
f1 = f1_score(y, pred_labels, average='macro')

conf_matrix = confusion_matrix(y, pred_labels)

print('Accuracy: {:.2f}%'.format(accuracy*100))
print('Recall: {:.2f}%'.format(recall*100))
print('Precision: {:.2f}%'.format(precision*100))
print('F1-score: {:.2f}%'.format(f1*100))


Accuracy: 41.67%
Recall: 37.50%
Precision: 38.57%
F1-score: 37.78%


In [34]:
pred_labels

array([1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1], dtype=int64)

In [35]:
y

[0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1]