In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import tensorflow as tf
from sklearn.metrics import accuracy_score

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt

import contractions

from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, TrainingArguments, Trainer

import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def read_data(mode):
    """Load the texts to classify, either interactively or from disk.

    Parameters
    ----------
    mode : int
        0 -- read one text and then one label from standard input.
        1 -- read the dataset from ``trail.csv`` in the working directory.

    Returns
    -------
    pandas.DataFrame
        Mode 0: a single-row frame with the columns ``text`` and
        ``truth_label``, both holding the strings exactly as typed.
        Mode 1: the contents of the CSV file.

    Raises
    ------
    ValueError
        If ``mode`` is neither 0 nor 1.
    """
    if mode == 0:
        text = input()
        label = input()
        # Wrap the scalars in lists so exactly one row is produced. Sizing the
        # index with len(label) repeated the row once per *character* of the
        # label string ("10" gave two rows, an empty label gave none).
        # NOTE(review): later cells read df['label'], while this branch names
        # the column 'truth_label' -- confirm which name is intended.
        return pd.DataFrame({'text': [text], 'truth_label': [label]})
    if mode == 1:
        return pd.read_csv('trail.csv')
    # Fail loudly instead of printing the message and returning None, which
    # only surfaced later as an unrelated error in the caller.
    raise ValueError("Please Enter valid mode that is either 0 or 1")

In [6]:
def cnn_model(df, model_path, model_tokenizer, max_len):
    """Label every row of ``df['text']`` with the saved Keras CNN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column; values are cast to ``str``.
    model_path : str
        Path handed to ``load_model``.
    model_tokenizer : str
        Path to a pickled tokenizer exposing ``texts_to_sequences``.
    max_len : int
        Length every token sequence is post-padded (or cut) to.

    Returns
    -------
    numpy.ndarray
        Integer array shaped like the model's ``predict`` output, holding 1
        where the score is above 0.5 and 0 elsewhere.
    """
    # pickle.load can execute arbitrary code stored in the file -- only point
    # this at tokenizer files from a trusted source.
    with open(model_tokenizer, "rb") as handle:
        fitted_tokenizer = pickle.load(handle)

    network = load_model(model_path)

    raw_texts = df['text'].astype(str).values
    padded_ids = pad_sequences(
        fitted_tokenizer.texts_to_sequences(raw_texts),
        padding='post',
        maxlen=max_len,
    )

    scores = network.predict(padded_ids)
    return np.where(scores > 0.5, 1, 0)

In [7]:
def roberta_model(df , model_path, model_tokenizer , max_len):
    """Predict a class index for every row of ``df`` with a fine-tuned RoBERTa.

    Each text is lower-cased, has its contractions expanded with
    ``contractions.fix`` and is split on '.'; the resulting list of pieces is
    tokenised as one batch and run through the classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. The frame is not modified.
    model_path : str
        Path to a ``torch`` checkpoint: a mapping whose ``'model_state_dict'``
        entry holds the classifier weights. ``torch.load`` unpickles the file,
        so only load checkpoints from a trusted source.
    model_tokenizer : str
        Name or directory passed to ``RobertaTokenizer.from_pretrained``.
    max_len : int
        Every piece is padded / truncated to this many tokens.

    Returns
    -------
    list
        One predicted class index (argmax over the two logits) per row.
    """
    # Work on a local Series: the preprocessing used to be written back into
    # the caller's frame as a 'sentences' column, silently changing shared
    # notebook state. astype(str) mirrors cnn_model and keeps NaN / numeric
    # cells from crashing contractions.fix.
    sentence_lists = (
        df['text']
        .astype(str)
        .str.lower()
        .apply(contractions.fix)
        .apply(lambda text: text.split('.'))
    )

    # Load on CPU first; the model is moved to the target device below.
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))

    tokenizer = RobertaTokenizer.from_pretrained(model_tokenizer)
    # The freshly initialised classification head (see the "newly initialized"
    # warning) is overwritten by the fine-tuned weights from the checkpoint.
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    predictions = []
    for sentences in sentence_lists:
        encoded_input = tokenizer(
            sentences,
            truncation=True,
            padding="max_length",
            max_length=max_len,
            return_tensors="pt",
        )
        encoded_input = {key: value.to(model.device) for key, value in encoded_input.items()}

        with torch.no_grad():
            outputs = model(**encoded_input)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()

        # NOTE(review): probs has one row per '.'-separated piece, but only the
        # first piece's prediction ([0]) is kept -- confirm this is intended
        # rather than aggregating over all pieces.
        predictions.append(probs.argmax(axis=-1)[0])
    return predictions

In [8]:
def gpt_model(df, model_and_tokenizer_path , max_len):
    """Predict a class index for every row of ``df`` with a fine-tuned GPT-2.

    Each text is lower-cased, has its contractions expanded with
    ``contractions.fix`` and is split on '.'; the resulting list of pieces is
    tokenised as one batch and run through the classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. The frame is not modified.
    model_and_tokenizer_path : str
        Directory (or name) from which both the
        ``GPT2ForSequenceClassification`` model and the ``GPT2Tokenizer``
        are loaded.
    max_len : int
        Every piece is padded / truncated to this many tokens.

    Returns
    -------
    list
        One predicted class index (argmax over the logits) per row.
    """
    # Work on a local Series: the preprocessing used to be written back into
    # the caller's frame as a 'sentences' column, silently changing shared
    # notebook state. astype(str) mirrors cnn_model and keeps NaN / numeric
    # cells from crashing contractions.fix.
    sentence_lists = (
        df['text']
        .astype(str)
        .str.lower()
        .apply(contractions.fix)
        .apply(lambda text: text.split('.'))
    )

    model = GPT2ForSequenceClassification.from_pretrained(model_and_tokenizer_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_and_tokenizer_path)
    model.eval()

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # The classification head reads the last non-padding position, which it can
    # only locate when the model config knows the pad id. Without it, batches
    # larger than one are rejected and single inputs are scored on a padding
    # position. Only fill it in when the saved config left it unset.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    prediction = []
    for sentences in sentence_lists:
        input_ids = tokenizer(
            sentences,
            truncation=True,
            padding="max_length",
            max_length=max_len,
            return_tensors="pt",
        )

        with torch.no_grad():
            outputs = model(**input_ids)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1).numpy()

        # NOTE(review): probs has one row per '.'-separated piece, but only the
        # first piece's prediction ([0]) is kept -- confirm this is intended
        # rather than aggregating over all pieces.
        prediction.append(probs.argmax(axis=-1)[0])

    return prediction

In [9]:
# Mode 1 reads the dataset from the CSV file on disk; mode 0 would instead
# read a single text and label from standard input.
df = read_data(1)

In [10]:
# Run the three classifiers on the same frame. Paths are assembled with
# os.path.join so they resolve on every OS; the previous hard-coded '\'
# separators only worked on Windows.
MODELS_DIR = 'models_and_tokenizers'

pred_cnn = cnn_model(
    df,
    model_path=os.path.join(MODELS_DIR, 'cnn', 'cnn_50k.keras'),
    model_tokenizer=os.path.join(MODELS_DIR, 'cnn', 'tokenizer_cnn.pickle'),
    max_len=1000,
)
pred_roberta = roberta_model(
    df,
    model_path=os.path.join(MODELS_DIR, 'roberta', 'checkpoint_epoch_3_roberta.pth'),
    model_tokenizer=os.path.join(MODELS_DIR, 'roberta'),
    max_len=512,
)
pred_gpt = gpt_model(
    df,
    model_and_tokenizer_path=os.path.join(MODELS_DIR, 'gpt_2'),
    max_len=512,
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 465ms/step


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Label encoding: Human = 0, LLM = 1

In [11]:
# CNN predictions: a 2-D array with one 0/1 entry per input row.
pred_cnn

array([[0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0]])

In [12]:
# RoBERTa predictions: a flat list with one class index per input row.
pred_roberta

[1, 0, 1, 0, 1, 0, 0]

In [13]:
# GPT-2 predictions: a flat list with one class index per input row.
pred_gpt

[1, 0, 0, 0, 1, 0, 0]

In [14]:
# Majority vote across the three models: for every row, collect the CNN,
# RoBERTa and GPT-2 labels and keep the value that occurs most often.
final_pred = [
    max(votes, key=votes.count)
    for votes in (
        [pred_cnn[row][0], pred_roberta[row], pred_gpt[row]]
        for row in range(len(pred_gpt))
    )
]

In [15]:
# Ground-truth labels from the loaded data, for comparison with final_pred.
df['label']

0    1
1    0
2    1
3    0
4    1
5    1
6    0
Name: label, dtype: int64

In [16]:
# Majority-vote label for each row.
final_pred

[1, 0, 1, 0, 1, 0, 0]

In [17]:
# Fraction of rows whose majority-vote label equals the ground-truth label.
accuracy_final = accuracy_score(df['label'].values, final_pred)

In [18]:
# Display the ensemble accuracy.
accuracy_final

0.8571428571428571