In [1]:
import matplotlib.pyplot as plt
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import string
from collections import Counter
from wordcloud import WordCloud

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
df_h = pd.read_csv('dataset/llm/train_essays.csv')
df_palm = pd.read_csv('dataset/llm/LLM_generated_essay_PaLM.csv')
df = pd.concat([df_h, df_palm], ignore_index=True)
df['generated'] = df['generated'].astype('int')
df['prompt_id'] = df['prompt_id'].astype('int')
df = df.sample(2762)
df.reset_index(drop=True, inplace=True)
df.drop('id', axis=1, inplace=True)

In [3]:
df['prompt_id'].value_counts()

0    1420
1    1342
Name: prompt_id, dtype: int64

In [4]:
df

Unnamed: 0,prompt_id,text,generated
0,0,Cars have always been used to get from point A...,0
1,0,The United States is a car-centric country. In...,1
2,0,As the global concern for the environment incr...,0
3,1,Voting time is here again and its time to cast...,0
4,0,"Cars are a very, very common mode of transport...",0
...,...,...,...
2757,0,Cars are starting to become more and more expe...,0
2758,0,The advantages of limiting car usage has erupt...,0
2759,1,"Dear Senator, I feel that we should change usi...",0
2760,1,"Dear Senator,\n\nI am writing to you today to ...",1


In [5]:
def tokenize_and_clean_text(text):
    tokens = tokenize.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    punct_chars = string.punctuation + "'s" + '""' + '...' + "''" + '``'
    filtered_tokens = [word.lower() for word in tokens if word not in stop_words and word not in punct_chars]
    return filtered_tokens

def lemmatize(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

### prompt_id 0 - домен авто/города (car-free cities)
### prompt_id 1 - домен политика США + письмо (write a letter to your senator about electoral college)

In [6]:
X = df['text']
y = df['generated']

In [7]:
corpus_cleaned_lemm = []
for text in X.apply(tokenize_and_clean_text).apply(lemmatize):
    corpus_cleaned_lemm.append(' '.join(text))

Load models and vectorizers

In [8]:
model_filename = 'model_log_vec.pkl'
with open(model_filename, 'rb') as model_file:
    loaded_model_vec = pickle.load(model_file)

In [9]:
model_filename = 'model_log_tfidf.pkl'
with open(model_filename, 'rb') as model_file:
    loaded_model_tfidf = pickle.load(model_file)

In [10]:
vec_filename = 'count_vectorizer_uni.pkl'
with open(vec_filename, 'rb') as vec_file:
    loaded_cnt_vec = pickle.load(vec_file)

In [11]:
vec_filename = 'tfidf_vectorizer_uni.pkl'
with open(vec_filename, 'rb') as vec_file:
    loaded_tfidf_vec = pickle.load(vec_file)

### LogReg Models based on CountVectorizer and TfIdfVectorizer respectively, trained on 30800 essays from ChatGPT-Human dataset

In [12]:
X_vec = loaded_cnt_vec.transform(corpus_cleaned_lemm)
pred_vec = loaded_model_vec.predict(X_vec)
print(print(classification_report(y, pred_vec)))

              precision    recall  f1-score   support

           0       0.53      1.00      0.70      1375
           1       0.99      0.14      0.24      1387

    accuracy                           0.57      2762
   macro avg       0.76      0.57      0.47      2762
weighted avg       0.77      0.57      0.47      2762

None


In [13]:
X_tfidf = loaded_tfidf_vec.transform(corpus_cleaned_lemm)
pred_tf = loaded_model_tfidf.predict(X_tfidf)
print(print(classification_report(y, pred_tf)))

              precision    recall  f1-score   support

           0       0.52      0.94      0.67      1375
           1       0.70      0.13      0.22      1387

    accuracy                           0.54      2762
   macro avg       0.61      0.54      0.44      2762
weighted avg       0.61      0.54      0.44      2762

None


### Own models

In [14]:
X = df['text']
y = df['generated']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [15]:
corpus_cleaned_lemm_train = []
corpus_cleaned_lemm_test = []
for text in X_train.apply(tokenize_and_clean_text).apply(lemmatize):
    corpus_cleaned_lemm_train.append(' '.join(text))
for text in X_test.apply(tokenize_and_clean_text).apply(lemmatize):
    corpus_cleaned_lemm_test.append(' '.join(text))

In [16]:
tfidf_vec = TfidfVectorizer(ngram_range=(1, 1), max_features=100)
tfidf_vec.fit(corpus_cleaned_lemm_train)
X_train_vec = tfidf_vec.transform(corpus_cleaned_lemm_train)
X_test_vec = tfidf_vec.transform(corpus_cleaned_lemm_test)

In [17]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

y_pred_train = model.predict(X_train_vec)
print(classification_report(y_train, y_pred_train))

y_pred_test = model.predict(X_test_vec)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1046
           1       1.00      1.00      1.00      1025

    accuracy                           1.00      2071
   macro avg       1.00      1.00      1.00      2071
weighted avg       1.00      1.00      1.00      2071

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       329
           1       1.00      1.00      1.00       362

    accuracy                           1.00       691
   macro avg       1.00      1.00      1.00       691
weighted avg       1.00      1.00      1.00       691



In [None]:
# model_filename = 'model.pkl'
# with open(model_filename, 'wb') as model_file:
#     pickle.dump(model, model_file)

In [None]:
# vec_filename = 'vectorizer.pkl'
# with open(vec_filename, 'wb') as vec_file:
#     pickle.dump(cnt_vec, vec_file)