In [1]:
import pickle
import string
from collections import Counter
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

# Fetch the NLTK resources into the local nltk_data directory.
# 'punkt', 'stopwords' and 'wordnet' back word_tokenize, stopwords.words and
# WordNetLemmatizer, which the cleaning helpers in this notebook call.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): no POS tagger is used in the visible cells — confirm this
# download is still needed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Human-written essays: read the three pickled splits from disk.
pickl_h_train = pd.read_pickle('dataset/outfox/human/train_humans.pkl')
pickl_h_test = pd.read_pickle('dataset/outfox/human/test_humans.pkl')
pickl_h_valid = pd.read_pickle('dataset/outfox/human/valid_humans.pkl')

# Wrap each split in its own DataFrame.
df_human_train = pd.DataFrame(pickl_h_train)
df_human_test = pd.DataFrame(pickl_h_test)
df_human_valid = pd.DataFrame(pickl_h_valid)

# Stack train, test and valid (in that order), rename column 0 to 'text'
# and label every row with generated=0.
df_human = (
    pd.concat([df_human_train, df_human_test, df_human_valid], ignore_index=True)
    .rename(columns={0: 'text'})
    .assign(generated=0)
)
df_human

Unnamed: 0,text,generated
0,Driverless cars have always been seen and thou...,0
1,The Electoral College is only taking your pers...,0
2,Distance learning recently started being consi...,0
3,Distractions while driving could lead to death...,0
4,Would having car free cities be easier for eve...,0
...,...,...
15395,"Should students have classes from home, base i...",0
15396,Driveress cars are in our future because of al...,0
15397,Are dreiverless cars a good idea for the futur...,0
15398,Sometimes school can be to much for a person. ...,0


In [3]:
# davinci-generated essays: read the three pickled splits from disk.
pickl_davinci_train = pd.read_pickle('dataset/outfox/davinci/train_lms_davinci.pkl')
pickl_davinci_test = pd.read_pickle('dataset/outfox/davinci/test_lms_davinci.pkl')
pickl_davinci_valid = pd.read_pickle('dataset/outfox/davinci/valid_lms_davinci.pkl')

# Wrap each split in its own DataFrame.
df_davinci_train = pd.DataFrame(pickl_davinci_train)
df_davinci_test = pd.DataFrame(pickl_davinci_test)
df_davinci_valid = pd.DataFrame(pickl_davinci_valid)

# Stack train, test and valid (in that order), rename column 0 to 'text'
# and label every row with generated=1.
df_davinci = (
    pd.concat([df_davinci_train, df_davinci_test, df_davinci_valid], ignore_index=True)
    .rename(columns={0: 'text'})
    .assign(generated=1)
)
df_davinci

Unnamed: 0,text,generated
0,\nDriverless cars have been heralded as a grou...,1
1,The Electoral College system is controversial ...,1
2,The global coronavirus pandemic has had signi...,1
3,Texting and driving is distracting and dangero...,1
4,The growth of urbanism and improved transporta...,1
...,...,...
15395,"In the digital age, the age-old debate of whet...",1
15396,Driverless cars could dramatically reduce the ...,1
15397,Driverless cars could revolutionize contempora...,1
15398,Going to school can be a bit of a bummer. Writ...,1


In [4]:
# Stack the human (generated=0) and davinci (generated=1) frames into one
# labelled dataset with a fresh 0..n-1 index.
df = pd.concat([df_human, df_davinci], ignore_index=True)

In [5]:
# Remove exact duplicate (text, generated) rows, then shuffle every row.
#  - drop_duplicates() returns a new frame; the result has to be kept,
#    otherwise the duplicates stay in `df`.
#  - frac=1 shuffles whatever number of rows is present instead of relying on
#    a hard-coded row count (sample(n) raises when n exceeds the frame size).
#  - random_state pins the shuffle so the seeded train/test split further
#    down yields the same partition on every Restart & Run All.
df = (
    df.drop_duplicates()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df

Unnamed: 0,text,generated
0,It is a better choice asking more than one per...,0
1,\nMeaningful dialogue between educators and st...,1
2,The importance of the Electoral College in the...,1
3,\nParticipating in the Seagoing Cowboys progra...,1
4,I overheard that you are thinking on should we...,0
...,...,...
30795,"\nIn recent years, mandating students to take ...",1
30796,Choices are the essence of experiencing life t...,1
30797,Making decisions can be one of the most diffic...,1
30798,Many Americans enjoy technologies advancement ...,0


In [6]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


# Deliberately a single str, not a set: the membership test below is a
# substring test, so a token is dropped when it occurs anywhere inside this
# string (every single punctuation character, "'s", "...", "''", "``", ...).
_PUNCT_CHARS = string.punctuation + "'s" + '""' + '...' + "''" + '``'


def tokenize_and_clean_text(text):
    """Tokenize ``text`` and return its lower-cased tokens without stop words
    and punctuation.

    Parameters
    ----------
    text : str
        Raw essay text, passed to ``nltk.tokenize.word_tokenize``.

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order.

    Notes
    -----
    The stop-word set used to be rebuilt on every call; it is now cached, which
    matters because this function is applied to every essay.

    The filter itself is unchanged. In particular the stop-word test runs on
    the token *before* lower-casing, so a token whose casing differs from the
    entry in the stop-word list (e.g. a sentence-initial "The") is kept.
    NOTE(review): the pickled vectorizers/models loaded later in this notebook
    were presumably fitted on text cleaned exactly this way — confirm before
    tightening the filter, otherwise train/inference preprocessing diverges.
    """
    stop_words = _english_stop_words()
    return [
        word.lower()
        for word in tokenize.word_tokenize(text)
        if word not in stop_words and word not in _PUNCT_CHARS
    ]

def lemmatize(tokens):
    """Return a new list with every token run through WordNetLemmatizer.

    A fresh lemmatizer is created per call and each token is lemmatized with
    the lemmatizer's default settings; order and length of ``tokens`` are kept.
    """
    to_lemma = nltk.stem.WordNetLemmatizer().lemmatize
    return list(map(to_lemma, tokens))

In [7]:
# Features are the raw essay texts, targets the 0/1 'generated' column.
X, y = df['text'], df['generated']

In [8]:
# Clean and lemmatize every essay, then glue its tokens back into one
# space-separated string so the vectorizers can consume it.
corpus_cleaned_lemm = [
    ' '.join(lemmatize(tokenize_and_clean_text(essay))) for essay in X
]

Load models and vectorizers

In [9]:
# Load the pickled model that is later paired with the count vectorizer.
# NOTE(review): presumably a fitted estimator saved by a separate training
# notebook — confirm its provenance.
# pickle.load can execute arbitrary code while unpickling, so only load files
# from a trusted source.
model_filename = 'model_log_vec.pkl'
with open(model_filename, 'rb') as model_file:
    loaded_model_vec = pickle.load(model_file)

In [10]:
# Load the pickled model that is later paired with the TF-IDF vectorizer.
# NOTE(review): presumably a fitted estimator saved by a separate training
# notebook — confirm its provenance.
# pickle.load can execute arbitrary code while unpickling, so only load files
# from a trusted source.
model_filename = 'model_log_tfidf.pkl'
with open(model_filename, 'rb') as model_file:
    loaded_model_tfidf = pickle.load(model_file)

In [11]:
# Load the pickled count vectorizer; it is only used via .transform() below,
# so it is expected to be already fitted.
# pickle.load can execute arbitrary code while unpickling, so only load files
# from a trusted source.
vec_filename = 'count_vectorizer_uni.pkl'
with open(vec_filename, 'rb') as vec_file:
    loaded_cnt_vec = pickle.load(vec_file)

In [12]:
# Load the pickled TF-IDF vectorizer; it is only used via .transform() below,
# so it is expected to be already fitted.
# pickle.load can execute arbitrary code while unpickling, so only load files
# from a trusted source.
vec_filename = 'tfidf_vectorizer_uni.pkl'
with open(vec_filename, 'rb') as vec_file:
    loaded_tfidf_vec = pickle.load(vec_file)

### Logistic regression models based on CountVectorizer and TfidfVectorizer, respectively, trained on 30,800 essays from the ChatGPT-Human dataset

In [13]:
# Vectorize the cleaned essays with the loaded count vectorizer and score the
# loaded count-based model against the true labels.
X_vec = loaded_cnt_vec.transform(corpus_cleaned_lemm)
pred_vec = loaded_model_vec.predict(X_vec)
# classification_report returns the report as a string; printing it once
# avoids the stray "None" that a nested print(print(...)) emits.
print(classification_report(y, pred_vec))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     15400
           1       1.00      0.99      1.00     15400

    accuracy                           1.00     30800
   macro avg       1.00      1.00      1.00     30800
weighted avg       1.00      1.00      1.00     30800

None


In [14]:
# Re-score the count-based model with a custom decision threshold: an essay is
# labelled 1 when the probability in column 1 exceeds 0.4.
# NOTE(review): assumes column 1 of predict_proba is the 'generated' class
# (i.e. classes_ == [0, 1]) — confirm on the loaded model.
probs = loaded_model_vec.predict_proba(X_vec)
new_pred = np.where(probs[:, 1] > 0.4, 1, 0)
# classification_report returns the report as a string; printing it once
# avoids the stray "None" that a nested print(print(...)) emits.
print(classification_report(y, new_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15400
           1       1.00      1.00      1.00     15400

    accuracy                           1.00     30800
   macro avg       1.00      1.00      1.00     30800
weighted avg       1.00      1.00      1.00     30800

None


In [15]:
# Vectorize the cleaned essays with the loaded TF-IDF vectorizer and score the
# loaded TF-IDF-based model against the true labels.
X_tfidf = loaded_tfidf_vec.transform(corpus_cleaned_lemm)
pred_tf = loaded_model_tfidf.predict(X_tfidf)
# classification_report returns the report as a string; printing it once
# avoids the stray "None" that a nested print(print(...)) emits.
print(classification_report(y, pred_tf))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     15400
           1       0.97      0.93      0.95     15400

    accuracy                           0.95     30800
   macro avg       0.95      0.95      0.95     30800
weighted avg       0.95      0.95      0.95     30800

None


In [20]:
# Re-score the TF-IDF-based model from its probabilities: an essay is labelled
# 1 when the probability in column 1 exceeds 0.5.
# NOTE(review): assumes column 1 of predict_proba is the 'generated' class
# (i.e. classes_ == [0, 1]) — confirm on the loaded model.
probs_2 = loaded_model_tfidf.predict_proba(X_tfidf)
new_pred_2 = np.where(probs_2[:, 1] > 0.5, 1, 0)
# classification_report returns the report as a string; printing it once
# avoids the stray "None" that a nested print(print(...)) emits.
print(classification_report(y, new_pred_2))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     15400
           1       0.97      0.93      0.95     15400

    accuracy                           0.95     30800
   macro avg       0.95      0.95      0.95     30800
weighted avg       0.95      0.95      0.95     30800

None


Own models: TF-IDF features with logistic regression, trained and evaluated on a 70/30 split of this dataset

In [21]:
# Raw essay texts and their 0/1 labels, split 70/30 with a fixed seed.
X, y = df['text'], df['generated']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

In [22]:
# Clean + lemmatize each split separately; every essay becomes one
# space-separated string of its surviving tokens.
corpus_cleaned_lemm_train = [
    ' '.join(lemmatize(tokenize_and_clean_text(essay))) for essay in X_train
]
corpus_cleaned_lemm_test = [
    ' '.join(lemmatize(tokenize_and_clean_text(essay))) for essay in X_test
]

In [23]:
# TF-IDF features capped at 100 terms: learn the vocabulary on the training
# texts only, then project the test texts onto it.
# NOTE(review): this is fitted on the raw X_train / X_test texts, not on
# corpus_cleaned_lemm_train / corpus_cleaned_lemm_test — confirm that the raw
# text is intended here.
tfidf_vectorizer = TfidfVectorizer(max_features=100)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [24]:
# Fit logistic regression on the training TF-IDF matrix.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict on both splits first, then report them back to back.
y_pred_train = model.predict(X_train_tfidf)
y_pred = model.predict(X_test_tfidf)

# Train report followed by one empty line, then the test report.
print(classification_report(y_train, y_pred_train), end='\n\n')
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98     10817
           1       0.98      0.99      0.98     10743

    accuracy                           0.98     21560
   macro avg       0.98      0.98      0.98     21560
weighted avg       0.98      0.98      0.98     21560


              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4583
           1       0.98      0.98      0.98      4657

    accuracy                           0.98      9240
   macro avg       0.98      0.98      0.98      9240
weighted avg       0.98      0.98      0.98      9240



In [25]:
# Bigram-only TF-IDF features (100 terms max) built from the cleaned,
# lemmatized corpora: vocabulary learned on train, applied to test.
# Note: this rebinds X_train_tfidf / X_test_tfidf.
tfidf_vec_bi = TfidfVectorizer(ngram_range=(2, 2), max_features=100)
X_train_tfidf = tfidf_vec_bi.fit_transform(corpus_cleaned_lemm_train)
X_test_tfidf = tfidf_vec_bi.transform(corpus_cleaned_lemm_test)

In [28]:
# Fit a second logistic regression on the current TF-IDF matrices.
model_2 = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Default-threshold predictions for both splits.
y_pred_train_2 = model_2.predict(X_train_tfidf)
y_pred_test_2 = model_2.predict(X_test_tfidf)

# Test-set predictions with a custom cut-off: label 1 when the probability in
# column 1 exceeds 0.4.
probs = model_2.predict_proba(X_test_tfidf)
pred_probs = (probs[:, 1] > 0.4).astype(int)

# Reports in order: train, test, test with the 0.4 cut-off.
print(classification_report(y_train, y_pred_train_2))
print(classification_report(y_test, y_pred_test_2))
print(classification_report(y_test, pred_probs))

              precision    recall  f1-score   support

           0       0.85      0.78      0.82     10817
           1       0.80      0.86      0.83     10743

    accuracy                           0.82     21560
   macro avg       0.82      0.82      0.82     21560
weighted avg       0.82      0.82      0.82     21560

              precision    recall  f1-score   support

           0       0.85      0.77      0.81      4583
           1       0.79      0.87      0.83      4657

    accuracy                           0.82      9240
   macro avg       0.82      0.82      0.82      9240
weighted avg       0.82      0.82      0.82      9240

              precision    recall  f1-score   support

           0       0.89      0.70      0.78      4583
           1       0.75      0.91      0.83      4657

    accuracy                           0.81      9240
   macro avg       0.82      0.81      0.80      9240
weighted avg       0.82      0.81      0.80      9240

