In [36]:
import matplotlib.pyplot as plt
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, recall_score
import string
from collections import Counter
from wordcloud import WordCloud

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
pickl_h_train = pd.read_pickle('dataset/outfox/human/train_humans.pkl')
pickl_h_test = pd.read_pickle('dataset/outfox/human/test_humans.pkl')
pickl_h_valid = pd.read_pickle('dataset/outfox/human/valid_humans.pkl')
df_human_test = pd.DataFrame(pickl_h_test)
df_human_train = pd.DataFrame(pickl_h_train)
df_human_valid = pd.DataFrame(pickl_h_valid)
df_human = pd.concat([df_human_train, df_human_test, df_human_valid], ignore_index=True)
df_human.rename(columns={0: 'text'}, inplace=True)
df_human['generated'] = 0
df_human

Unnamed: 0,text,generated
0,Driverless cars have always been seen and thou...,0
1,The Electoral College is only taking your pers...,0
2,Distance learning recently started being consi...,0
3,Distractions while driving could lead to death...,0
4,Would having car free cities be easier for eve...,0
...,...,...
15395,"Should students have classes from home, base i...",0
15396,Driveress cars are in our future because of al...,0
15397,Are dreiverless cars a good idea for the futur...,0
15398,Sometimes school can be to much for a person. ...,0


In [3]:
pickl_flan_train = pd.read_pickle('dataset/outfox/flan_t5/train_lms_flan.pkl')
pickl_flan_test = pd.read_pickle('dataset/outfox/flan_t5/test_lms_flan.pkl')
pickl_flan_valid = pd.read_pickle('dataset/outfox/flan_t5/valid_lms_flan.pkl')
df_flan_train = pd.DataFrame(pickl_flan_train)
df_flan_test = pd.DataFrame(pickl_flan_test)
df_flan_valid = pd.DataFrame(pickl_flan_valid)
df_flan = pd.concat([df_flan_train, df_flan_test, df_flan_valid], ignore_index=True)
df_flan.rename(columns={0: 'text'}, inplace=True)
df_flan['generated'] = 1
df_flan

Unnamed: 0,text,generated
0,Driverless motoring seems to be coming all of ...,1
1,Several factors that contributed to its declin...,1
2,The most obvious benefit of distance learning ...,1
3,There are millions every quarter that decide o...,1
4,Many cities offer traffic bans during differen...,1
...,...,...
15395,It is hard for students to follow their class ...,1
15396,This kind of software and technology used in c...,1
15397,Driverless cars would drastically reduce the a...,1
15398,Online classes require the use of computer and...,1


In [4]:
df = pd.concat([df_human, df_flan], ignore_index=True)

In [5]:
df = df.sample(30800).reset_index(drop=True)
df.drop_duplicates()

Unnamed: 0,text,generated
0,The need for students to complete a set period...,1
1,It has been concluded that most extracurricula...,1
2,In the modern days so much technology is being...,0
3,Luke Bomberger was a seagoing cowboy. He cross...,0
4,We should be changing to the election by popul...,0
...,...,...
30795,The notion that cell phones should be used whi...,0
30796,Whether drivers should be allowed use their ph...,1
30797,The most frequent criticism of the Electoral C...,1
30798,Why I think that Luke should join the program....,0


In [6]:
def tokenize_and_clean_text(text):
    tokens = tokenize.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    punct_chars = string.punctuation + "'s" + '""' + '...' + "''" + '``'
    filtered_tokens = [word.lower() for word in tokens if word not in stop_words and word not in punct_chars]
    return filtered_tokens

def lemmatize(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [15]:
X = df['text']
y = df['generated']

In [16]:
corpus_cleaned_lemm = []
for text in X.apply(tokenize_and_clean_text).apply(lemmatize):
    corpus_cleaned_lemm.append(' '.join(text))

Load models and vectorizers

In [11]:
model_filename = 'model_log_vec.pkl'
with open(model_filename, 'rb') as model_file:
    loaded_model_vec = pickle.load(model_file)

In [12]:
model_filename = 'model_log_tfidf.pkl'
with open(model_filename, 'rb') as model_file:
    loaded_model_tfidf = pickle.load(model_file)

In [13]:
vec_filename = 'count_vectorizer_uni.pkl'
with open(vec_filename, 'rb') as vec_file:
    loaded_cnt_vec = pickle.load(vec_file)

In [14]:
vec_filename = 'tfidf_vectorizer_uni.pkl'
with open(vec_filename, 'rb') as vec_file:
    loaded_tfidf_vec = pickle.load(vec_file)

### LogReg Models based on CountVectorizer and TfIdfVectorizer respectively, trained on 30800 essays from ChatGPT-Human dataset

In [17]:
X_vec = loaded_cnt_vec.transform(corpus_cleaned_lemm)
pred_vec = loaded_model_vec.predict(X_vec)
print(print(classification_report(y, pred_vec)))

              precision    recall  f1-score   support

           0       0.61      1.00      0.76     15400
           1       1.00      0.36      0.53     15400

    accuracy                           0.68     30800
   macro avg       0.80      0.68      0.64     30800
weighted avg       0.80      0.68      0.64     30800

None


In [34]:
probs = loaded_model_vec.predict_proba(X_vec)
new_pred = np.where(probs[:, 1] > 0.01, 1, 0)
print(print(classification_report(y, new_pred)))

              precision    recall  f1-score   support

           0       0.74      0.98      0.84     15400
           1       0.97      0.65      0.78     15400

    accuracy                           0.82     30800
   macro avg       0.86      0.82      0.81     30800
weighted avg       0.86      0.82      0.81     30800

None


In [40]:
RecMax = -1
BestThr = -1
BestAcc = -1

for thr in np.arange(0, 1, 0.01):
    pred = np.where(probs[:, 1] > thr, 1, 0)
    rec = recall_score(y, pred)
    acc = accuracy_score(y, pred)
    if rec > RecMax and acc >= 0.6:
        RecMax = rec
        BestThr = thr
        BestAcc = accuracy_score(y, pred)
print(BestThr, RecMax, BestAcc)

0.01 0.6525324675324675 0.8176948051948052


In [22]:
X_tfidf = loaded_tfidf_vec.transform(corpus_cleaned_lemm)
pred_tf = loaded_model_tfidf.predict(X_tfidf)
print(print(classification_report(y, pred_tf)))

              precision    recall  f1-score   support

           0       0.61      0.97      0.75     15400
           1       0.92      0.38      0.54     15400

    accuracy                           0.67     30800
   macro avg       0.77      0.67      0.64     30800
weighted avg       0.77      0.67      0.64     30800

None


In [57]:
probs_2 = loaded_model_tfidf.predict_proba(X_tfidf)
new_pred_2 = np.where(probs_2[:, 1] > 0.15, 1, 0)
print(print(classification_report(y, new_pred_2)))

              precision    recall  f1-score   support

           0       0.70      0.85      0.77     15400
           1       0.81      0.64      0.72     15400

    accuracy                           0.75     30800
   macro avg       0.76      0.75      0.74     30800
weighted avg       0.76      0.75      0.74     30800

None


### Комментарий. Оптимальное соотношение recall и precision в виде максимума F1 достигается изменением порога классификации.

Own models

In [18]:
X = df['text']
y = df['generated']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [21]:
corpus_cleaned_lemm_train = []
corpus_cleaned_lemm_test = []
for text in X_train.apply(tokenize_and_clean_text).apply(lemmatize):
    corpus_cleaned_lemm_train.append(' '.join(text))
for text in X_test.apply(tokenize_and_clean_text).apply(lemmatize):
    corpus_cleaned_lemm_test.append(' '.join(text))

In [19]:
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_vectorizer.fit(X_train)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [20]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

y_pred_train = model.predict(X_train_tfidf)
print(classification_report(y_train, y_pred_train))
print()

y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91     10705
           1       0.91      0.91      0.91     10855

    accuracy                           0.91     21560
   macro avg       0.91      0.91      0.91     21560
weighted avg       0.91      0.91      0.91     21560


              precision    recall  f1-score   support

           0       0.92      0.89      0.91      4695
           1       0.89      0.92      0.91      4545

    accuracy                           0.91      9240
   macro avg       0.91      0.91      0.91      9240
weighted avg       0.91      0.91      0.91      9240



In [23]:
tfidf_vec_bi = TfidfVectorizer(ngram_range=(2, 2), max_features=100)
tfidf_vec_bi.fit(corpus_cleaned_lemm_train)
X_train_tfidf = tfidf_vec_bi.transform(corpus_cleaned_lemm_train)
X_test_tfidf = tfidf_vec_bi.transform(corpus_cleaned_lemm_test)

In [46]:
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(X_train_tfidf, y_train)

y_pred_train_2 = model_2.predict(X_train_tfidf)
print(classification_report(y_train, y_pred_train_2))

y_pred_test_2 = model_2.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test_2))

probs = model_2.predict_proba(X_test_tfidf)
pred_probs = np.where(probs[:, 1] > 0.45, 1, 0)
print(classification_report(y_test, pred_probs))

              precision    recall  f1-score   support

           0       0.76      0.75      0.75     10705
           1       0.75      0.76      0.76     10855

    accuracy                           0.75     21560
   macro avg       0.75      0.75      0.75     21560
weighted avg       0.75      0.75      0.75     21560

              precision    recall  f1-score   support

           0       0.77      0.74      0.75      4695
           1       0.74      0.77      0.75      4545

    accuracy                           0.75      9240
   macro avg       0.75      0.75      0.75      9240
weighted avg       0.75      0.75      0.75      9240

              precision    recall  f1-score   support

           0       0.80      0.68      0.74      4695
           1       0.71      0.83      0.77      4545

    accuracy                           0.75      9240
   macro avg       0.76      0.75      0.75      9240
weighted avg       0.76      0.75      0.75      9240

