In [1]:
import matplotlib.pyplot as plt
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import string
from collections import Counter
from wordcloud import WordCloud

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ruslanishakov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
def tokenize_and_clean_text(text):
    tokens = tokenize.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    punct_chars = string.punctuation + "'s" + '""' + '...' + "''" + '``'
    filtered_tokens = [word.lower() for word in tokens if word not in stop_words and word not in punct_chars]
    return filtered_tokens

def lemmatize(tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [3]:
pickl_h_train = pd.read_pickle('dataset/outfox/human/train_humans.pkl')
pickl_h_test = pd.read_pickle('dataset/outfox/human/test_humans.pkl')
pickl_h_valid = pd.read_pickle('dataset/outfox/human/valid_humans.pkl')
df_human_test = pd.DataFrame(pickl_h_test)
df_human_train = pd.DataFrame(pickl_h_train)
df_human_valid = pd.DataFrame(pickl_h_valid)
df_human = pd.concat([df_human_train, df_human_test, df_human_valid], ignore_index=True)
df_human.rename(columns={0: 'text'}, inplace=True)
df_human['generated'] = 0
df_human

Unnamed: 0,text,generated
0,Driverless cars have always been seen and thou...,0
1,The Electoral College is only taking your pers...,0
2,Distance learning recently started being consi...,0
3,Distractions while driving could lead to death...,0
4,Would having car free cities be easier for eve...,0
...,...,...
15395,"Should students have classes from home, base i...",0
15396,Driveress cars are in our future because of al...,0
15397,Are dreiverless cars a good idea for the futur...,0
15398,Sometimes school can be to much for a person. ...,0


In [4]:
pickl_gpt_train = pd.read_pickle('dataset/outfox/gpt/train_lms.pkl')
pickl_gpt_test = pd.read_pickle('dataset/outfox/gpt/test_lms.pkl')
pickl_gpt_valid = pd.read_pickle('dataset/outfox/gpt/valid_lms.pkl')
df_gpt_train = pd.DataFrame(pickl_gpt_train)
df_gpt_test = pd.DataFrame(pickl_gpt_test)
df_gpt_valid = pd.DataFrame(pickl_gpt_valid)
df_gpt = pd.concat([df_gpt_train, df_gpt_test, df_gpt_valid], ignore_index=True)
df_gpt.rename(columns={0: 'text'}, inplace=True)
df_gpt['generated'] = 1
df_gpt

Unnamed: 0,text,generated
0,The world is rapidly advancing towards advance...,1
1,The Electoral College is a unique system that ...,1
2,The COVID-19 pandemic has negatively affected ...,1
3,"Driving is inherently risky, and the chances o...",1
4,The issue of car-free cities has gained more a...,1
...,...,...
15395,The idea of students taking classes from home ...,1
15396,"Over the past few years, the technology for dr...",1
15397,With technological advancements paving the way...,1
15398,The educational landscape has undergone a tran...,1


In [5]:
df = pd.concat([df_human, df_gpt], ignore_index=True)

In [6]:
df = df.sample(30800).reset_index(drop=True)

In [7]:
df = df.drop_duplicates()

In [8]:
df

Unnamed: 0,text,generated
0,"For centuries, we have followed a certian syst...",0
1,Online schooling has grown in popularity in re...,1
2,Driverless cars should not become a reality. ...,0
3,Summer assignments have become a staple of mos...,1
4,"In ""The Challenge of exploring Venus,"" the aut...",0
...,...,...
30795,The images of a human-like Face on Mars have p...,1
30796,"Emotions may be hard to understand sometimes, ...",0
30797,Engagement in extracurricular activities offer...,1
30798,Almost every single person that dives a vehicl...,0


In [9]:
X = df['text']
y = df['generated']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
corpus_cleaned_lemm_train = []
corpus_cleaned_lemm_test = []
for text in X_train.apply(tokenize_and_clean_text).apply(lemmatize):
    corpus_cleaned_lemm_train.append(' '.join(text))
for text in X_test.apply(tokenize_and_clean_text).apply(lemmatize):
    corpus_cleaned_lemm_test.append(' '.join(text))

In [11]:
cnt_vec_uni = CountVectorizer(ngram_range=(1, 1))
cnt_vec_uni.fit(corpus_cleaned_lemm_train)
X_train_vec = cnt_vec_uni.transform(corpus_cleaned_lemm_train)
X_test_vec = cnt_vec_uni.transform(corpus_cleaned_lemm_test)

In [13]:
cnt_tfidf_uni = TfidfVectorizer(ngram_range=(1, 1), max_features=100)
cnt_tfidf_uni.fit(corpus_cleaned_lemm_train)
X_train_tf = cnt_tfidf_uni.transform(corpus_cleaned_lemm_train)
X_test_tf = cnt_tfidf_uni.transform(corpus_cleaned_lemm_test)

In [12]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

y_pred_train = model.predict(X_train_vec)
print(classification_report(y_train, y_pred_train))

y_pred_test = model.predict(X_test_vec)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12218
           1       1.00      1.00      1.00     12354

    accuracy                           1.00     24572
   macro avg       1.00      1.00      1.00     24572
weighted avg       1.00      1.00      1.00     24572

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3097
           1       1.00      1.00      1.00      3046

    accuracy                           1.00      6143
   macro avg       1.00      1.00      1.00      6143
weighted avg       1.00      1.00      1.00      6143



In [15]:
model_2 = LogisticRegression(max_iter=1000)
model_2.fit(X_train_tf, y_train)

y_pred_train_2 = model_2.predict(X_train_tf)
print(classification_report(y_train, y_pred_train))

y_pred_test_2 = model_2.predict(X_test_tf)
print(classification_report(y_test, y_pred_test_2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12218
           1       1.00      1.00      1.00     12354

    accuracy                           1.00     24572
   macro avg       1.00      1.00      1.00     24572
weighted avg       1.00      1.00      1.00     24572

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3097
           1       0.97      0.97      0.97      3046

    accuracy                           0.97      6143
   macro avg       0.97      0.97      0.97      6143
weighted avg       0.97      0.97      0.97      6143



### Bi- Trigrams

In [16]:
cnt_vec_bi = CountVectorizer(ngram_range=(2, 3))
cnt_vec_bi.fit(corpus_cleaned_lemm_train)
X_train_vec_2 = cnt_vec_bi.transform(corpus_cleaned_lemm_train)
X_test_vec_2 = cnt_vec_bi.transform(corpus_cleaned_lemm_test)

In [19]:
cnt_tfidf_bi = TfidfVectorizer(ngram_range=(2, 3), max_features=100)
cnt_tfidf_bi.fit(corpus_cleaned_lemm_train)
X_train_tf_2 = cnt_tfidf_bi.transform(corpus_cleaned_lemm_train)
X_test_tf_2 = cnt_tfidf_bi.transform(corpus_cleaned_lemm_test)

In [18]:
model_3 = LogisticRegression(max_iter=1000)
model_3.fit(X_train_vec_2, y_train)

y_pred_train_3 = model_3.predict(X_train_vec_2)
print(classification_report(y_train, y_pred_train_3))

y_pred_test_3 = model_3.predict(X_test_vec_2)
print(classification_report(y_test, y_pred_test_3))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12218
           1       1.00      1.00      1.00     12354

    accuracy                           1.00     24572
   macro avg       1.00      1.00      1.00     24572
weighted avg       1.00      1.00      1.00     24572

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      3097
           1       0.99      1.00      1.00      3046

    accuracy                           1.00      6143
   macro avg       1.00      1.00      1.00      6143
weighted avg       1.00      1.00      1.00      6143



In [21]:
model_4 = LogisticRegression(max_iter=1000)
model_4.fit(X_train_tf_2, y_train)

y_pred_train_4 = model_4.predict(X_train_tf_2)
print(classification_report(y_train, y_pred_train_4))

y_pred_test_4 = model_4.predict(X_test_tf_2)
print(classification_report(y_test, y_pred_test_4))

              precision    recall  f1-score   support

           0       0.85      0.78      0.81     12218
           1       0.80      0.86      0.83     12354

    accuracy                           0.82     24572
   macro avg       0.82      0.82      0.82     24572
weighted avg       0.82      0.82      0.82     24572

              precision    recall  f1-score   support

           0       0.84      0.76      0.80      3097
           1       0.78      0.86      0.82      3046

    accuracy                           0.81      6143
   macro avg       0.81      0.81      0.81      6143
weighted avg       0.81      0.81      0.81      6143



Pickle dump

In [22]:
model_filename = 'model_log_vec.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [23]:
model_filename = 'model_log_tfidf.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model_2, model_file)

In [24]:
vec_filename = 'count_vectorizer_uni.pkl'
with open(vec_filename, 'wb') as vec_file:
    pickle.dump(cnt_vec_uni, vec_file)

In [25]:
vec_filename = 'tfidf_vectorizer_uni.pkl'
with open(vec_filename, 'wb') as vec_file:
    pickle.dump(cnt_tfidf_uni, vec_file)

In [None]:
# model_filename = 'model.pkl'
# with open(model_filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)

In [None]:
# vec_filename = 'vectorizer.pkl'
# with open(vec_filename, 'rb') as vec_file:
#     loaded_cnt_vec = pickle.load(vec_file)