In [8]:
import numpy as np
import pandas as pd
import praw
import time
from pmaw import PushshiftAPI
import openai
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from statistics import mean

In [3]:
df = pd.read_csv("combined_data_for_use.csv", index_col=0)

In [5]:
df.sample(n=20)

Unnamed: 0,question,ai_answer,human_answer
4402,Would you leave a high paying job because you ...,\n\nThere is no definitive answer to this ques...,"The truth is that a job is not just a job, it’..."
2354,What exactly are magnetic lines of flux?,\n\nA magnetic line of flux is a line of force...,Those lines you see are the result of several ...
200,"$74K salary as a degree-less 25-year-old, is i...","\n\nThere is no definitive answer, as the sala...",You finished 3 years of engineering? That’s th...
4239,Has Anyone Else Felt Like This?,\n\nI feel like this a lot.,I felt exactly the same way about 6 months ago...
2149,Does normal metabolic function eventually retu...,\n\nThere is no one answer to this question as...,Part of the problem is the adipose cells. When...
864,"After 8 months, I have finally landed a SWE work.","\n\nAfter 8 months of hard work, I have finall...",The amazing thing about this career is your fi...
518,Who has landed a position in 2023?,\n\nThe person who has landed a position in 20...,1.) 2.5yoe\n2.) 4 offers within 2 months. Firs...
841,Would you spend your own money to get a good h...,\n\nI do not know if I would spend my own mone...,[deleted]
963,Does this feedback mean I’m a bad developer?,\n\nIf you receive feedback that you are not a...,I've worked with a new hire who received a lot...
2183,Why does hot air cool?,\n\nHot air cools because it loses energy as i...,Thermal cameras aren't that great at accuratel...


## EDA and pre-processing

In [7]:
#how many questions are questions?
count_q = 0
for t in df["question"]:
    if "?" in t:
        count_q +=1
print(count_q)
print(count_q/len(df))

4877
0.8830345826543545


In [13]:
#how many words on average are in the responses (ai vs human)
count_words = [len(nltk.word_tokenize(p)) for p in df["ai_answer"]]
mean(count_words)

45.58917255114974

In [27]:
mean(df['ai_answer'].str.len())

234.3706319029513

In [28]:
mean(df['human_answer'].str.len())

673.4162592793772

In [14]:
count_words = [len(nltk.word_tokenize(p)) for p in df["human_answer"]]
mean(count_words)

130.36212203512585

In [29]:
# remove rows with 1 word responses: [deleted] , [removed]
# may want to research why these showed up as top comments

In [56]:
df_filtered_ = df[df["human_answer"] != "[deleted]"]
df_filtered_ = df[df["human_answer"] != "[removed]"]

In [57]:
df_filtered_["question"] = [t if "?" in t else 0 for t in df_filtered_["question"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_["question"] = [t if "?" in t else 0 for t in df_filtered_["question"]]


In [58]:
df_filtered_["is_human"] = [0 if i != "" else 1 for i in df_filtered_["human_answer"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_["is_human"] = [0 if i != "" else 1 for i in df_filtered_["human_answer"]]


In [59]:
df_filtered_["is_ai"] = [1 if i != "" else 0 for i in df_filtered_["ai_answer"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_["is_ai"] = [1 if i != "" else 0 for i in df_filtered_["ai_answer"]]


In [60]:
df_filtered_["question_dubs"] = df_filtered_["question"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_["question_dubs"] = df_filtered_["question"]


In [61]:
df_filtered_.head()

Unnamed: 0,question,ai_answer,human_answer,is_human,is_ai,question_dubs
0,Redditors who make +$100K and aren’t being kil...,\n\nSome Redditors who make +$100K and aren’t ...,If that’s important to you then not working at...,0,1,Redditors who make +$100K and aren’t being kil...
1,Quitting after 8 years without notice. Am I wr...,"\n\nYes, you are wrong. You are allowed to qui...",You are not wrong.\nLoyalty is a two-way stree...,0,1,Quitting after 8 years without notice. Am I wr...
2,My boss is pressuring to tell him where my new...,\n\nIf you are in a position of authority and ...,No. Do not say anything to him. Managers are n...,0,1,My boss is pressuring to tell him where my new...
3,"BS’ed my way into a 160K job offer, am I crazy...",\n\nThere is no definitive answer to this ques...,Literally every single person I've known who e...,0,1,"BS’ed my way into a 160K job offer, am I crazy..."
4,My boss confronted me about only working 7 hou...,\n\nIf your boss confronts you about working o...,Work an extra 15 minutes and tell him you didn...,0,1,My boss confronted me about only working 7 hou...


In [62]:
df_consolidated_ = df_filtered_[['question', 'human_answer', "is_human"]].append(df_filtered_[[
    'question_dubs','ai_answer',"is_ai"]].set_axis(['question', 'human_answer',"is_human"], axis=1)).reset_index(drop=True)

  df_consolidated_ = df_filtered_[['question', 'human_answer', "is_human"]].append(df_filtered_[[


In [66]:
df_consolidated_.rename(columns={"human_answer": "answer"}, inplace=True)

In [67]:
#remove phrases that would give away AI answer
df_consolidated_.loc[df_consolidated_["is_human"] == 1, "answer"] = df_consolidated_["answer"].str.replace("I am only a machine", "")
df_consolidated_.loc[df_consolidated_["is_human"] == 1, "answer"] = df_consolidated_["answer"].str.replace("As an AI language model", "")

In [68]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import nltk

 
nltk_stop = stopwords.words('english')

In [69]:
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

In [70]:
#split answers into tokens(words)
df_consolidated_splits = df_consolidated_["answer"].apply(nltk.word_tokenize)

In [71]:
#lemmatize and rejoin tokens for each answer
for i in range(len(df_consolidated_splits)):
    df_consolidated_splits[i] = ' '.join([lemmatizer.lemmatize(w) for w in df_consolidated_splits[i]])

In [72]:
df_consolidated_splits.head()

0    If that ’ s important to you then not working ...
1    You are not wrong . Loyalty is a two-way stree...
2    No . Do not say anything to him . Managers are...
3    Literally every single person I 've known who ...
4    Work an extra 15 minute and tell him you didn ...
Name: answer, dtype: object

In [74]:
df_consolidated_["answer"] = df_consolidated_splits

In [77]:
X = df_consolidated_["answer"]
y = df_consolidated_["is_human"]

In [78]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42)

In [79]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('bnb', BernoulliNB())
])

In [80]:
pipe_params = {
    'cvec__max_features' : [5000],
    'cvec__min_df' : [2,5],
    'cvec__max_df' : [0.9, 0.95],
    'cvec__ngram_range' : [(1,1), (1, 2), (2,2)],
    'cvec__stop_words' : ['english', nltk_stop],
}

In [81]:
gs = GridSearchCV(pipe, 
                  pipe_params, 
                  cv = 5)

In [82]:
gs.fit(X_train, y_train)

In [83]:
gs.best_score_

0.7910725356478612

In [84]:
print(gs.score(X_train,y_train))
print(gs.score(X_test,y_test))


0.8040917544947304
0.7958348828560803


In [85]:
pipe_params_tvec = {
    'tvec__max_features' : [5000],
    'tvec__min_df' : [2,5],
    'tvec__max_df' : [0.9, 0.95],
    'tvec__ngram_range' : [(1,1), (1, 2), (2,2)],
    'tvec__stop_words' : ['english', nltk_stop]
}

In [86]:
pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('bnb', BernoulliNB())
])

In [88]:
gs_tvec = GridSearchCV(pipe_tvec, 
                  pipe_params_tvec, 
                  cv = 5)

In [89]:
gs_tvec.fit(X_train, y_train)

In [90]:
gs_tvec.best_score_

0.7910725356478612

In [91]:
print(gs_tvec.score(X_train,y_train))
print(gs_tvec.score(X_test,y_test))

0.8040917544947304
0.7958348828560803
