In [34]:
import pandas as pd
import functools
import keyword

In [37]:
MIN_WORD_COUNT = 'Min Word Count'
MAX_WORD_COUNT = 'Max Word Count'


def _load_essays(csv_path, min_words, max_words):
    """Read one essay CSV and tag every row with its word-count bracket.

    # Arguments
        csv_path: str, path of the CSV file to read.
        min_words: int, value stored in the MIN_WORD_COUNT column.
        max_words: int, value stored in the MAX_WORD_COUNT column.

    # Returns
        DataFrame without the 'Unnamed: 4' column and with the two
        word-count columns added.
    """
    essays = pd.read_csv(csv_path).drop('Unnamed: 4', axis=1)
    essays[MIN_WORD_COUNT] = min_words
    essays[MAX_WORD_COUNT] = max_words
    return essays


df200 = _load_essays('resources/essays200.csv', 150, 200)
df500 = _load_essays('resources/essays500.csv', 500, 600)
df1000 = _load_essays('resources/essays1000.csv', 800, 1200)

df = pd.concat([df200, df500, df1000])

# Normalise column names into lowercase identifier-like strings.
# regex=False is explicit on purpose: '(' and ')' are regex metacharacters, and
# the default of `regex` differs between pandas versions, so relying on it
# either warns or misbehaves depending on the installed version.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('-', '_', regex=False)
    # Prefix names that collide with Python keywords so attribute access works.
    .map(lambda name: 'x' + name if name in keyword.kwlist else name)
)
df.head(5)

  .str.replace('(', '') \
  .str.replace(')', '') \


Unnamed: 0,essay_title,human_written,ai_written,type,min_word_count,max_word_count
0,Pneumonia: Differential Diagnosis and Primary ...,Penetration of pathogens of pneumonia in the r...,Pneumonia is a common respiratory infection th...,Expository,150,200
1,Relevance and Significance of Communication Te...,The relevance and significance of communicatio...,Communication technology has become an integra...,Persuasive,150,200
2,Technological Objects and Their Capabilities,An innovative home system is one of the unique...,Technological objects have become an integral ...,Expository,150,200
3,Philosophy Teaching and Learning Motivation,Teaching and learning philosophy can be a chal...,Teaching and learning philosophy can be a chal...,Expository,150,200
4,Buddhism and Hinduism: Religious Differences,Buddhism and Hinduism have the same roots. Nev...,Buddhism and Hinduism are two major religions ...,Compare & Contrast,150,200


In [38]:
# Go from one row per title (human and AI text side by side) to one row per
# essay: 'source' records which column the text came from, 'essay' holds the text.
melted_df = df.melt(
    id_vars=['essay_title', 'type', 'min_word_count'],
    value_vars=['human_written', 'ai_written'],
    var_name='source',
    value_name='essay',
)

def convert_label(label):
    """Return 1 when `label` equals 'human_written', otherwise 0."""
    if label == 'human_written':
        return 1
    return 0

# Numeric target derived from the 'source' column.
melted_df['labels'] = melted_df['source'].map(convert_label)

# Order by source then title. reset_index() (without drop) keeps the previous
# index as an 'index' column.
melted_df = melted_df.sort_values(by=['source', 'essay_title'])
melted_df = melted_df.reset_index()
melted_df.shape

(150, 7)

In [42]:
# Fixed seed so the shuffled row order is the same after Restart & Run All.
SHUFFLE_SEED = 42

# Positional split of melted_df: rows 0-59 and 75-134 go to training,
# rows 60-74 and 135 onward go to testing.
train_df = pd.concat([melted_df[:60], melted_df[75:135]])
test_df = pd.concat([melted_df[60:75], melted_df[135:]])

# Shuffle each split, then discard both index columns that reset_index()
# leaves behind ('index' already existed, so the new one is named 'level_0').
shuffled_train_df = (
    train_df
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index()
    .drop(['index', 'level_0'], axis=1)
)
shuffled_test_df = (
    test_df
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index()
    .drop(['index', 'level_0'], axis=1)
)
shuffled_train_df.head(5)

Unnamed: 0,essay_title,type,min_word_count,source,essay,labels
0,Black Americans in the Revolutionary Era,Expository,800,human_written,The American Revolution is typically depicted ...,1
1,Legalization of Marijuana in the United States,Argumentative,150,ai_written,Legalization of marijuana in the United States...,0
2,Inclusive Education’ Benefits,Argumentative,800,ai_written,Inclusive education is a type of education sys...,0
3,Commercial Uses of Data Mining,Expository,800,human_written,Data mining is a computer-based classification...,1
4,Mathematics – Concept of Multiplication,Expository,800,human_written,"In understanding multiplication, it is essenti...",1


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# (min_n, max_n) n-gram sizes, passed to TfidfVectorizer as `ngram_range`.
NGRAM_RANGE = (2, 7)

# Upper bound on the number of features SelectKBest keeps in ngram_vectorize.
TOP_K = 15000

# Passed to TfidfVectorizer as `analyzer`.
TOKEN_MODE = 'word'

# Passed to TfidfVectorizer as `min_df`.
MIN_DOCUMENT_FREQUENCY = 7

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as tf-idf weighted n-gram vectors.

    1 text = 1 tf-idf vector over the n-grams whose sizes are given by
    NGRAM_RANGE, reduced to at most TOP_K features by an ANOVA F-test.
    The vocabulary and the feature selector are fitted on the training
    texts only; validation texts are only transformed.

    # Arguments
        train_texts: iterable of str, training text strings.
        train_labels: array-like, training labels (one per training text).
        val_texts: iterable of str, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts as
        float32 matrices with the same columns.
    """
    # Imported locally so this function does not rely on an import made in
    # some other cell.
    import numpy as np

    kwargs = {
            'ngram_range': NGRAM_RANGE,
            # TfidfVectorizer only supports float dtypes; an integer dtype
            # triggers a UserWarning and is converted to float64 anyway, so
            # ask for float64 directly (same values, no warning).
            'dtype': np.float64,
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,
            'stop_words': 'english',
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts with the vocabulary learned above.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features; k is capped at the number of
    # columns actually produced so SelectKBest never asks for more than exist.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val



In [80]:
x_train, x_val = ngram_vectorize(
    shuffled_train_df['essay'],
    shuffled_train_df['labels'],
    shuffled_test_df['essay'],
)

vectorized_train_df = pd.DataFrame(x_train.toarray())
vectorized_val_df = pd.DataFrame(x_val.toarray())

# Stack training rows first, then validation rows. Each part keeps its own
# 0-based row labels, the same layout the next cell's row-wise concat of the
# shuffled frames produces.
vectorized_df = pd.concat([vectorized_train_df, vectorized_val_df])

# Name every column feature_<position> in one assignment instead of renaming
# the columns one at a time in place.
vectorized_df.columns = [f"feature_{i}" for i in range(vectorized_df.shape[1])]
vectorized_df



Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,1.0
2,0.0,0.5,0.000000,0.0,0.5,0.0,0.0,0.0,0.5,0.000000,0.0,0.0,0.000000,0.5,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
26,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0
27,0.0,0.0,0.054639,0.0,0.0,0.0,0.0,0.0,0.0,0.671668,0.0,0.0,0.738835,0.0,0.0
28,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0


In [81]:
# drop all columns with all zeros
vectorized_df = vectorized_df.loc[:, (vectorized_df != 0).any(axis=0)]

# Both frames are ordered "train rows, then test rows", so they correspond
# row-for-row by position. Give both a fresh 0..n-1 index before joining:
# a column-wise concat on repeated index labels only works while the two
# indexes happen to be identical and raises as soon as they differ.
shuffled_df = pd.concat([shuffled_train_df, shuffled_test_df], ignore_index=True)
feature_values_df = vectorized_df.reset_index(drop=True)
assert len(shuffled_df) == len(feature_values_df), "metadata and feature rows must match 1:1"

finalized_features_df = pd.concat([shuffled_df, feature_values_df], axis=1)

finalized_features_df.head(5)

Unnamed: 0,essay_title,type,min_word_count,source,essay,labels,feature_0,feature_1,feature_2,feature_3,...,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14
0,Black Americans in the Revolutionary Era,Expository,800,human_written,The American Revolution is typically depicted ...,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Legalization of Marijuana in the United States,Argumentative,150,ai_written,Legalization of marijuana in the United States...,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Inclusive Education’ Benefits,Argumentative,800,ai_written,Inclusive education is a type of education sys...,0,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0
3,Commercial Uses of Data Mining,Expository,800,human_written,Data mining is a computer-based classification...,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Mathematics – Concept of Multiplication,Expository,800,human_written,"In understanding multiplication, it is essenti...",1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
# Order rows by source, then by title.
finalized_features_df = finalized_features_df.sort_values(by=['source', 'essay_title'])
# reset_index() moves the old row labels into an 'index' column; that column
# and the numeric 'labels' column are then removed.
finalized_features_df = finalized_features_df.reset_index()
finalized_features_df = finalized_features_df.drop(columns=['index', 'labels'])
finalized_features_df.head(5)

Unnamed: 0,essay_title,type,min_word_count,source,essay,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14
0,Adaptive Immunity: T-Cells and B-Cells,Argumentative,150,ai_written,Adaptive immunity is a type of immune response...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Advertising Ethics in the Modern Consumer Soci...,Argumentative,800,ai_written,Advertising is an essential aspect of modern c...,0.0,0.0,0.26975,0.321088,0.331598,0.303183,0.321088,0.0,0.0,0.663195,0.281665,0.0,0.0,0.0,0.0
2,American Health Care System,Expository,800,ai_written,The American health care system is a complex a...,0.0,0.0,0.0,0.0,0.0,0.25634,0.0,0.0,0.0,0.0,0.476292,0.0,0.0,0.0,0.841092
3,Arranged Marriage and Its Ethical Dilemma,Argumentative,800,ai_written,Arranged marriage has been a longstanding prac...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Assistive Technology for Kids with Learning Di...,Expository,800,ai_written,Children with learning disabilities often face...,0.0,0.0,0.631054,0.0,0.775739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Naive Bayes Classifier

In [94]:
class NaiveBayes:
    """Naive Bayes over the columns of a DataFrame, using exact-match counts.

    Every column of `df` other than `class_col` is treated as a feature, and
    a feature value only "matches" rows whose value is equal to it (`==`).
    No smoothing is applied, so a value never seen with a class contributes
    a probability of 0 for that class.

    NOTE(review): exact equality on continuous columns (e.g. tf-idf weights)
    will rarely match any row other than the one being scored - confirm this
    is intended.

    The cached methods use functools.lru_cache, which keys on `self` and the
    arguments; results are not refreshed if `self.df` is modified later.
    """

    def __init__(self, df, class_col):
        """Store the data frame and the name of the column holding the class."""
        self.df = df
        self.class_col = class_col

    # ex: cond_prob('type', 'Expository', 'ai_written')
    #     => P(type == 'Expository' | source == 'ai_written')
    @functools.lru_cache(maxsize=2048)
    def cond_prob(self, feature_col, feature_value, class_value):
        """Return P(feature_col == feature_value | class_col == class_value).

        Returns 0.0 when no row has class `class_value` (the ratio would
        otherwise be a division by zero).
        """
        in_class = self.df[self.class_col] == class_value
        class_size = int(in_class.sum())
        if class_size == 0:
            return 0.0
        matches = self.df[feature_col] == feature_value
        return int((matches & in_class).sum()) / class_size

    # P(all feature values of the row | class_val)
    @functools.lru_cache(maxsize=2048)
    def conditional_term(self, essay, class_val):
        """Return the product of cond_prob over every non-class column.

        `essay` is a row object exposing each column of `self.df` as an
        attribute (e.g. an item from DataFrame.itertuples()); it must be
        hashable because the result is cached.
        """
        conditional_prob = 1
        for column in self.df.columns:
            if column != self.class_col:
                col_val = getattr(essay, column)
                conditional_prob *= self.cond_prob(column, col_val, class_val)
        return conditional_prob

    # P(class_value) alone
    @functools.lru_cache(maxsize=2048)
    def prior(self, class_value):
        """Return the fraction of rows whose class is `class_value`.

        Returns 0.0 for an empty data frame.
        """
        total = len(self.df)
        if total == 0:
            return 0.0
        return len(self.df[self.df[self.class_col] == class_value]) / total

    # essay is a row obtained from itertuples
    # return P(ai_written | all feature values of the row)
    def prob_ai(self, essay):
        """Return the posterior probability that `essay` is 'ai_written'.

        When both class likelihoods are 0 (the row matches neither class),
        there is no evidence either way, so the prior P(ai_written) is
        returned instead of dividing by zero.
        """
        ai_written_conditional = self.conditional_term(essay, 'ai_written')
        human_written_conditional = self.conditional_term(essay, 'human_written')
        ai_written_prior = self.prior('ai_written') # P(AW)
        human_written_prior = self.prior('human_written') # P(HW)
        ai_joint = ai_written_conditional * ai_written_prior
        evidence = ai_joint + (human_written_conditional * human_written_prior)
        if evidence == 0:
            return ai_written_prior
        return ai_joint / evidence

In [95]:
# NOTE(review): NaiveBayes treats every column except the class column as a
# feature, so the title and raw essay text columns are part of the model too -
# confirm that is intended.
classifier = NaiveBayes(df=finalized_features_df, class_col='source')

In [101]:
# Cut-off applied to the AI-class likelihood: at or above it a row is counted
# as predicted 'ai_written', below it as predicted 'human_written'.
# NOTE(review): this thresholds conditional_term (a likelihood), not the
# posterior from prob_ai, and it scores the same rows the classifier was
# built from.
ai_cond = 1e-11
human = 0
ai = 0
for row in finalized_features_df.itertuples():
    likelihood_ai = classifier.conditional_term(row, 'ai_written')
    if likelihood_ai >= ai_cond:
        # Predicted AI: count only when the row really is AI-written.
        if row.source == 'ai_written':
            ai += 1
    elif row.source == 'human_written' and likelihood_ai < ai_cond:
        # Predicted human and the row really is human-written.
        human += 1

human + ai

139

In [102]:
# Share of rows counted as correct by the evaluation loop, computed from the
# live counters instead of hand-copied numbers so it stays right after a re-run.
(human + ai) / len(finalized_features_df)

0.9266666666666666