In [1]:
import functools
import keyword

import numpy as np
import pandas as pd

In [2]:
MIN_WORD_COUNT = 'Min Word Count'
MAX_WORD_COUNT = 'Max Word Count'


def load_essays(path, min_words, max_words):
    """Load one essay CSV and tag every row with its word-count bucket.

    # Arguments
        path: str, location of the CSV file.
        min_words: int, value stored in the MIN_WORD_COUNT column.
        max_words: int, value stored in the MAX_WORD_COUNT column.

    # Returns
        DataFrame without the 'Unnamed: 4' column and with the two
        word-count columns added.
    """
    essays = pd.read_csv(path).drop('Unnamed: 4', axis=1)
    essays[MIN_WORD_COUNT] = min_words
    essays[MAX_WORD_COUNT] = max_words
    return essays


df200 = load_essays('resources/essays200.csv', 150, 200)
df500 = load_essays('resources/essays500.csv', 500, 600)
df1000 = load_essays('resources/essays1000.csv', 800, 1200)

df = pd.concat([df200, df500, df1000])

# Normalise column names to snake_case identifiers. Every replacement is a
# literal substitution, so regex=False is passed explicitly: '(' and ')' are
# regex metacharacters and must not depend on the pandas-version-specific
# default of the `regex` argument.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('-', '_', regex=False)
    # Prefix names that collide with Python keywords so they stay usable
    # as attribute-style identifiers.
    .map(lambda x: 'x' + x if x in keyword.kwlist else x)
)
df.head(5)

  df.columns = df.columns \
  df.columns = df.columns \


Unnamed: 0,essay_title,human_written,ai_written,type,min_word_count,max_word_count
0,Pneumonia: Differential Diagnosis and Primary ...,Penetration of pathogens of pneumonia in the r...,Pneumonia is a common respiratory infection th...,Expository,150,200
1,Relevance and Significance of Communication Te...,The relevance and significance of communicatio...,Communication technology has become an integra...,Persuasive,150,200
2,Technological Objects and Their Capabilities,An innovative home system is one of the unique...,Technological objects have become an integral ...,Expository,150,200
3,Philosophy Teaching and Learning Motivation,Teaching and learning philosophy can be a chal...,Teaching and learning philosophy can be a chal...,Expository,150,200
4,Buddhism and Hinduism: Religious Differences,Buddhism and Hinduism have the same roots. Nev...,Buddhism and Hinduism are two major religions ...,Compare & Contrast,150,200


In [3]:
# Reshape to one row per (essay, author kind): the 'human_written' and
# 'ai_written' text columns become rows, with the originating column name
# stored in 'source' and the text itself in 'essay'.
melted_df = df.melt(
    id_vars=['essay_title', 'type', 'min_word_count'],
    value_vars=['human_written', 'ai_written'],
    var_name='source',
    value_name='essay',
)

def convert_label(label):
    """Map a source name to a numeric label: 1 for 'human_written', otherwise 0."""
    if label == 'human_written':
        return 1
    return 0

# Numeric target derived from the 'source' column (1 = human, 0 = AI).
melted_df['labels'] = melted_df['source'].map(convert_label)

# Order rows by source and then title. reset_index() keeps the previous
# index as an extra 'index' column, which is why the frame has 7 columns.
melted_df = melted_df.sort_values(by=['source', 'essay_title'])
melted_df = melted_df.reset_index()
melted_df.shape

(150, 7)

In [4]:
# Seed for the shuffles below so that Restart & Run All reproduces the same
# row order (and therefore the same displayed samples).
RANDOM_SEED = 42

# melted_df is sorted by 'source', so (assuming both sources contribute the
# same number of rows) the first half holds one source and the second half
# the other. The first TRAIN_ROWS_PER_SOURCE rows of each half go to the
# training set and the remainder of each half to the test set.
TRAIN_ROWS_PER_SOURCE = 60
rows_per_source = len(melted_df) // 2
second_half_train_end = rows_per_source + TRAIN_ROWS_PER_SOURCE

train_df = pd.concat([
    melted_df[:TRAIN_ROWS_PER_SOURCE],
    melted_df[rows_per_source:second_half_train_end],
])
test_df = pd.concat([
    melted_df[TRAIN_ROWS_PER_SOURCE:rows_per_source],
    melted_df[second_half_train_end:],
])

# Shuffle each split, discard the shuffled index, and drop the leftover
# 'index' column that melted_df carries from its earlier reset_index().
shuffled_train_df = (
    train_df
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
    .drop(columns=['index'])
)
shuffled_test_df = (
    test_df
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
    .drop(columns=['index'])
)
shuffled_train_df.head(5)

Unnamed: 0,essay_title,type,min_word_count,source,essay,labels
0,Blanchard and Fiedler Leadership Models,Expository,500,human_written,As opposed to Taylorists who opined that there...,1
1,Energy and Environmental Policies,Expository,800,human_written,Laws are meant to regulate how people behave t...,1
2,Moral issue in business,Analytical,800,human_written,Privacy has been identified to be an integral ...,1
3,Relevance and Significance of Communication Te...,Persuasive,150,ai_written,Communication technology has become an integra...,0
4,Concepts of the Baroque Era,Expository,800,human_written,The Baroque era was a period in the art histor...,1


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# --- Vectorization parameters -------------------------------------------
# (min_n, max_n) sizes of the n-grams extracted from each text.
NGRAM_RANGE = (2, 3)
# Upper limit on the number of features kept by feature selection.
TOP_K = 25000
# Analyzer passed to the vectorizer: tokenize on words.
TOKEN_MODE = 'word'
# Passed as `min_df`: n-grams found in fewer documents are discarded.
MIN_DOCUMENT_FREQUENCY = 4

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as tf-idf weighted n-gram vectors.

    1 text = 1 tf-idf vector over the vocabulary of word n-grams whose sizes
    are given by NGRAM_RANGE, reduced to at most TOP_K selected features.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts (float32).
    """

    kwargs = {
            'ngram_range': NGRAM_RANGE,
            # TfidfVectorizer only supports floating-point dtypes. An integer
            # dtype triggers a UserWarning and is converted to float64 anyway,
            # so request float64 explicitly (same values, no warning).
            'dtype': np.float64,
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,
            'stop_words': 'english',
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts with the vocabulary learned above, so no
    # information from the validation set leaks into the features.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features; k is capped at the number
    # of available features because SelectKBest rejects a larger k.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val



In [6]:
# Fit the vectorizer and feature selector on the training essays only, then
# apply them to the held-out essays.
x_train, x_val = ngram_vectorize(
    train_texts=shuffled_train_df['essay'],
    train_labels=shuffled_train_df['labels'],
    val_texts=shuffled_test_df['essay'],
)

def to_vector(x):
    """Convert a sparse matrix to a dense DataFrame without all-zero columns.

    # Arguments
        x: object exposing `.toarray()` (e.g. a scipy sparse matrix).

    # Returns
        DataFrame holding only the columns with at least one non-zero
        entry; the original positional column labels are kept.
    """
    dense = pd.DataFrame(x.toarray())
    has_nonzero = dense.ne(0).any(axis=0)
    return dense.loc[:, has_nonzero]

vectorized_train_df = to_vector(x_train)
vectorized_test_df = to_vector(x_val)

# Keep only the features that are non-zero somewhere in BOTH splits.
# The intersection is materialised as a sorted list because pandas does not
# accept a set as a column indexer (deprecated, then an error in pandas 2.x),
# and a list also gives both frames the same deterministic column order.
intersecting_columns = sorted(
    set(vectorized_train_df.columns) & set(vectorized_test_df.columns)
)
vectorized_train_df = vectorized_train_df[intersecting_columns]
vectorized_test_df = vectorized_test_df[intersecting_columns]

print(len(vectorized_train_df.columns))

def concatenate_and_cleanup(shuffled_df, vectorized_df):
    """Attach the vectorized features to the essay metadata.

    # Arguments
        shuffled_df: DataFrame with at least the 'source', 'essay_title'
            and 'labels' columns.
        vectorized_df: DataFrame of feature values, row-aligned with
            `shuffled_df` by index.

    # Returns
        DataFrame with the metadata columns (minus 'labels') followed by the
        features renamed to feature_0..feature_{n-1} in sorted original
        column order, with rows sorted by source and essay title and a
        fresh 0..n-1 index. Neither input frame is modified.
    """
    # reindex returns a new frame, so renaming below leaves the caller's
    # vectorized_df untouched.
    features = vectorized_df.reindex(sorted(vectorized_df.columns), axis=1)

    # Assign all names in one step instead of one in-place rename per
    # column, which rebuilt the column index on every iteration.
    features.columns = [f"feature_{i}" for i in range(len(features.columns))]

    combined = pd.concat([shuffled_df, features], axis=1)

    return (
        combined
        .sort_values(by=['source', 'essay_title'])
        .reset_index(drop=True)
        .drop(columns=['labels'])
    )

# Combine essay metadata with the shared feature columns for each split.
train_df_with_features = concatenate_and_cleanup(
    shuffled_df=shuffled_train_df,
    vectorized_df=vectorized_train_df,
)
test_df_with_features = concatenate_and_cleanup(
    shuffled_df=shuffled_test_df,
    vectorized_df=vectorized_test_df,
)




46


  vectorized_train_df = vectorized_train_df[intersecting_columns]
  vectorized_test_df = vectorized_test_df[intersecting_columns]


## Naive Bayes Classifier

In [9]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB

# Essay type names in code order: the integer code of a type is its
# position in this tuple.
_ESSAY_TYPE_ORDER = (
    'Analytical',
    'Compare & Contrast',
    'Personal',
    'Persuasive',
    'Argumentative',
    'Expository',
)

# Lookup from essay type name to its integer code.
essay_types = {name: code for code, name in enumerate(_ESSAY_TYPE_ORDER)}

def drop_non_features(df):
    """Return a model-ready copy of `df`.

    The 'type' column is replaced by its integer code from `essay_types`
    (an unknown type raises KeyError) and the 'essay_title', 'essay' and
    'source' columns are removed. The input frame is left unchanged.
    """
    encoded = df.copy()
    encoded['type'] = encoded['type'].map(lambda essay_type: essay_types[essay_type])
    return encoded.drop(columns=['essay_title', 'essay', 'source'])


# Feature matrices, plus boolean targets that are True for AI-written essays.
X_train = drop_non_features(train_df_with_features)
y_train = train_df_with_features['source'].eq('ai_written')

X_test = drop_non_features(test_df_with_features)
y_test = test_df_with_features['source'].eq('ai_written')

X_train.head()

Unnamed: 0,type,min_word_count,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45
0,4,150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.364311,0.0,0.154726,0.0,0.0,0.205192,0.0,0.0
2,5,800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25798,0.171062,0.0,0.0,0.0,0.455571
3,4,800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.257736,0.0,0.0,0.515472,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Fit a Gaussian Naive Bayes model on the training split and predict the
# held-out split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Accuracy: share of test essays whose predicted class matches the target.
count_correct = sum(1 for pred, corr in zip(y_pred, y_test) if pred == corr)

count_correct / len(y_test)

0.8333333333333334