In [1]:
import pandas as pd

In [2]:
# Column names for the word-count range attached to every essay.
MIN_WORD_COUNT = 'Min Word Count'
MAX_WORD_COUNT = 'Max Word Count'


def load_essays(csv_path, min_words, max_words):
    """Read one essay CSV and tag every row with a word-count range.

    # Arguments
        csv_path: str, path of the CSV file to read.
        min_words: int, value stored in the 'Min Word Count' column.
        max_words: int, value stored in the 'Max Word Count' column.

    # Returns
        DataFrame with the file's columns minus 'Unnamed: 4', plus the two
        word-count columns.

    # Raises
        KeyError: if the file has no 'Unnamed: 4' column.
    """
    # 'Unnamed: 4' is the name pandas assigns to a column with a blank header;
    # presumably a stray empty trailing column in the files - TODO confirm.
    essays = pd.read_csv(csv_path).drop('Unnamed: 4', axis=1)
    essays[MIN_WORD_COUNT] = min_words
    essays[MAX_WORD_COUNT] = max_words
    return essays


df200 = load_essays('essays200.csv', 150, 200)
df500 = load_essays('essays500.csv', 500, 600)
df1000 = load_essays('essays1000.csv', 800, 1200)

# No ignore_index here, so each file's own row labels are kept and repeat
# across the three parts.
df = pd.concat([df200, df500, df1000])
df.head(5)

Unnamed: 0,Essay Title,Human-Written,AI-Written,Type,Min Word Count,Max Word Count
0,Pneumonia: Differential Diagnosis and Primary ...,Penetration of pathogens of pneumonia in the r...,Pneumonia is a common respiratory infection th...,Expository,150,200
1,Relevance and Significance of Communication Te...,The relevance and significance of communicatio...,Communication technology has become an integra...,Persuasive,150,200
2,Technological Objects and Their Capabilities,An innovative home system is one of the unique...,Technological objects have become an integral ...,Expository,150,200
3,Philosophy Teaching and Learning Motivation,Teaching and learning philosophy can be a chal...,Teaching and learning philosophy can be a chal...,Expository,150,200
4,Buddhism and Hinduism: Religious Differences,Buddhism and Hinduism have the same roots. Nev...,Buddhism and Hinduism are two major religions ...,Compare & Contrast,150,200


In [3]:
# Reshape wide -> long: one row per (essay title, author kind). The text lands
# in 'Essay' and the name of the column it came from lands in 'Source'.
# Columns not listed in id_vars / value_vars are not carried over.
melted_df = df.melt(
    id_vars=['Essay Title', 'Type', 'Min Word Count'],
    value_vars=['Human-Written', 'AI-Written'],
    var_name='Source',
    value_name='Essay',
)

def convert_label(label):
    """Return 1 when `label` equals 'Human-Written', otherwise 0."""
    is_human = label == 'Human-Written'
    return int(is_human)

# Binary target derived from 'Source' (1 = human-written, 0 = anything else).
melted_df['Labels'] = melted_df['Source'].map(convert_label)

# Order by source, then title. reset_index() is called without drop=True, so
# the pre-sort row label is kept as an extra 'index' column.
melted_df = (
    melted_df
    .sort_values(by=['Source', 'Essay Title'])
    .reset_index()
)
melted_df.shape

(150, 7)

In [4]:
# melted_df is sorted by Source and then by title, so the first
# ESSAYS_PER_SOURCE rows belong to one source and the rest to the other.
# Within each source the first TRAIN_PER_SOURCE titles go to training and the
# remaining ones to testing.
ESSAYS_PER_SOURCE = 75
TRAIN_PER_SOURCE = 60
SHUFFLE_SEED = 42  # fixed so the shuffled order is the same on every run

# The slice boundaries below are only meaningful for this exact size.
assert len(melted_df) == 2 * ESSAYS_PER_SOURCE, (
    'split boundaries assume %d essays per source' % ESSAYS_PER_SOURCE
)

train_df = pd.concat([
    melted_df[:TRAIN_PER_SOURCE],
    melted_df[ESSAYS_PER_SOURCE:ESSAYS_PER_SOURCE + TRAIN_PER_SOURCE],
])
test_df = pd.concat([
    melted_df[TRAIN_PER_SOURCE:ESSAYS_PER_SOURCE],
    melted_df[ESSAYS_PER_SOURCE + TRAIN_PER_SOURCE:],
])

# The frame already has an 'index' column, so reset_index() stores the
# pre-shuffle row label under 'level_0'; the drop then removes the older
# 'index' column and 'level_0' stays in the result.
shuffled_train_df = (
    train_df
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index()
    .drop('index', axis=1)
)
shuffled_test_df = (
    test_df
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index()
    .drop('index', axis=1)
)
shuffled_train_df.head(5)

Unnamed: 0,level_0,Essay Title,Type,Min Word Count,Source,Essay,Labels
0,1,Advertising Ethics in the Modern Consumer Soci...,Argumentative,800,AI-Written,Advertising is an essential aspect of modern c...,0
1,97,Game Store’s Marketing and Financial Plan,Expository,500,Human-Written,Game store is a business that specializes with...,1
2,51,Relevance and Significance of Communication Te...,Persuasive,150,AI-Written,Communication technology has become an integra...,0
3,116,Moral issue in business,Analytical,800,Human-Written,Privacy has been identified to be an integral ...,1
4,12,Comparison Between Islamic Story of Creation a...,Expository,500,AI-Written,The Islamic story of creation and the Big Bang...,0


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# (min_n, max_n) sizes of the n-grams passed as the vectorizer's `ngram_range`.
NGRAM_RANGE = (2, 7)
# Upper bound on the number of features kept by feature selection.
TOP_K = 15000
# Passed as the vectorizer's `analyzer`: n-grams are built from words.
TOKEN_MODE = 'word'
# Passed as the vectorizer's `min_df`.
MIN_DOCUMENT_FREQUENCY = 7

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as tf-idf weighted n-gram vectors.

    1 text = 1 tf-idf vector over the n-grams whose sizes are given by
    NGRAM_RANGE, reduced to at most TOP_K features. The vocabulary and the
    feature selector are fit on the training texts only; the validation texts
    are only transformed.

    # Arguments
        train_texts: iterable of str, training text strings.
        train_labels: array-like, training labels (one per training text).
        val_texts: iterable of str, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts, cast to
        float32, with the same selected feature columns in both.
    """

    # No 'dtype' entry on purpose: tf-idf weights are fractional and
    # scikit-learn only honours floating-point dtypes for this vectorizer. The
    # previous 'int32' setting made it warn and convert to float64, which is
    # also the default, so the resulting values are unchanged.
    kwargs = {
            'ngram_range': NGRAM_RANGE,
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,
            'stop_words': 'english',
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts with the vocabulary learned above.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features; k is capped at the vocabulary
    # size so a small vocabulary does not make SelectKBest fail.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val



In [6]:
x_train, x_val = ngram_vectorize(
    shuffled_train_df['Essay'],
    shuffled_train_df['Labels'],
    shuffled_test_df['Essay'],
)

# Convert the vectorizer output to dense frames, one column per selected feature.
vectorized_train_df = pd.DataFrame(x_train.toarray())
vectorized_val_df = pd.DataFrame(x_val.toarray())

# Training rows first, then validation rows. No ignore_index, so each part
# keeps its own 0-based row labels.
vectorized_df = pd.concat([vectorized_train_df, vectorized_val_df])

vectorized_df.shape



(150, 15)

In [7]:
# drop all columns with all zeros
vectorized_df = vectorized_df.loc[:, (vectorized_df != 0).any(axis=0)]

# Same order as the feature rows: training essays first, then test essays.
shuffled_df = pd.concat([shuffled_train_df, shuffled_test_df])

# The two frames are matched row by row, so they must have the same length.
assert len(shuffled_df) == len(vectorized_df), (
    'essay rows and feature rows must line up one-to-one'
)

# Both frames carry repeated row labels (each part restarts at 0). Renumbering
# them makes the column-wise concat purely positional instead of depending on
# the two duplicated indexes being identical.
finalized_features_df = pd.concat(
    [shuffled_df.reset_index(drop=True), vectorized_df.reset_index(drop=True)],
    axis=1,
)

finalized_features_df.head(5)

Unnamed: 0,level_0,Essay Title,Type,Min Word Count,Source,Essay,Labels,0,1,2,...,5,6,7,8,9,10,11,12,13,14
0,1,Advertising Ethics in the Modern Consumer Soci...,Argumentative,800,AI-Written,Advertising is an essential aspect of modern c...,0,0.0,0.0,0.26975,...,0.303183,0.321088,0.0,0.0,0.663195,0.281665,0.0,0.0,0.0,0.0
1,97,Game Store’s Marketing and Financial Plan,Expository,500,Human-Written,Game store is a business that specializes with...,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,51,Relevance and Significance of Communication Te...,Persuasive,150,AI-Written,Communication technology has become an integra...,0,0.695631,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.718399,0.0,0.0
3,116,Moral issue in business,Analytical,800,Human-Written,Privacy has been identified to be an integral ...,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12,Comparison Between Islamic Story of Creation a...,Expository,500,AI-Written,The Islamic story of creation and the Big Bang...,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Sort by source, then by essay title, and renumber the rows.
# reset_index() writes the old row labels to an 'index' column, which is
# dropped again together with 'Labels'.
# Not re-runnable as-is: a second run raises because 'Labels' is already gone.
finalized_features_df = (
    finalized_features_df
    .sort_values(by=['Source', 'Essay Title'])
    .reset_index()
    .drop(columns=['index', 'Labels'])
)
finalized_features_df.head(5)

Unnamed: 0,level_0,Essay Title,Type,Min Word Count,Source,Essay,0,1,2,3,...,5,6,7,8,9,10,11,12,13,14
0,0,Adaptive Immunity: T-Cells and B-Cells,Argumentative,150,AI-Written,Adaptive immunity is a type of immune response...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Advertising Ethics in the Modern Consumer Soci...,Argumentative,800,AI-Written,Advertising is an essential aspect of modern c...,0.0,0.0,0.26975,0.321088,...,0.303183,0.321088,0.0,0.0,0.663195,0.281665,0.0,0.0,0.0,0.0
2,2,American Health Care System,Expository,800,AI-Written,The American health care system is a complex a...,0.0,0.0,0.0,0.0,...,0.25634,0.0,0.0,0.0,0.0,0.476292,0.0,0.0,0.0,0.841092
3,3,Arranged Marriage and Its Ethical Dilemma,Argumentative,800,AI-Written,Arranged marriage has been a longstanding prac...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Assistive Technology for Kids with Learning Di...,Expository,800,AI-Written,Children with learning disabilities often face...,0.0,0.0,0.631054,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Preview five rows starting at position 25.
finalized_features_df.iloc[25:30]

Unnamed: 0,level_0,Essay Title,Type,Min Word Count,Source,Essay,0,1,2,3,...,5,6,7,8,9,10,11,12,13,14
25,25,How to Develop Self Confidence,Expository,500,AI-Written,Self-confidence is an essential trait that ena...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,26,Human Resource Development: Positive and Negat...,Personal,150,AI-Written,Human Resource Development (HRD) is a critical...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27,27,Improving the Management Problems Within the O...,Expository,800,AI-Written,Management problems can cause significant issu...,0.0,0.0,0.52128,0.620487,...,0.585887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,28,Inclusive Education’ Benefits,Argumentative,800,AI-Written,Inclusive education is a type of education sys...,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0
29,29,Injury Control: Enhancing Car Seat and Seatbel...,Expository,150,AI-Written,Injury control is an essential part of public ...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
