# Authorship Attribution Petition

Sam McDowell \
12/5/2025

## Preprocessing

### Imports

In [2]:
%pip install spacy
import sys
!{sys.executable} -m spacy download en_core_web_sm

Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.3/12.8 MB 12.6 MB/s eta 0:00:01
     ------------ --------------------------- 3.9/12.8 MB 12.5 MB/s eta 0:00:01
     -------------------- ------------------- 6.6/12.8 MB 12.4 MB/s eta 0:00:01
     ----------------------------- ---------- 9.4/12.8 MB 13.0 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 13.7 MB/s  0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
%load_ext autoreload
%autoreload 1


%aimport features

import pandas as pd
import re
import html
import spacy

import features

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\sam_m\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sam_m\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load the Data

In [17]:
filepath = "./data/blog_authorship_corpus.csv"

df = pd.read_csv(filepath)

In [18]:
# drop unneeded columns
df = df.drop(["sign", "topic", "date", "gender", "age"], axis=1)

In [19]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,681284.0,,,,2397802.403396,1247722.701325,5114.0,1239610.0,2607577.0,3525660.0,4337650.0
text,681284.0,611652.0,urlLink,445.0,,,,,,,


### Select Source Texts

In [20]:
df["text_len"] = df["text"].str.len()
df["word_count"] = df["text"].str.split().str.len()

Find the authors with the most long posts, take their longest posts

In [21]:
# count number of posts longer than 1k words for each user
long_counts = df[df['word_count'] >= 1000].groupby('id').size().rename('long_post_count')

# group by id, filter more than 40 long posts
group_stats = df.groupby('id')["word_count"].agg(["mean","count"]).rename(columns={"mean":"avg_word_count","count":"item_count"})
group_stats = group_stats.join(long_counts, how='left').fillna(0).astype({'long_post_count': int})
group_stats = group_stats[group_stats['long_post_count'] >= 40]
group_stats = group_stats.sort_values("long_post_count", ascending=False)
group_stats.head(50)

# select only 15 authors
group_stats = group_stats.head(15)

In [22]:
# select top 40 longest posts from the 15 authors
top_posts = (
    df[df['id'].isin(group_stats.index)]
    .sort_values(['id', 'word_count'], ascending=[True, False])
    .groupby('id')
    .head(40)
    .reset_index(drop=True)
)

top_posts.head(5)

Unnamed: 0,id,text,text_len,word_count
0,215223,I Can See Clearly Now I went to...,11845,2177
1,215223,"Boozy Ol' Showgirl My Gawd, da...",10842,1951
2,215223,The Long Goodbye Over the weeke...,10177,1911
3,215223,Warning: Gay Homosexual Showtun...,9602,1703
4,215223,Internal Inventory I don't real...,8866,1630


In [23]:
top_posts.info()
top_posts.describe(include='all').T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          600 non-null    int64 
 1   text        600 non-null    object
 2   text_len    600 non-null    int64 
 3   word_count  600 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 18.9+ KB


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,600.0,,,,1630971.733333,960077.637048,215223.0,554681.0,1510754.0,2639424.0,2922061.0
text,600.0,600.0,[You say] the UN is a useless club f...,1.0,,,,,,,
text_len,600.0,,,,12457.49,14829.426587,5409.0,7203.0,8890.0,11906.5,274796.0
word_count,600.0,,,,2238.733333,2749.80065,1023.0,1303.5,1603.0,2148.75,51835.0


In [24]:
filepath = "./data/selected_texts.csv"
top_posts.to_csv(filepath)

### Preprocess Text

In [25]:
filepath = "./data/selected_texts.csv"
df = pd.read_csv(filepath, index_col=0)

In [26]:
# remove all characters except letters from each word
def keep_only_letters(s):
    tokens = []
    for w in s.split():
        # remove leading/trailing non-letters but keep internal characters (e.g. don't -> don't)
        w2 = re.sub(r'^[^A-Za-z]+|[^A-Za-z]+$', '', w)
        if re.search(r'[A-Za-z]', w2):
            tokens.append(w2)
    return " ".join(tokens)

# get number of sentences for each 
nlp = spacy.load("en_core_web_sm", disable=["tagger","parser","ner","lemmatizer"])
nlp.add_pipe("sentencizer")
def sentence_count(text):
    doc = nlp(text or "")
    return sum(1 for _ in doc.sents)


df["prp_text"] = df["text"].copy(deep=True)
# remove html special chars
df["prp_text"] = df["prp_text"].apply(lambda x: html.unescape(x)) 
# remove bad spaces
df["prp_text"] = df["prp_text"].apply(lambda x: x.replace("\u00A0", " ").replace("\xa0", " ")) 
# Remove 'urlLink
df["prp_text"] = df["text"].apply(lambda x: re.sub(r'urlLink', '', x).strip()) 
# remove non letter characters from words
df["prp_text"] = df["prp_text"].apply(keep_only_letters)
# Remove duplicate whitespace
df["prp_text"] = df["prp_text"].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# keep only first 1000 words
df["prp_text"] = df["prp_text"].apply(lambda x: " ".join(x.split()[0:1000]))

df["text_len"] = df["prp_text"].str.len()
df["word_count"] = df["prp_text"].str.split().str.len()
df['sentence_count'] = df['text'].fillna("").apply(sentence_count)

df.describe(include='all')

Unnamed: 0,id,text,text_len,word_count,prp_text,sentence_count
count,600.0,600,600.0,600.0,600,600.0
unique,,600,,,597,
top,,[You say] the UN is a useless club f...,,,Salon.com The confessions of a semi-successful...,
freq,,1,,,2,
mean,1630972.0,,5370.171667,1000.0,,146.433333
std,960077.6,,284.173027,0.0,,194.750795
min,215223.0,,4764.0,1000.0,,14.0
25%,554681.0,,5153.5,1000.0,,75.0
50%,1510754.0,,5321.0,1000.0,,101.0
75%,2639424.0,,5581.25,1000.0,,140.0


In [27]:
# save preprocessing
filename = "./data/preprocessed.csv"
df.to_csv(filename)

## Feature Engineering

In [28]:
df["words"] = df["prp_text"].str.split()
df["sentences"] = df["text"].apply(features._split_sentences)

f = pd.DataFrame()
f["author"] = df["id"]
f["word_count"] = df["word_count"]
f["sentence_count"] = df["sentence_count"]

f.describe(include='all')

Unnamed: 0,author,word_count,sentence_count
count,600.0,600.0,600.0
mean,1630972.0,1000.0,146.433333
std,960077.6,0.0,194.750795
min,215223.0,1000.0,14.0
25%,554681.0,1000.0,75.0
50%,1510754.0,1000.0,101.0
75%,2639424.0,1000.0,140.0
max,2922061.0,1000.0,3284.0


#### VOCABULARY

In [29]:
### VOCABULARY
f["type_token_ratio"] = df["words"].apply(lambda text: features.type_token_ratio(text))
f["vocab_size"] = df["words"].apply(lambda text: features.vocabulary_size(text))

f["stopword_ratio"] = df["words"].apply(features.stopword_ratio)

f["hapax_legomena_ratio"] = df["words"].apply(features.hapax_legomena)
f["hapax_dislegomena_ratio"] = df["words"].apply(features.hapax_dislegomena)

f.describe(include='all')

Unnamed: 0,author,word_count,sentence_count,type_token_ratio,vocab_size,stopword_ratio,hapax_legomena_ratio,hapax_dislegomena_ratio
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,1630972.0,1000.0,146.433333,0.471668,471.668333,0.494942,0.332532,0.065158
std,960077.6,0.0,194.750795,0.052679,52.678998,0.045516,0.059152,0.007935
min,215223.0,1000.0,14.0,0.359,359.0,0.341,0.218,0.046
25%,554681.0,1000.0,75.0,0.43475,434.75,0.46375,0.28975,0.06
50%,1510754.0,1000.0,101.0,0.4715,471.5,0.5,0.3265,0.065
75%,2639424.0,1000.0,140.0,0.505,505.0,0.527,0.368,0.07
max,2922061.0,1000.0,3284.0,0.652,652.0,0.604,0.545,0.106


#### READABILITY

In [30]:
### READABILITY
f["average_num_syllables"] = df["words"].apply(lambda text: features.average_syllables(text))
f["std_num_syllables"] = df["words"].apply(features.std_syllables)

f["average_sent_len"] = df.apply(lambda r: features.average_words_per_sentence(r["sentences"]), axis=1)
f["std_sent_len"] = df.apply(lambda r: features.std_sentence_length(r["sentences"]), axis=1)

f["average_syllables_vocabulary"] = df["words"].apply(lambda text: features.average_syllables_of_vocabulary(text))

f["average_word_len"] = df["words"].apply(features.average_word_len)
f["std_word_len"] = df["words"].apply(features.std_word_length)

In [31]:
f["flesch_reading_ease"] = df.apply(lambda r: features.flesch_reading_ease(r["sentences"], r["words"]), axis=1)
f["dale_chall_reading_ease"] = df.apply(lambda r: features.dale_chall_reading_ease(r["words"], r["sentence_count"]), axis=1)
f["gunning_fog_index"] = df.apply(lambda r: features.gunning_fog_index(r["words"], r["sentence_count"]), axis=1)

In [32]:
f.describe(include='all')

Unnamed: 0,author,word_count,sentence_count,type_token_ratio,vocab_size,stopword_ratio,hapax_legomena_ratio,hapax_dislegomena_ratio,average_num_syllables,std_num_syllables,average_sent_len,std_sent_len,average_syllables_vocabulary,average_word_len,std_word_len,flesch_reading_ease,dale_chall_reading_ease,gunning_fog_index
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,1630972.0,1000.0,146.433333,0.471668,471.668333,0.494942,0.332532,0.065158,1.406312,0.734485,73.806723,53.630812,1.669337,4.371172,2.38832,77.333621,19.940943,7.568934
std,960077.6,0.0,194.750795,0.052679,52.678998,0.045516,0.059152,0.007935,0.100124,0.111353,29.585039,30.337447,0.129046,0.284173,0.272794,10.811252,0.27168,2.797765
min,215223.0,1000.0,14.0,0.359,359.0,0.341,0.218,0.046,1.2,0.481876,20.538244,22.375227,1.382927,3.765,1.83303,16.4872,19.441604,0.961803
25%,554681.0,1000.0,75.0,0.43475,434.75,0.46375,0.28975,0.06,1.335,0.65745,55.554299,38.416563,1.581531,4.1545,2.20581,70.556982,19.780786,5.740444
50%,1510754.0,1000.0,101.0,0.4715,471.5,0.5,0.3265,0.065,1.383,0.718122,67.966235,48.777188,1.648899,4.322,2.341955,78.527084,19.917589,7.41193
75%,2639424.0,1000.0,140.0,0.505,505.0,0.527,0.368,0.07,1.47325,0.811875,85.744156,60.994252,1.757138,4.58225,2.554922,85.437283,20.087833,9.278414
max,2922061.0,1000.0,3284.0,0.652,652.0,0.604,0.545,0.106,1.742,1.205585,425.857143,378.189896,2.103131,5.158,3.474147,102.975526,22.969357,31.891429


#### POS Tagging

In [33]:
f_pos = df["words"].apply(features.pos_counts).apply(pd.Series)
f_pos.columns = [c.lower()+"_count" for c in f_pos.columns]

f_punc = df["text"].apply(features.punc_counts).apply(pd.Series)
f_punc.columns = [c.lower()+"_count" for c in f_punc.columns]

f_sent = df.apply(lambda r: features.sentence_type_counts(r["sentences"]), axis=1).apply(pd.Series)
f_sent.columns = [c.lower()+"_count" for c in f_sent.columns]

In [34]:
f = pd.concat([f,f_pos,f_punc,f_sent], axis=1)
f.describe(include='all')

Unnamed: 0,author,word_count,sentence_count,type_token_ratio,vocab_size,stopword_ratio,hapax_legomena_ratio,hapax_dislegomena_ratio,average_num_syllables,std_num_syllables,...,propn_count,sym_count,x_count,period_count,exclam_count,quest_count,simple_count,compound_count,complex_count,compound_complex_count
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,...,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,1630972.0,1000.0,146.433333,0.471668,471.668333,0.494942,0.332532,0.065158,1.406312,0.734485,...,69.536667,0.546667,0.556667,187.118333,8.445,11.751667,75.688333,20.443333,28.911667,21.39
std,960077.6,0.0,194.750795,0.052679,52.678998,0.045516,0.059152,0.007935,0.100124,0.111353,...,45.414492,1.286045,0.893375,221.852771,17.418361,17.760322,122.964487,24.361498,36.523149,18.462286
min,215223.0,1000.0,14.0,0.359,359.0,0.341,0.218,0.046,1.2,0.481876,...,4.0,0.0,0.0,27.0,0.0,0.0,2.0,0.0,1.0,4.0
25%,554681.0,1000.0,75.0,0.43475,434.75,0.46375,0.28975,0.06,1.335,0.65745,...,40.0,0.0,0.0,87.0,0.0,3.0,29.0,10.0,16.0,14.0
50%,1510754.0,1000.0,101.0,0.4715,471.5,0.5,0.3265,0.065,1.383,0.718122,...,59.5,0.0,0.0,118.5,3.0,7.0,45.0,15.0,22.0,18.0
75%,2639424.0,1000.0,140.0,0.505,505.0,0.527,0.368,0.07,1.47325,0.811875,...,90.0,1.0,1.0,196.25,10.0,14.0,68.25,22.0,32.0,23.0
max,2922061.0,1000.0,3284.0,0.652,652.0,0.604,0.545,0.106,1.742,1.205585,...,357.0,14.0,6.0,2888.0,217.0,230.0,1871.0,370.0,705.0,338.0


#### SAVE

In [35]:
f.to_csv("./data/features.csv")
f.describe(include='all')

Unnamed: 0,author,word_count,sentence_count,type_token_ratio,vocab_size,stopword_ratio,hapax_legomena_ratio,hapax_dislegomena_ratio,average_num_syllables,std_num_syllables,...,propn_count,sym_count,x_count,period_count,exclam_count,quest_count,simple_count,compound_count,complex_count,compound_complex_count
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,...,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,1630972.0,1000.0,146.433333,0.471668,471.668333,0.494942,0.332532,0.065158,1.406312,0.734485,...,69.536667,0.546667,0.556667,187.118333,8.445,11.751667,75.688333,20.443333,28.911667,21.39
std,960077.6,0.0,194.750795,0.052679,52.678998,0.045516,0.059152,0.007935,0.100124,0.111353,...,45.414492,1.286045,0.893375,221.852771,17.418361,17.760322,122.964487,24.361498,36.523149,18.462286
min,215223.0,1000.0,14.0,0.359,359.0,0.341,0.218,0.046,1.2,0.481876,...,4.0,0.0,0.0,27.0,0.0,0.0,2.0,0.0,1.0,4.0
25%,554681.0,1000.0,75.0,0.43475,434.75,0.46375,0.28975,0.06,1.335,0.65745,...,40.0,0.0,0.0,87.0,0.0,3.0,29.0,10.0,16.0,14.0
50%,1510754.0,1000.0,101.0,0.4715,471.5,0.5,0.3265,0.065,1.383,0.718122,...,59.5,0.0,0.0,118.5,3.0,7.0,45.0,15.0,22.0,18.0
75%,2639424.0,1000.0,140.0,0.505,505.0,0.527,0.368,0.07,1.47325,0.811875,...,90.0,1.0,1.0,196.25,10.0,14.0,68.25,22.0,32.0,23.0
max,2922061.0,1000.0,3284.0,0.652,652.0,0.604,0.545,0.106,1.742,1.205585,...,357.0,14.0,6.0,2888.0,217.0,230.0,1871.0,370.0,705.0,338.0
