In [18]:
import pandas as pd
# Third-party `regex` package bound to the name `re`: this file matches the
# Unicode script class `\p{Latin}`, which the standard-library `re` module
# does not support.
import regex as re
from pathlib import Path
import nltk
# Fetch the NLTK resources needed by the tokenizers and the stop-word list
# imported below (the call only logs a message when a package is already
# up to date).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy
# spaCy pipeline used by `lemmatize_with_spacy`; loaded once at import time.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nisan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nisan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nisan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# CONFIG
# Source file and the column holding the text to preprocess.
INPUT_CSV = "Shakespeare.csv"
TEXT_COL = "PlayerLine"   # change to the column that contains the text

# Pipeline switches read by the processing loop.
KEEP_SENTENCE_TOKENIZATION = False   # True: split each cleaned line into sentences first
USE_LEMMATIZATION = False            # True: spaCy lemmas; False: Porter stemming

# Shared, build-once resources used by the token helpers.
STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()

In [20]:
def validate_script_and_text(s, allowed_script='Latin'):
    """Tell whether ``s`` is usable text containing the expected script.

    Parameters
    ----------
    s : object
        Candidate value; anything that is not a ``str`` is rejected.
    allowed_script : str, default 'Latin'
        Unicode script name placed inside ``\\p{...}``. At least one
        character of this script must occur in ``s``.

    Returns
    -------
    bool
        ``False`` for non-strings and for empty / whitespace-only strings;
        otherwise ``True`` exactly when ``s`` contains a character matching
        ``\\p{allowed_script}``.
    """
    # quick filter: non-strings and blank rows carry no text at all
    if not isinstance(s, str) or s.strip() == "":
        return False
    # crude: one letter of the requested script is enough. The pattern is
    # built from `allowed_script` so the parameter is actually honoured
    # (the default still yields r'\p{Latin}').
    script_pattern = r'\p{' + allowed_script + r'}'
    return bool(re.search(script_pattern, s))

def normalize_whitespace(s):
    """Collapse every run of whitespace in ``s`` to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', s)
    return collapsed.strip()

def remove_special_numbers_punct(s, keep_apostrophe=False):
    """Replace each character outside the allowed set with a single space.

    ASCII letters, ASCII digits and whitespace are always kept; the
    apostrophe is kept as well when ``keep_apostrophe`` is true. Despite the
    function name, digits are NOT removed here.
    """
    disallowed = r"[^0-9A-Za-z'\s]" if keep_apostrophe else r'[^0-9A-Za-z\s]'
    return re.sub(disallowed, ' ', s)

def remove_numbers(s):
    """Delete every run of digits from ``s``; surrounding whitespace is left as is."""
    digit_run = r'\d+'
    return re.sub(digit_run, '', s)

In [21]:
def preprocess_text_raw(s, remove_digits=True):
    """Clean one raw line of text.

    Steps: validate, lowercase, replace punctuation / special characters
    with spaces, optionally strip digits, then collapse whitespace.

    Parameters
    ----------
    s : object
        Raw value; rejected by ``validate_script_and_text`` if it is not
        usable text.
    remove_digits : bool, default True
        When true (the previous, unconditional behaviour) digit runs are
        deleted; pass ``False`` to keep numbers in the output.

    Returns
    -------
    str or None
        The cleaned, non-empty string, or ``None`` when the input is
        rejected or when nothing is left after cleaning.
    """
    # 1. validate
    if not validate_script_and_text(s):
        return None
    # 2. lowercase
    s = s.lower()
    # 3. remove special chars & punctuation (apostrophes included)
    s = remove_special_numbers_punct(s, keep_apostrophe=False)
    # 4. optionally remove digits
    if remove_digits:
        s = remove_numbers(s)
    # 5. normalize whitespace
    s = normalize_whitespace(s)
    # A line can pass validation and still be emptied by step 3 (it only
    # keeps ASCII letters); report that the same way as a rejected line
    # instead of returning "".
    return s or None

In [22]:
def tokenize_word(s):
    """Split ``s`` into word-level tokens with NLTK's ``word_tokenize``."""
    word_tokens = word_tokenize(s)
    return word_tokens

def sentence_tokenize(s):
    """Split ``s`` into sentences with NLTK's ``sent_tokenize``."""
    sentences = sent_tokenize(s)
    return sentences

def remove_stopwords(tokens):
    """Return the tokens that are not in ``STOPWORDS``, preserving their order.

    Membership is an exact, case-sensitive comparison against the set.
    """
    kept = []
    for token in tokens:
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

def lemmatize_with_spacy(tokens):
    """Return the spaCy lemma of every token found in the space-joined ``tokens``.

    NOTE(review): the joined string is tokenized again by ``nlp``, so the
    number of lemmas is presumably not guaranteed to equal ``len(tokens)`` —
    confirm if a 1:1 mapping is required.
    """
    joined_text = ' '.join(tokens)
    lemmas = []
    for spacy_token in nlp(joined_text):
        lemmas.append(spacy_token.lemma_)
    return lemmas

def stem_tokens(tokens):
    """Apply the shared ``STEMMER`` to each token and return the stems as a list."""
    return list(map(STEMMER.stem, tokens))

In [23]:
def segment_text_by_max_chars(s, max_chars=200):
    """Split ``s`` into chunks of at most ``max_chars`` characters.

    Sentences (as returned by ``sentence_tokenize``) are packed greedily
    into a chunk, joined by single spaces. A sentence that is itself longer
    than ``max_chars`` is hard-cut into consecutive slices.

    Parameters
    ----------
    s : str
        Text to segment.
    max_chars : int, default 200
        Upper bound on the length of every returned chunk; must be >= 1.

    Returns
    -------
    list of str
        Non-empty chunks in their original order.

    Raises
    ------
    ValueError
        If ``max_chars`` is smaller than 1. A negative value used to drop
        over-long sentences silently because the slicing range was empty.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    cur = ""
    for sent in sentence_tokenize(s):
        # The +1 accounts for the joining space.
        if len(cur) + 1 + len(sent) <= max_chars:
            cur = (cur + " " + sent).strip()
            continue

        # The sentence does not fit: flush what has been accumulated so far.
        if cur:
            chunks.append(cur)

        if len(sent) > max_chars:
            # hard-cut long sentence; skip slices that are only whitespace
            # so no empty chunk is emitted
            for start in range(0, len(sent), max_chars):
                piece = sent[start:start + max_chars].strip()
                if piece:
                    chunks.append(piece)
            cur = ""
        else:
            cur = sent

    if cur:
        chunks.append(cur)
    return chunks

In [24]:
# Read every column as text and do not turn pandas' default NA marker
# strings into missing values.
read_options = {"dtype": str, "keep_default_na": False}
df = pd.read_csv(INPUT_CSV, **read_options)

# combine columns if needed; here only TEXT_COL is processed, but Play + PlayerLine
# etc. could be concatenated into `raw_text` instead.
df = df.assign(raw_text=df[TEXT_COL].fillna(''))

In [25]:
# One entry per input row: a list of processed tokens, or None when the row
# is rejected or ends up with no tokens.
processed = []

for row_idx, raw_line in enumerate(df['raw_text']):
    cleaned = preprocess_text_raw(raw_line)
    if not cleaned:
        # Rejected / empty rows keep their position (and skip the progress print).
        processed.append(None)
        continue

    # optional sentence split
    # NOTE(review): `cleaned` has already had its punctuation replaced by
    # spaces, so the sentence tokenizer has little to work with here — confirm
    # whether splitting should happen on the raw text instead.
    pieces = sentence_tokenize(cleaned) if KEEP_SENTENCE_TOKENIZATION else [cleaned]

    # stemming OR lemmatization, picked once per row from the config flag
    reduce_tokens = lemmatize_with_spacy if USE_LEMMATIZATION else stem_tokens

    row_tokens = []
    for piece in pieces:
        # word tokenization followed by stop-word removal
        content_tokens = remove_stopwords(tokenize_word(piece))
        # final cleanup: drop tokens that are empty or whitespace-only
        row_tokens += [tok for tok in reduce_tokens(content_tokens) if tok.strip()]

    # store token list
    processed.append(row_tokens or None)

    if row_idx % 5000 == 0:
        print(f"{row_idx} rows processed")


0 rows processed
5000 rows processed
10000 rows processed
15000 rows processed
20000 rows processed
25000 rows processed
30000 rows processed
35000 rows processed
40000 rows processed
45000 rows processed
50000 rows processed
55000 rows processed
60000 rows processed
65000 rows processed
70000 rows processed
75000 rows processed
80000 rows processed
85000 rows processed
90000 rows processed
95000 rows processed
100000 rows processed
105000 rows processed
110000 rows processed


In [26]:
# `processed` holds, per row, either None or a list of tokens. Exploding such
# a list would create one row per *token*, and the de-duplication below would
# then reduce the corpus to a list of unique tokens. Join each token list
# back into a single line so that every row stays one text segment.
# (Entries that are already strings, or None, are passed through unchanged.)
df['preprocessed_segments'] = [
    ' '.join(entry) if isinstance(entry, (list, tuple)) else entry
    for entry in processed
]

# One segment per row already, so no explode is needed; the `df_exploded`
# name is kept for any later cell that refers to it.
df_exploded = df.dropna(subset=['preprocessed_segments'])

# deduplicate normalized text
df_exploded = df_exploded.assign(
    preprocessed_segments_norm=df_exploded['preprocessed_segments'].str.strip().str.lower()
)
df_exploded = df_exploded.drop_duplicates(subset=['preprocessed_segments_norm'])

# final filters (e.g., remove very short lines); `length` is in characters
df_exploded = df_exploded.assign(
    length=df_exploded['preprocessed_segments_norm'].str.len()
)
df_final = df_exploded[df_exploded['length'] > 2]  # tune threshold

In [27]:
OUT_DIR = Path("processed_output")
OUT_DIR.mkdir(exist_ok=True)

csv_path = OUT_DIR / "shakespeare_preprocessed.csv"
corpus_path = OUT_DIR / "shakespeare_corpus.txt"
xlsx_path = OUT_DIR / "shakespeare_preprocessed.xlsx"

# full df to csv
df_final.to_csv(csv_path, index=False, encoding='utf-8')

# plain text corpus: one line per value of `preprocessed_segments_norm`
with open(corpus_path, "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(
        segment + "\n" for segment in df_final['preprocessed_segments_norm'].tolist()
    )

# excel (small datasets only)
df_final.to_excel(xlsx_path, index=False)

print("Done. Saved to", OUT_DIR)

Done. Saved to processed_output
