In [1]:
# Imports and one-time resource setup for the whole notebook.
import pandas as pd
# NOTE: `re` below is the third-party `regex` package, not the stdlib `re`;
# later cells use the `\p{...}` Unicode property syntax through this alias.
import regex as re
from pathlib import Path
import nltk
# Fetch the NLTK resources used by the tokenizers and the stopword list.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy
# Shared spaCy pipeline; used by lemmatize_with_spacy.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nisan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nisan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nisan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Shared NLP resources, built once and reused by the helper functions.
STOPWORDS = set(stopwords.words('english'))  # set for fast membership tests
STEMMER = PorterStemmer()

# Pipeline configuration.
INPUT_CSV = "Shakespeare.csv"  # CSV file read into the working DataFrame
TEXT_COL = "PlayerLine"  # column of INPUT_CSV holding the text to preprocess
KEEP_SENTENCE_TOKENIZATION = False  # True: split each row into sentences before word tokenization
USE_LEMMATIZATION = True  # True: spaCy lemmas; False: Porter stems

In [3]:
def validate_script_and_text(s, allowed_script='Latin'):
    """Return True when ``s`` is a non-blank string containing the allowed script.

    Args:
        s: Candidate value. Anything that is not a ``str``, or that is empty
            or whitespace-only, is rejected.
        allowed_script: Unicode script name used for the property test
            (for example ``'Latin'`` or ``'Greek'``).

    Returns:
        bool: ``True`` if ``s`` contains at least one character belonging to
        ``allowed_script``; ``False`` otherwise.
    """
    if not isinstance(s, str) or s.strip() == "":
        return False
    # Build the property class from the argument so that callers passing a
    # different script are actually honoured; the default still yields
    # the original \p{Latin} pattern. Relies on `re` being the `regex` module.
    script_pattern = r'\p{' + allowed_script + '}'
    return bool(re.search(script_pattern, s))

def normalize_whitespace(s):
    """Collapse each run of whitespace in ``s`` to a single space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', s)
    return collapsed.strip()

def remove_special_numbers_punct(s, keep_apostrophe=False):
    """Replace every character outside the allowed set with a single space.

    The allowed set is ASCII letters, ASCII digits and whitespace, plus the
    apostrophe when ``keep_apostrophe`` is true. Despite the function name,
    digits are kept here.

    Args:
        s: Text to clean.
        keep_apostrophe: When true, ``'`` is preserved instead of replaced.

    Returns:
        str: ``s`` with each disallowed character turned into a space.
    """
    disallowed = r"[^0-9A-Za-z'\s]" if keep_apostrophe else r'[^0-9A-Za-z\s]'
    return re.sub(disallowed, ' ', s)

def remove_numbers(s):
    """Delete every run of digits from ``s``; nothing is inserted in their place."""
    digit_run = r'\d+'
    return re.sub(digit_run, '', s)

In [4]:
def preprocess_text_raw(s):
    """Clean one raw text value, or return ``None`` if it fails validation.

    Steps, in order: lower-case, replace disallowed characters with spaces
    (apostrophes are not kept), delete digit runs, collapse whitespace.

    Args:
        s: Raw value to clean.

    Returns:
        str | None: The cleaned text (which may be an empty string), or
        ``None`` when ``validate_script_and_text`` rejects ``s``.
    """
    if validate_script_and_text(s):
        cleaned = remove_special_numbers_punct(s.lower(), keep_apostrophe=False)
        return normalize_whitespace(remove_numbers(cleaned))
    return None

In [5]:
def tokenize_word(s):
    """Split ``s`` into word tokens via NLTK's ``word_tokenize``."""
    word_tokens = word_tokenize(s)
    return word_tokens

def sentence_tokenize(s):
    """Split ``s`` into sentences via NLTK's ``sent_tokenize``."""
    sentences = sent_tokenize(s)
    return sentences

def remove_stopwords(tokens):
    """Return the tokens that are not in ``STOPWORDS``, in their original order.

    Membership is an exact string match, so tokens are compared as given.
    """
    kept = []
    for token in tokens:
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept

def lemmatize_with_spacy(tokens):
    """Return the spaCy lemma of every token in the space-joined ``tokens``.

    The tokens are joined into one string and run through the shared ``nlp``
    pipeline. spaCy tokenizes that string itself, so the result is not
    guaranteed to line up one-to-one with the input list.
    """
    joined = ' '.join(tokens)
    lemmas = []
    for spacy_token in nlp(joined):
        lemmas.append(spacy_token.lemma_)
    return lemmas

def stem_tokens(tokens):
    """Return the stem of each token, using the shared ``STEMMER``."""
    return list(map(STEMMER.stem, tokens))

In [6]:
def segment_text_by_max_chars(s, max_chars=200):
    """Pack the sentences of ``s`` into chunks of at most ``max_chars`` characters.

    Consecutive sentences are joined with a single space for as long as the
    running chunk stays within ``max_chars``. A sentence longer than
    ``max_chars`` is hard-split into fixed-width slices, which may cut
    through the middle of a word.

    Args:
        s: Text to segment.
        max_chars: Maximum chunk length; must be at least 1.

    Returns:
        list[str]: Non-empty chunks in document order.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    # A zero step makes range() raise and a negative step makes it empty,
    # which would silently drop over-long sentences; reject both up front.
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    cur = ""
    for sent in sentence_tokenize(s):
        # The "+ 1" accounts for the joining space.
        if len(cur) + 1 + len(sent) <= max_chars:
            cur = (cur + " " + sent).strip()
            continue

        # The sentence does not fit: flush what has been accumulated so far.
        if cur:
            chunks.append(cur)
            cur = ""

        if len(sent) > max_chars:
            for start in range(0, len(sent), max_chars):
                piece = sent[start:start + max_chars].strip()
                # A slice made only of whitespace strips to "", so skip it
                # rather than emitting an empty chunk.
                if piece:
                    chunks.append(piece)
        else:
            cur = sent

    if cur:
        chunks.append(cur)
    return chunks

In [7]:
# Read every column as text and keep empty cells as '' (no NaN conversion),
# then expose the text column under the fixed name `raw_text`.
df = pd.read_csv(INPUT_CSV, dtype=str, keep_default_na=False).assign(
    raw_text=lambda frame: frame[TEXT_COL].fillna('')
)

In [8]:
# Build `processed`: one entry per row of `df`, each either a flat list of
# tokens (stopwords removed, then lemmatized or stemmed) or None when the row
# yields no usable tokens.
processed = []

for i, row in enumerate(df.itertuples(index=False)):
    # Report progress before any skip logic so that rows without usable text
    # cannot suppress a progress line.
    if i % 5000 == 0:
        print(f"{i} rows processed")

    raw = row.raw_text

    if KEEP_SENTENCE_TOKENIZATION and isinstance(raw, str):
        # Split on the raw text. preprocess_text_raw lower-cases it and
        # replaces punctuation with spaces, which would leave the sentence
        # tokenizer nothing to split on if it ran afterwards.
        pieces = sentence_tokenize(raw)
    else:
        pieces = [raw]

    all_tokens = []

    for piece in pieces:
        sent = preprocess_text_raw(piece)
        if not sent:
            # None (failed validation) or '' (nothing left after cleaning).
            continue

        tokens = tokenize_word(sent)
        tokens = remove_stopwords(tokens)

        if USE_LEMMATIZATION:
            tokens = lemmatize_with_spacy(tokens)
        else:
            tokens = stem_tokens(tokens)

        # Drop empty or whitespace-only tokens.
        all_tokens.extend(t for t in tokens if t.strip())

    processed.append(all_tokens if all_tokens else None)


0 rows processed
5000 rows processed
10000 rows processed
15000 rows processed
20000 rows processed
25000 rows processed
30000 rows processed
35000 rows processed
40000 rows processed
45000 rows processed
50000 rows processed
55000 rows processed
60000 rows processed
65000 rows processed
70000 rows processed
75000 rows processed
80000 rows processed
85000 rows processed
90000 rows processed
95000 rows processed
100000 rows processed
105000 rows processed
110000 rows processed


In [9]:
# Attach the per-row results to the frame.
df['preprocessed_segments'] = processed

# One output row per list element, without missing values, de-duplicated on
# the stripped and lower-cased form, with the length of that form recorded.
# NOTE(review): each entry of `processed` is a list of tokens, so explode()
# yields one row per token and the de-duplication keeps each distinct token
# once -- confirm that a token-level result (not one row per line/segment)
# is what is intended.
df_exploded = (
    df.explode('preprocessed_segments')
    .dropna(subset=['preprocessed_segments'])
    .assign(
        preprocessed_segments_norm=lambda frame: (
            frame['preprocessed_segments'].str.strip().str.lower()
        )
    )
    .drop_duplicates(subset=['preprocessed_segments_norm'])
    .assign(length=lambda frame: frame['preprocessed_segments_norm'].str.len())
)

# Keep only entries longer than two characters.
df_final = df_exploded[df_exploded['length'] > 2]

In [10]:
# Write the final frame as CSV, plain text (one normalized entry per line)
# and Excel into OUT_DIR, creating the directory if needed.
OUT_DIR = Path("processed_output")
OUT_DIR.mkdir(exist_ok=True)

df_final.to_csv(OUT_DIR / "shakespeare_preprocessed_l.csv", index=False, encoding='utf-8')

with (OUT_DIR / "shakespeare_corpus_l.txt").open("w", encoding="utf-8") as f:
    f.writelines(
        line + "\n" for line in df_final['preprocessed_segments_norm'].tolist()
    )

df_final.to_excel(OUT_DIR / "shakespeare_preprocessed_l.xlsx", index=False)

print("Done. Saved to", OUT_DIR)

Done. Saved to processed_output
