Tasks: <br>
<br>
1- get the full lemmatized word list <br>
2- get the 95% coverage word list <br>
3- remove stop words from 95% swl <br>
4- compare ngsl coverage on full lemmatized word list

- make 95% the default coverage threshold, and make it changeable

### Requirements

In [None]:
import numpy as np
import pandas as pd
import nltk
import spacy
import stanza
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

# NLTK resources, each requested once (the original requested 'punkt' twice).
# NOTE(review): 'averaged_perceptron_tagger' is not referenced by the code visible in this
# notebook; it is kept in case another part of the project relies on it.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# English resources for Stanza.
stanza.download('en')
# spaCy pipeline used by lemmatize().
nlp1 = spacy.load("en_core_web_sm")
# Stanza pipeline restricted to tokenization and POS tagging; used by remove_proper_nouns().
nlp2 = stanza.Pipeline(lang='en', processors='tokenize,pos')


[nltk_data] Downloading package punkt to /Users/soum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/soum/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/soum/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/soum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/soum/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-04-25 18:32:09 INFO: Downloaded file to /Users/soum/stanza_resources/resources.json
2025-04-25 18:32:09 INFO: Downloading default packages for language: en (English) ...
2025-04-25 18:32:11 INFO: File exists: /Users/soum/stanza_resources/en/default.zip


### Main Functions

In [None]:
def load_text_file(file_path):
    """Read a UTF-8 text file and return its contents as one line.

    Every newline character in the file is replaced by a single space.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as a string with no line breaks.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        pieces = handle.read().split("\n")
    # Joining the newline-separated pieces with spaces swaps each "\n" for " ".
    return " ".join(pieces)

In [None]:
def tokenize_text(text):
    """Split text into a flat list of lower-cased tokens.

    The text is segmented into sentences with ``sent_tokenize`` and each sentence is split
    with ``word_tokenize``; tokens are returned in document order.

    Args:
        text: The text to tokenize.

    Returns:
        A list of lower-cased token strings.
    """
    return [
        token.lower()
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
    ]

In [None]:
def generate_frequncy_df(words):
    """Build a word-frequency table from a sequence of tokens.

    Args:
        words: Sequence of token strings.

    Returns:
        DataFrame with columns ``word`` and ``count``, ordered by ``count`` descending,
        restricted to tokens made up entirely of alphabetic characters, with a fresh
        0..n-1 index.
    """
    frequency_table = (
        pd.DataFrame({'word': words})
        .groupby('word')['word']
        .count()
        .reset_index(name='count')
        .sort_values(by='count', ascending=False)
    )
    # Non-alphabetic tokens (punctuation, numbers, mixed forms) are discarded after counting.
    alphabetic_only = frequency_table['word'].str.isalpha()
    return frequency_table[alphabetic_only].reset_index(drop=True)

In [None]:
def lemmatize(df):
    """Collapse a word-frequency table into a lemma-frequency table.

    Each entry of ``df['word']`` is processed by the module-level spaCy pipeline ``nlp1``.
    The lemma of the entry's first token becomes the key, and the counts of all entries
    sharing a lemma are summed. Entries whose first token looks like a number
    (``token.like_num``) and entries that produce no tokens are skipped.

    Args:
        df: DataFrame with a ``word`` column of strings and a ``count`` column.

    Returns:
        DataFrame with columns ``word`` (the lemma) and ``count``, sorted by ``count``
        descending with a fresh 0..n-1 index.
    """
    lemma_freq = {}
    # nlp1.pipe processes the words in batches instead of one pipeline call per row, and
    # zip pairs each resulting Doc with its count without building a Series per row.
    for doc, count in zip(nlp1.pipe(df['word']), df['count']):
        # A string that yields no tokens (e.g. "") would make doc[0] raise IndexError.
        if len(doc) == 0:
            continue
        token = doc[0]
        if token.like_num:
            continue
        lemma = token.lemma_
        lemma_freq[lemma] = lemma_freq.get(lemma, 0) + count

    grouped_df = pd.DataFrame(list(lemma_freq.items()), columns=['word', 'count'])
    grouped_df = grouped_df.sort_values(by='count', ascending=False).reset_index(drop=True)

    return grouped_df

In [None]:
def get_swl(df, coverage: float = 0.95):
    """Return the shortest leading part of a frequency table that reaches ``coverage``.

    Rows are taken in the order given, so ``df`` is expected to be sorted by ``count``
    descending. The row whose count makes the running total reach or cross the target is
    included, so the returned list covers at least ``coverage`` of all occurrences
    (the previous ``<=`` filter stopped one word short and could return an empty list).

    Args:
        df: DataFrame with a numeric ``count`` column. It is not modified.
        coverage: Target share of total occurrences, between 0.0 and 1.0.

    Returns:
        A DataFrame holding the selected rows plus a ``cumulative_coverage`` column.

    Raises:
        ValueError: If ``coverage`` is outside [0.0, 1.0].
    """
    if not 0.0 <= coverage <= 1.0:
        raise ValueError("`coverage` must be between 0.0 and 1.0")

    df = df.copy()
    df['cumulative_coverage'] = df['count'].cumsum() / df['count'].sum()

    # Coverage already reached *before* each row; a row is needed while this is below target.
    covered_before = df['cumulative_coverage'].shift(1, fill_value=0.0)
    # notna() guards the all-zero-count case, where the ratio is 0/0 for every row.
    needed = (covered_before < coverage) & df['cumulative_coverage'].notna()
    swl = df[needed]
    return swl

In [None]:
def remove_stopwords(text):
    """Drop the rows whose ``word`` is in NLTK's English stop-word list.

    Args:
        text: DataFrame with a ``word`` column.

    Returns:
        A filtered DataFrame with a fresh 0..n-1 index.
    """
    english_stop_words = set(stopwords.words('english'))
    is_stop_word = text['word'].isin(english_stop_words)
    kept_rows = text[~is_stop_word]
    return kept_rows.reset_index(drop=True)

In [None]:
def remove_proper_nouns(df):
    """Drop the rows whose ``word`` is tagged as a proper noun by the Stanza pipeline ``nlp2``.

    A row is dropped when any word in any sentence of the parsed entry has the universal
    POS tag ``PROPN``.

    Args:
        df: DataFrame with a ``word`` column of strings.

    Returns:
        The remaining rows with a fresh 0..n-1 index.
    """
    def _has_proper_noun(entry):
        # Scan every word of every sentence Stanza produced for this entry.
        parsed = nlp2(entry)
        for sentence in parsed.sentences:
            for word in sentence.words:
                if word.upos == 'PROPN':
                    return True
        return False

    flagged_labels = [
        label for label, entry in df['word'].items() if _has_proper_noun(entry)
    ]
    return df.drop(flagged_labels).reset_index(drop=True)

In [None]:
def coverage(new_series, original_series):
    """Print and return the percentage of ``original_series``' total held by ``new_series``.

    The previous implementation returned the result of ``print`` (always ``None``), so
    callers that stored the return value never received the number.

    Args:
        new_series: Object with a ``.sum()`` method (e.g. a pandas Series of counts).
        original_series: Object with a ``.sum()`` method giving the reference total.

    Returns:
        ``new_series.sum() / original_series.sum() * 100`` as a float.
    """
    new_sum = new_series.sum()
    original_sum = original_series.sum()
    # Named `percentage` so the local does not shadow this function's own name.
    percentage = (new_sum / original_sum) * 100
    print("Coverage: ", percentage, "%")
    return float(percentage)

### Aggregate Functions

In [None]:
def get_origianl_swl(file_path):
    """Build the raw (un-lemmatized) word-frequency table for a text file.

    Runs ``load_text_file``, ``tokenize_text`` and ``generate_frequncy_df`` in sequence.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        The frequency DataFrame produced by ``generate_frequncy_df``.
    """
    raw_text = load_text_file(file_path)
    token_list = tokenize_text(raw_text)
    return generate_frequncy_df(token_list)

In [None]:
def get_lemmatized_swl(file_path):
    """Build the lemma-frequency table for a text file.

    The file is loaded, tokenized and counted (the same steps ``get_origianl_swl``
    performs), and the resulting frequency table is passed to ``lemmatize``.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        The lemma-frequency DataFrame produced by ``lemmatize``.
    """
    return lemmatize(get_origianl_swl(file_path))

In [None]:
def get_95_swl(lemmatized_df,
               remove_sw: bool = False,
               remove_pn: bool = False,
               coverage: float = 0.95):
    """Select a coverage-based word list and optionally filter it.

    Args:
        lemmatized_df: Frequency DataFrame with ``word`` and ``count`` columns.
        remove_sw: If True, drop stop words via ``remove_stopwords``.
        remove_pn: If True, drop proper nouns via ``remove_proper_nouns``.
        coverage: Coverage threshold forwarded to ``get_swl``. Defaults to 0.95, which is
            the value that was previously hard-coded, so existing calls are unaffected.

    Returns:
        The DataFrame returned by ``get_swl`` after the requested filters.

    Raises:
        ValueError: Propagated from ``get_swl`` when ``coverage`` is outside [0.0, 1.0].
    """
    # Select the word list for the requested coverage.
    swl = get_swl(lemmatized_df, coverage)

    # Filters are applied after selection, so they shrink the list without re-selecting.
    if remove_sw:
        swl = remove_stopwords(swl)

    if remove_pn:
        swl = remove_proper_nouns(swl)

    return swl

In [None]:
def compare_ngsl_lemmatized(lemmatized_df, ngsl_path: str = r"data/ngsl-v1.2.csv"):
    """Print and return the share of corpus occurrences whose word is in the NGSL list.

    Args:
        lemmatized_df: Frequency DataFrame with ``word`` and ``count`` columns.
        ngsl_path: Path of the NGSL CSV, which must have a ``Lemma`` column. Defaults to
            the path that was previously hard-coded.

    Returns:
        The coverage as a percentage (float); 0.0 when ``lemmatized_df`` has no counts.
    """
    ngsl = pd.read_csv(ngsl_path)

    # Membership test instead of an inner merge: a lemma listed more than once in the CSV
    # would duplicate the matching corpus row in a merge and inflate the coverage.
    in_ngsl = lemmatized_df['word'].isin(ngsl['Lemma'])
    matched = lemmatized_df.loc[in_ngsl, 'count'].sum()
    total = lemmatized_df['count'].sum()

    # An empty or all-zero table would otherwise give 0/0.
    coverage_pct = float((matched / total) * 100) if total else 0.0
    print(f"Coverage of NGSL in lemmatized SWL: {coverage_pct:.2f}%")
    return coverage_pct

In [None]:
def compare_nawl_lemmatized(lemmatized_df, nawl_path: str = r"data/NAWL-1.0.csv"):
    """Print and return the share of corpus occurrences whose word is in the NAWL list.

    Args:
        lemmatized_df: Frequency DataFrame with ``word`` and ``count`` columns.
        nawl_path: Path of the NAWL CSV, which must have a ``Word`` column. Defaults to
            the path that was previously hard-coded.

    Returns:
        The coverage as a percentage (float); 0.0 when ``lemmatized_df`` has no counts.
    """
    nawl = pd.read_csv(nawl_path)

    # Membership test instead of an inner merge: a word listed more than once in the CSV
    # would duplicate the matching corpus row in a merge and inflate the coverage.
    in_nawl = lemmatized_df['word'].isin(nawl['Word'])
    matched = lemmatized_df.loc[in_nawl, 'count'].sum()
    total = lemmatized_df['count'].sum()

    # An empty or all-zero table would otherwise give 0/0.
    coverage_pct = float((matched / total) * 100) if total else 0.0
    print(f"Coverage of NAWL in lemmatized SWL: {coverage_pct:.2f}%")
    return coverage_pct

### Text Data Management and Analysis

In [None]:
# Raw (un-lemmatized) pipeline for data/zhai.txt: load the text, tokenize it, count tokens.
# NOTE: `zhai_df` holds the raw text string here, not a DataFrame.
zhai_df = load_text_file(r"data/zhai.txt")

zhai_tokens = tokenize_text(zhai_df)

# `zhai_tokens` is rebound from the token list to the frequency DataFrame.
zhai_tokens = generate_frequncy_df(zhai_tokens)

# Number of distinct alphabetic word forms before lemmatization.
len(zhai_tokens)

In [None]:
# Lemma-frequency table for the Zhai text (rebinds `zhai_df`).
zhai_df = get_lemmatized_swl(r"data/zhai.txt")

In [None]:
# Number of distinct lemmas, then the ten most frequent ones.
print(len(zhai_df))
zhai_df.head(10)

In [None]:
# Word list selected by get_95_swl with its defaults (no stop-word or proper-noun removal).
zhai_swl = get_95_swl(zhai_df)

In [None]:
# Size of the selected word list, then its first ten rows.
print(len(zhai_swl))
zhai_swl.head(10)

In [None]:
# Same selection, with stop words removed afterwards.
zhai_swl_no_sw = get_95_swl(zhai_df, remove_sw=True)

In [None]:
# Size of the stop-word-free list, then its first ten rows.
print(len(zhai_swl_no_sw))
zhai_swl_no_sw.head(10)

In [None]:
# NOTE(review): the variable name says NAWL but the call computes NGSL coverage. Confirm
# whether compare_nawl_lemmatized was intended or the variable should be named *_ngsl_*.
zhai_nawl_coverage = compare_ngsl_lemmatized(zhai_df)

In [None]:
# Percentage of all lemma occurrences accounted for by the selected word list.
zhai_swl_coverage = coverage(zhai_swl['count'], zhai_df['count'])

### Alice in Wonderland

In [None]:
# Raw (un-lemmatized) pipeline for data/alice.txt: load the text, tokenize it, count tokens.
# NOTE: `alice_df` holds the raw text string here, not a DataFrame.
alice_df = load_text_file(r"data/alice.txt")

alice_tokens = tokenize_text(alice_df)

# `alice_tokens` is rebound from the token list to the frequency DataFrame.
alice_tokens = generate_frequncy_df(alice_tokens)

# Number of distinct alphabetic word forms before lemmatization.
len(alice_tokens)

In [None]:
# Lemma-frequency table for the Alice text (rebinds `alice_df`).
alice_df = get_lemmatized_swl(r"data/alice.txt")

In [None]:
# Number of distinct lemmas, then the ten most frequent ones.
print(len(alice_df))
alice_df.head(10)

In [None]:
# Word list selected by get_95_swl with its defaults (no stop-word or proper-noun removal).
alice_swl = get_95_swl(alice_df)

In [None]:
# Size of the selected word list, then its first ten rows.
print(len(alice_swl))
alice_swl.head(10)

In [None]:
# Same selection, with stop words removed afterwards.
alice_swl_no_sw = get_95_swl(alice_df, remove_sw=True)

In [None]:
# Size of the stop-word-free list, then its first ten rows.
print(len(alice_swl_no_sw))
alice_swl_no_sw.head(10)

In [None]:
# Percentage of lemma occurrences whose lemma appears in the NGSL list.
alice_ngsl_coverage = compare_ngsl_lemmatized(alice_df)

In [None]:
# Percentage of all lemma occurrences accounted for by the selected word list.
alice_swl_coverage = coverage(alice_swl['count'], alice_df['count'])

### Titanic Evaluation

In [None]:
# Raw (un-lemmatized) pipeline for data/titanic.txt. `titanic_df` is rebound at each step:
# raw text -> token list -> frequency DataFrame.
titanic_df = load_text_file(r"data/titanic.txt")
titanic_df = tokenize_text(titanic_df)
titanic_df = generate_frequncy_df(titanic_df)
# Number of distinct alphabetic word forms, then the ten most frequent ones.
print(len(titanic_df))
titanic_df.head(10)

In [None]:
# Here get_95_swl is applied to the raw (un-lemmatized) frequency table, with stop words
# removed; the result replaces `titanic_df`.
titanic_df = get_95_swl(titanic_df, remove_sw=True)
print(len(titanic_df))
titanic_df.head(10)

In [None]:
# Lemma-frequency table for the Titanic text.
titanic_lemmatized_df = get_lemmatized_swl(r"data/titanic.txt")

In [None]:
# Number of distinct lemmas, then the ten most frequent ones.
print(len(titanic_lemmatized_df))
print(titanic_lemmatized_df.head(10))

In [None]:
# Word list selected by get_95_swl with its defaults (no stop-word or proper-noun removal).
titanic_swl = get_95_swl(titanic_lemmatized_df)

In [None]:
# Size of the selected word list, then its first ten rows.
print(len(titanic_swl))
print(titanic_swl.head(10))

In [None]:
# Same selection, with stop words removed afterwards.
titanic_swl_nosw = get_95_swl(titanic_lemmatized_df, remove_sw=True)

In [None]:
# Size of the stop-word-free list, then its first ten rows.
print(len(titanic_swl_nosw))
print(titanic_swl_nosw.head(10))

In [None]:
# Percentage of lemma occurrences whose lemma appears in the NGSL list.
titanic_ngsl_coverage = compare_ngsl_lemmatized(titanic_lemmatized_df)

In [None]:
# Percentage of all lemma occurrences accounted for by the selected word list.
titanic_swl_coverage = coverage(titanic_swl['count'], titanic_lemmatized_df['count'])

### Lord of the Rings Evaluation

In [None]:
# Lemma-frequency table for the Lord of the Rings chapter.
lotr_lemmatized_df = get_lemmatized_swl(r"data/Lord of the Rings - Chapter One.txt")

In [None]:
# Number of distinct lemmas, then the ten most frequent ones.
print(len(lotr_lemmatized_df))
print(lotr_lemmatized_df.head(10))

In [None]:
# Word list selected by get_95_swl with its defaults (no stop-word or proper-noun removal).
lotr_swl = get_95_swl(lotr_lemmatized_df)

In [None]:
# Size of the selected word list, then its first ten rows.
print(len(lotr_swl))
print(lotr_swl.head(10))

In [None]:
# Same selection, with stop words removed afterwards.
lotr_swl_nosw = get_95_swl(lotr_lemmatized_df, remove_sw=True)

In [None]:
# Size of the stop-word-free list, then its first ten rows.
print(len(lotr_swl_nosw))
print(lotr_swl_nosw.head(10))

In [None]:
# Percentage of lemma occurrences whose lemma appears in the NGSL list.
lotr_compare_ngsl_coverage = compare_ngsl_lemmatized(lotr_lemmatized_df)

In [None]:
# Percentage of all lemma occurrences accounted for by the selected word list.
lotr_swl_coverage = coverage(lotr_swl['count'], lotr_lemmatized_df['count'])