Tasks: <br>
<br>
1- get the full lemmatized word list <br>
2- get the 95% coverage word list <br>
3- remove stop words from 95% swl <br>
4- compare ngsl coverage on full lemmatized word list

- make 95% default and changable

### Requirements

In [2]:
import numpy as np
import pandas as pd
import nltk
import spacy
import stanza
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings(action="ignore")
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

stanza.download('en')
nlp1 = spacy.load("en_core_web_sm")
nlp2 = stanza.Pipeline(lang='en', processors='tokenize,pos')


[nltk_data] Downloading package punkt to /Users/soum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/soum/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/soum/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/soum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/soum/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-04-25 18:32:09 INFO: Downloaded file to /Users/soum/stanza_resources/resources.json
2025-04-25 18:32:09 INFO: Downloading default packages for language: en (English) ...
2025-04-25 18:32:11 INFO: File exists: /Users/soum/stanza_resources/en/default.zip
2025-04-25 18:32:14 INFO: Finished downloading models and saved to /Users/soum/stanza_resources
2025-04-25 18:32:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-04-25 18:32:14 INFO: Downloaded file to /Users/soum/stanza_resources/resources.json
2025-04-25 18:32:15 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

2025-04-25 18:32:15 INFO: Using device: cpu
2025-04-25 18:32:15 INFO: Loading: tokenize
2025-04-25 18:32:16 INFO: Loading: mwt
2025-04-25 18:32:16 INFO: Loading: pos
2025-04-25 18:32:17 INFO: Done loading processors!


### Main Functions

In [3]:
def load_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        src = file.read()
        src = src.replace("\n", " ")
    return src

In [4]:
def tokenize_text(text):
    tokens = [[j.lower() for j in word_tokenize(i)] for i in sent_tokenize(text)]

    words = []
    for sentence in tokens:
        words.extend(sentence)
    return words

In [5]:
def generate_frequncy_df(words):
    text_counts = pd.DataFrame({'word': words})
    text_counts = text_counts.groupby('word')['word'].count().reset_index(name='count')
    text_counts = text_counts.sort_values(by='count', ascending=False)
    text_counts = text_counts[text_counts['word'].str.isalpha()].reset_index(drop=True)
    return text_counts

In [6]:
def lemmatize(df):
    lemma_freq = {}
    for _, row in df.iterrows():
        word = row['word']
        count = row['count']
        doc = nlp1(word)
        token = doc[0]
        if token.like_num:
            continue
        lemma = token.lemma_
        lemma_freq[lemma] = lemma_freq.get(lemma, 0) + count

    grouped_df = pd.DataFrame(list(lemma_freq.items()), columns=['word', 'count'])
    grouped_df = grouped_df.sort_values(by='count', ascending=False).reset_index(drop=True)

    return grouped_df

In [7]:
def get_swl(df, coverage: float = 0.95):
    if not 0.0 <= coverage <= 1.0:
        raise ValueError("`coverage` must be between 0.0 and 1.0")
    
    df = df.copy()
    df['cumulative_coverage'] = df['count'].cumsum() / df['count'].sum()
    swl = df[df['cumulative_coverage'] <= coverage]
    return swl

In [8]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    mod_text = (
        text[~text['word'].isin(stop_words)]
        .reset_index(drop=True)
    )
    return mod_text

In [9]:
def remove_proper_nouns(df):
    drops = []
    for i, txt in df['word'].items():
        doc = nlp2(txt)
        if any(word.upos == 'PROPN' for sent in doc.sentences for word in sent.words):
            drops.append(i)
    return df.drop(drops).reset_index(drop=True)

In [10]:
def coverage(new_series, original_series):
    new_sum = new_series.sum()
    original_sum = original_series.sum()
    coverage = (new_sum / original_sum) * 100
    return print("Coverage: ", coverage, "%")

### Aggregate Functions

In [11]:
def get_origianl_swl(file_path):
    original_df = load_text_file(file_path)
    original_df = tokenize_text(original_df)
    original_df = generate_frequncy_df(original_df)

    return original_df

In [12]:
def get_lemmatized_swl(file_path):
    # Load the text file
    text = load_text_file(file_path)

    # Tokenize the text
    tokens = tokenize_text(text)

    # Generate frequency DataFrame
    freq_df = generate_frequncy_df(tokens)

    # Lemmatize the words
    lemmatized_df = lemmatize(freq_df)
    
    return lemmatized_df

In [13]:
def get_95_swl(lemmatized_df, 
               remove_sw: bool = False, 
               remove_pn: bool = False):
    
    # Get the 95% SWL
    swl = get_swl(lemmatized_df)
    
    # Remove stopwords
    if remove_sw:
        swl = remove_stopwords(swl)

    # Remove proper nouns
    if remove_pn:
        swl = remove_proper_nouns(swl)
    
    return swl

In [14]:
def compare_ngsl_lemmatized(lemmatized_df):
    ngsl = pd.read_csv(r"data/ngsl-v1.2.csv")
    ngsl.rename(columns={'Adjusted Frequency per Million (U)': 'count', 'Lemma': 'word'}, inplace=True)

    common_df = pd.merge(
    ngsl, lemmatized_df,
    on='word',
    how='inner',
    suffixes=('_ngsl', '_df')
    )

    coverage = float((common_df['count_df'].sum() / lemmatized_df['count'].sum())* 100)
    print(f"Coverage of NGSL in lemmatized SWL: {coverage:.2f}%")
    return coverage

In [15]:
def compare_nawl_lemmatized(lemmatized_df):
    nawl = pd.read_csv(r"data/NAWL-1.0.csv")
    nawl.rename(columns={'Word': 'word', 'U': 'count'}, inplace=True)

    common_df = pd.merge(
    nawl, lemmatized_df,
    on='word',
    how='inner',
    suffixes=('_nawl', '_df')
    )

    coverage = float((common_df['count_df'].sum() / lemmatized_df['count'].sum())* 100)
    print(f"Coverage of NAWL in lemmatized SWL: {coverage:.2f}%")
    return coverage

### Text Data Management and Analysis

In [16]:
zhai_df = load_text_file(r"data/zhai.txt")

zhai_tokens = tokenize_text(zhai_df)

zhai_tokens = generate_frequncy_df(zhai_tokens)

len(zhai_tokens)

6330

In [17]:
zhai_df = get_lemmatized_swl(r"data/zhai.txt")

In [18]:
print(len(zhai_df))
zhai_df.head(10)

4621


Unnamed: 0,word,count
0,the,10526
1,be,6202
2,of,4873
3,a,4836
4,to,4643
5,in,4059
6,we,3695
7,and,3020
8,that,2007
9,this,1827


In [19]:
zhai_swl = get_95_swl(zhai_df)

In [20]:
print(len(zhai_swl))
zhai_swl.head(10)

1423


Unnamed: 0,word,count,cumulative_coverage
0,the,10526,0.060444
1,be,6202,0.096058
2,of,4873,0.124041
3,a,4836,0.151811
4,to,4643,0.178473
5,in,4059,0.201781
6,we,3695,0.222999
7,and,3020,0.240341
8,that,2007,0.251866
9,this,1827,0.262358


In [21]:
zhai_swl_no_sw = get_95_swl(zhai_df, remove_sw=True)

In [22]:
print(len(zhai_swl_no_sw))
zhai_swl_no_sw.head(10)

1322


Unnamed: 0,word,count,cumulative_coverage
0,word,1629,0.281727
1,text,1621,0.291035
2,document,1388,0.316221
3,model,1289,0.331232
4,datum,1179,0.338002
5,topic,1045,0.344003
6,p,1017,0.355769
7,use,1013,0.361586
8,would,740,0.381007
9,probability,695,0.384997


In [23]:
zhai_nawl_coverage = compare_ngsl_lemmatized(zhai_df)

Coverage of NGSL in lemmatized SWL: 80.13%


In [24]:
zhai_swl_coverage = coverage(zhai_swl['count'], zhai_df['count'])

Coverage:  94.99781789783168 %


### Alice in Wonderland

In [25]:
alice_df = load_text_file(r"data/alice.txt")

alice_tokens = tokenize_text(alice_df)

alice_tokens = generate_frequncy_df(alice_tokens)

len(alice_tokens)

1432

In [26]:
alice_df = get_lemmatized_swl(r"data/alice.txt")

In [27]:
print(len(alice_df))
alice_df.head(10)

1131


Unnamed: 0,word,count
0,the,634
1,she,350
2,and,337
3,be,309
4,a,277
5,to,249
6,I,201
7,of,200
8,it,186
9,alice,170


In [28]:
alice_swl = get_95_swl(alice_df)

In [29]:
print(len(alice_swl))
alice_swl.head(10)

664


Unnamed: 0,word,count,cumulative_coverage
0,the,634,0.066892
1,she,350,0.103819
2,and,337,0.139375
3,be,309,0.171977
4,a,277,0.201203
5,to,249,0.227474
6,I,201,0.248681
7,of,200,0.269783
8,it,186,0.289407
9,alice,170,0.307343


In [30]:
alice_swl_no_sw = get_95_swl(alice_df, remove_sw=True)

In [31]:
print(len(alice_swl_no_sw))
alice_swl_no_sw.head(10)

569


Unnamed: 0,word,count,cumulative_coverage
0,I,201,0.248681
1,alice,170,0.307343
2,say,160,0.341422
3,go,66,0.421186
4,little,58,0.440494
5,come,45,0.471407
6,get,43,0.475944
7,think,40,0.488711
8,find,37,0.512872
9,like,36,0.51667


In [32]:
alice_ngsl_coverage = compare_ngsl_lemmatized(alice_df)

Coverage of NGSL in lemmatized SWL: 88.75%


In [33]:
alice_swl_coverage = coverage(alice_swl['count'], alice_df['count'])

Coverage:  94.98839417598649 %


### Titanic Evaluation

In [34]:
titanic_df = load_text_file(r"data/titanic.txt")
titanic_df = tokenize_text(titanic_df)
titanic_df = generate_frequncy_df(titanic_df)
print(len(titanic_df))
titanic_df.head(10)

5341


Unnamed: 0,word,count
0,the,3013
1,and,1146
2,to,1102
3,a,1049
4,of,840
5,in,665
6,rose,664
7,is,602
8,it,556
9,jack,529


In [35]:
titanic_df = get_95_swl(titanic_df, remove_sw=True)
print(len(titanic_df))
titanic_df.head(10)

3080


Unnamed: 0,word,count,cumulative_coverage
0,rose,664,0.1984
1,jack,529,0.237874
2,cal,227,0.344619
3,cut,200,0.3544
4,water,183,0.37209
5,deck,182,0.376348
6,like,179,0.384749
7,back,163,0.40906
8,boat,140,0.418911
9,ship,127,0.421883


In [36]:
titanic_lemmatized_df = get_lemmatized_swl(r"data/titanic.txt")

In [37]:
print(len(titanic_lemmatized_df))
print(titanic_lemmatized_df.head(10))

4050
   word  count
0   the   3013
1   and   1146
2    to   1102
3    be   1099
4     a   1049
5   she    899
6    of    840
7  rise    683
8    in    665
9    he    592


In [38]:
titanic_swl = get_95_swl(titanic_lemmatized_df)

In [39]:
print(len(titanic_swl))
print(titanic_swl.head(10))

2049
   word  count  cumulative_coverage
0   the   3013             0.071261
1   and   1146             0.098366
2    to   1102             0.124429
3    be   1099             0.150422
4     a   1049             0.175232
5   she    899             0.196495
6    of    840             0.216362
7  rise    683             0.232516
8    in    665             0.248244
9    he    592             0.262245


In [40]:
titanic_swl_nosw = get_95_swl(titanic_lemmatized_df, remove_sw=True)

In [41]:
print(len(titanic_swl_nosw))
print(titanic_swl_nosw.head(10))

1948
   word  count  cumulative_coverage
0  rise    683             0.232516
1  jack    529             0.287907
2     I    519             0.300182
3    go    232             0.376954
4   cal    227             0.382323
5   cut    211             0.397720
6   see    205             0.402569
7  deck    192             0.411793
8  look    187             0.420662
9  boat    184             0.425014


In [42]:
titanic_ngsl_coverage = compare_ngsl_lemmatized(titanic_lemmatized_df)

Coverage of NGSL in lemmatized SWL: 79.36%


In [43]:
titanic_swl_coverage = coverage(titanic_swl['count'], titanic_lemmatized_df['count'])

Coverage:  94.9977531278825 %


### Lord of the Rings Evaluation

In [44]:
lotr_lemmatized_df = get_lemmatized_swl(r"data/Lord of the Rings - Chapter One.txt")

In [45]:
print(len(lotr_lemmatized_df))
print(lotr_lemmatized_df.head(10))

1394
   word  count
0   the    496
1   and    379
2    be    378
3    of    287
4    he    224
5     I    210
6    to    201
7     a    167
8  have    158
9   you    154


In [46]:
lotr_swl = get_95_swl(lotr_lemmatized_df)

In [47]:
print(len(lotr_swl))
print(lotr_swl.head(10))

944
   word  count  cumulative_coverage
0   the    496             0.055129
1   and    379             0.097255
2    be    378             0.139269
3    of    287             0.171168
4    he    224             0.196065
5     I    210             0.219406
6    to    201             0.241747
7     a    167             0.260309
8  have    158             0.277870
9   you    154             0.294987


In [48]:
lotr_swl_nosw = get_95_swl(lotr_lemmatized_df, remove_sw=True)

In [49]:
print(len(lotr_swl_nosw))
print(lotr_swl_nosw.head(10))

835
      word  count  cumulative_coverage
0        I    210             0.219406
1      say     94             0.368901
2    frodo     92             0.379126
3     come     42             0.445148
4      see     38             0.453929
5  gandalf     36             0.462043
6    bilbo     36             0.466044
7    think     36             0.470046
8     many     34             0.477604
9     look     33             0.481272


In [50]:
lotr_compare_ngsl_coverage = compare_ngsl_lemmatized(lotr_lemmatized_df)

Coverage of NGSL in lemmatized SWL: 83.83%


In [51]:
lotr_swl_coverage = coverage(lotr_swl['count'], lotr_lemmatized_df['count'])

Coverage:  94.99833277759254 %
