Tasks: <br>
<br>
1- get the full lemmatized word list <br>
2- get the 95% coverage word list <br>
3- remove stop words from 95% swl <br>
4- compare ngsl coverage on full lemmatized word list

- make 95% default and changable

### Requirements

In [3]:
import numpy as np
import pandas as pd
import nltk
import spacy
import stanza
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings(action="ignore")
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

stanza.download('en')
nlp1 = spacy.load("en_core_web_sm")
nlp2 = stanza.Pipeline(lang='en', processors='tokenize,pos')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\semse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\semse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\semse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\semse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\semse\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-04-24 20:31:54 INFO: Downloaded file to C:\Users\semse\stanza_resources\resources.json
2025-04-24 20:31:54 INFO: Downloading default packages for language: en (English) ...
2025-04-24 20:31:55 INFO: File exists: C:\Users\semse\stanza_resources\en\default.zip
2025-04-24 20:31:58 INFO: Finished downloading models and saved to C:\Users\semse\stanza_resources
2025-04-24 20:31:59 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-04-24 20:31:59 INFO: Downloaded file to C:\Users\semse\stanza_resources\resources.json
2025-04-24 20:32:00 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

2025-04-24 20:32:00 INFO: Using device: cpu
2025-04-24 20:32:00 INFO: Loading: tokenize
2025-04-24 20:32:00 INFO: Loading: mwt
2025-04-24 20:32:00 INFO: Loading: pos
2025-04-24 20:32:04 INFO: Done loading processors!


### Main Functions

In [4]:
def load_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        src = file.read()
        src = src.replace("\n", " ")
    return src

In [5]:
def tokenize_text(text):
    tokens = [[j.lower() for j in word_tokenize(i)] for i in sent_tokenize(text)]

    words = []
    for sentence in tokens:
        words.extend(sentence)
    return words

In [6]:
def generate_frequncy_df(words):
    text_counts = pd.DataFrame({'word': words})
    text_counts = text_counts.groupby('word')['word'].count().reset_index(name='count')
    text_counts = text_counts.sort_values(by='count', ascending=False)
    text_counts = text_counts[text_counts['word'].str.isalpha()].reset_index(drop=True)
    return text_counts

In [7]:
def lemmatize(df):
    lemma_freq = {}
    for _, row in df.iterrows():
        word = row['word']
        count = row['count']
        doc = nlp1(word)
        token = doc[0]
        if token.like_num:
            continue
        lemma = token.lemma_
        lemma_freq[lemma] = lemma_freq.get(lemma, 0) + count

    grouped_df = pd.DataFrame(list(lemma_freq.items()), columns=['word', 'count'])
    grouped_df = grouped_df.sort_values(by='count', ascending=False).reset_index(drop=True)

    return grouped_df

In [8]:
def get_swl(df, coverage: float = 0.95):
    if not 0.0 <= coverage <= 1.0:
        raise ValueError("`coverage` must be between 0.0 and 1.0")
    
    df = df.copy()
    df['cumulative_coverage'] = df['count'].cumsum() / df['count'].sum()
    swl = df[df['cumulative_coverage'] <= coverage]
    return swl

In [9]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    mod_text = (
        text[~text['word'].isin(stop_words)]
        .reset_index(drop=True)
    )
    return mod_text

In [10]:
def remove_proper_nouns(df):
    drops = []
    for i, txt in df['word'].items():
        doc = nlp2(txt)
        if any(word.upos == 'PROPN' for sent in doc.sentences for word in sent.words):
            drops.append(i)
    return df.drop(drops).reset_index(drop=True)

In [11]:
def coverage(new_series, original_series):
    new_sum = new_series.sum()
    original_sum = original_series.sum()
    coverage = (new_sum / original_sum) * 100
    return print("Coverage: ", coverage, "%")

### Aggregate Functions

In [12]:
def get_lemmatized_swl(file_path):
    # Load the text file
    text = load_text_file(file_path)

    # Tokenize the text
    tokens = tokenize_text(text)

    # Generate frequency DataFrame
    freq_df = generate_frequncy_df(tokens)

    # Lemmatize the words
    lemmatized_df = lemmatize(freq_df)
    
    return lemmatized_df

In [13]:
def get_95_swl(lemmatized_df, 
               remove_sw: bool = False, 
               remove_pn: bool = False):
    
    # Get the 95% SWL
    swl = get_swl(lemmatized_df)
    
    # Remove stopwords
    if remove_sw:
        swl = remove_stopwords(swl)

    # Remove proper nouns
    if remove_pn:
        swl = remove_proper_nouns(swl)
    
    return swl

In [14]:
def compare_ngsl_lemmatized(lemmatized_df):
    ngsl = pd.read_csv("ngsl-v1.2.csv")
    ngsl.rename(columns={'Adjusted Frequency per Million (U)': 'count', 'Lemma': 'word'}, inplace=True)

    common_df = pd.merge(
    ngsl, lemmatized_df,
    on='word',
    how='inner',
    suffixes=('_ngsl', '_df')
    )

    # df1_filtered = df1[df1['word'].isin(df2['word'])].reset_index(drop=True)
    

    coverage = float((common_df['count_df'].sum() / lemmatized_df['count'].sum())* 100)
    print(f"Coverage of NGSL in lemmatized SWL: {coverage:.2f}%")
    return coverage

### Titanic Evaluation

In [31]:
titanic_df = load_text_file("titanic.txt")
titanic_df = tokenize_text(titanic_df)
titanic_df = generate_frequncy_df(titanic_df)
print(len(titanic_df))
titanic_df.head(10)

5341


Unnamed: 0,word,count
0,the,3013
1,and,1146
2,to,1102
3,a,1049
4,of,840
5,in,665
6,rose,664
7,is,602
8,it,556
9,jack,529


In [33]:
titanic_df = get_95_swl(titanic_df, remove_sw=True)
print(len(titanic_df))
titanic_df.head(10)

3081


Unnamed: 0,word,count,cumulative_coverage
0,rose,664,0.1984
1,jack,529,0.237874
2,cal,227,0.344619
3,cut,200,0.3544
4,water,183,0.37209
5,deck,182,0.376348
6,like,179,0.388937
7,back,163,0.40906
8,boat,140,0.415635
9,ship,127,0.421883


In [15]:
titanic_lemmatized_df = get_lemmatized_swl("titanic.txt")

In [16]:
print(len(titanic_lemmatized_df))
print(titanic_lemmatized_df.head(10))

4050
   word  count
0   the   3013
1   and   1146
2    to   1102
3    be   1099
4     a   1049
5   she    899
6    of    840
7  rise    683
8    in    665
9    he    592


In [17]:
titanic_swl = get_95_swl(titanic_lemmatized_df)

In [18]:
print(len(titanic_swl))
print(titanic_swl.head(10))

2049
   word  count  cumulative_coverage
0   the   3013             0.071261
1   and   1146             0.098366
2    to   1102             0.124429
3    be   1099             0.150422
4     a   1049             0.175232
5   she    899             0.196495
6    of    840             0.216362
7  rise    683             0.232516
8    in    665             0.248244
9    he    592             0.262245


In [19]:
titanic_swl_nosw = get_95_swl(titanic_lemmatized_df, remove_sw=True)

In [20]:
print(len(titanic_swl_nosw))
print(titanic_swl_nosw.head(10))

1946
   word  count  cumulative_coverage
0  rise    683             0.232516
1  jack    529             0.287907
2     I    519             0.300182
3    go    232             0.376954
4   cal    227             0.382323
5   cut    211             0.397720
6   see    205             0.402569
7  deck    192             0.411793
8  look    187             0.420662
9  boat    184             0.425014


In [21]:
titanic_ngsl_coverage = compare_ngsl_lemmatized(titanic_lemmatized_df)

Coverage of NGSL in lemmatized SWL: 79.36%


In [22]:
titanic_swl_coverage = coverage(titanic_swl['count'], titanic_lemmatized_df['count'])

Coverage:  94.9977531278825 %


### Lord of the Rings Evaluation

In [23]:
lotr_lemmatized_df = get_lemmatized_swl("Lord of the Rings - Chapter One.txt")

In [24]:
print(len(lotr_lemmatized_df))
print(lotr_lemmatized_df.head(10))

1394
   word  count
0   the    496
1   and    379
2    be    378
3    of    287
4    he    224
5     I    210
6    to    201
7     a    167
8  have    158
9   you    154


In [25]:
lotr_swl = get_95_swl(lotr_lemmatized_df)

In [26]:
print(len(lotr_swl))
print(lotr_swl.head(10))

944
   word  count  cumulative_coverage
0   the    496             0.055129
1   and    379             0.097255
2    be    378             0.139269
3    of    287             0.171168
4    he    224             0.196065
5     I    210             0.219406
6    to    201             0.241747
7     a    167             0.260309
8  have    158             0.277870
9   you    154             0.294987


In [27]:
lotr_swl_nosw = get_95_swl(lotr_lemmatized_df, remove_sw=True)

In [28]:
print(len(lotr_swl_nosw))
print(lotr_swl_nosw.head(10))

837
      word  count  cumulative_coverage
0        I    210             0.219406
1      say     94             0.368901
2    frodo     92             0.379126
3     come     42             0.445148
4      see     38             0.453929
5  gandalf     36             0.462043
6    think     36             0.466044
7    bilbo     36             0.470046
8     many     34             0.473825
9     look     33             0.481272


In [29]:
lotr_compare_ngsl_coverage = compare_ngsl_lemmatized(lotr_lemmatized_df)

Coverage of NGSL in lemmatized SWL: 83.83%


In [30]:
lotr_swl_coverage = coverage(lotr_swl['count'], lotr_lemmatized_df['count'])

Coverage:  94.99833277759254 %
