In [73]:
import pandas as pd

In [74]:
from nltk.tokenize import sent_tokenize
import re

# sent_tokenize needs the Punkt sentence models. Make sure they are installed
# before tokenizing instead of relying on the download in a later cell, so the
# notebook also runs top-to-bottom on a fresh environment.
import nltk

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Read the whole corpus into memory as a single string.
with open('elm.txt', 'r', encoding="utf-8") as file:
    text = file.read()

# Remove links from the text (every run of non-whitespace starting with "http").
text = re.sub(r'http\S+', '', text)

# Tokenize text into sentences.
sentences = sent_tokenize(text)

In [75]:
# One row per tokenized sentence, held in a single "sentence" column.
df = pd.DataFrame({"sentence": sentences})

In [76]:
# Trim whitespace from both ends of every sentence.
df = df.assign(sentence=df['sentence'].str.strip())

In [77]:
# Discard missing and empty sentences, then renumber the rows from zero.
df = (
    df.dropna()
    .loc[lambda frame: frame["sentence"] != ""]
    .reset_index(drop=True)
)

In [78]:
# Number of sentences remaining in the frame.
len(df)

80724

In [79]:
# Preview the first rows of the sentence frame.
df.head()

Unnamed: 0,sentence
0,"Effectiveness of treatment, lack of knowledge ..."
1,Patients seem better able to take prescribed m...
2,"For short-term regimens, adherence to medicati..."
3,"Writing out\nadvice to patients, including cha..."
4,Because low functional health literacy is comm...


### Stop words

In [80]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used by this notebook, in a fixed order.
for package in ('stopwords', 'punkt'):
    nltk.download(package)

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rejsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rejsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [81]:
from nltk.tokenize import word_tokenize

def remove_stopwords(sentence):
    """Return ``sentence`` with English stop words removed.

    The sentence is split with ``word_tokenize``; a token is dropped when its
    lowercase form is in the module-level ``stop_words`` set. The surviving
    tokens keep their original casing and are joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(sentence):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Filter the stop words out of every sentence.
df["sentence"] = df["sentence"].map(remove_stopwords)

In [82]:
# Preview the sentences after stop-word removal.
df.head()

Unnamed: 0,sentence
0,"Effectiveness treatment , lack knowledge conse..."
1,Patients seem better able take prescribed medi...
2,"short-term regimens , adherence medications im..."
3,"Writing advice patients , including changes me..."
4,low functional health literacy common ( almost...


In [83]:
import re

def clean_sentence(sentence):
    """Normalize a sentence to lowercase ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (so hyphenated words are fused, e.g. "short-term" becomes "shortterm"),
    each whitespace run is collapsed to one space, and the result is trimmed
    and lowercased.
    """
    substitutions = (
        (r'[^a-zA-Z\s]', ''),  # drop anything that is not a letter or whitespace
        (r'\s+', ' '),         # squeeze whitespace runs into a single space
    )
    result = sentence
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip().lower()

# Normalize every sentence to lowercase letters and single spaces.
df["sentence"] = df["sentence"].map(clean_sentence)

In [84]:
# Preview the sentences after lowercasing and removing non-letter characters.
df.head()

Unnamed: 0,sentence
0,effectiveness treatment lack knowledge consequ...
1,patients seem better able take prescribed medi...
2,shortterm regimens adherence medications impro...
3,writing advice patients including changes medi...
4,low functional health literacy common almost h...


In [85]:
def extract_last_word(df):
    """Split every sentence into its final word and the words before it.

    Adds a ``last_word`` column holding the last whitespace-delimited word of
    each entry in ``df["sentence"]`` and overwrites ``sentence`` with the
    remaining words joined by single spaces. A sentence that is empty or
    contains only whitespace yields an empty string in both columns.

    The frame is modified in place and also returned. Calling the function
    again on the result removes one further word from each sentence.
    """
    # Tokenize once so both columns are derived from the same word list.
    # Splitting separately with split(" ") and split() disagreed whenever a
    # sentence ended in whitespace: last_word became "" and the real final
    # word was silently dropped.
    tokens = df["sentence"].apply(str.split)
    # Extract last word from each sentence and place it in a new column.
    df["last_word"] = tokens.apply(lambda words: words[-1] if words else "")
    # Remove the last word from each sentence.
    df["sentence"] = tokens.apply(lambda words: " ".join(words[:-1]))
    return df

# Move the final word of each sentence into its own "last_word" column.
df = df.pipe(extract_last_word)

In [86]:
# Inspect the first sentence now that its last word has been removed.
df.loc[0, "sentence"]

'effectiveness treatment lack knowledge consequences poor adherence regimen complexity treatment side'

In [87]:
# Show the extracted last words (pandas abbreviates the long Series when displaying it).
df["last_word"]

0             effects
1                home
2        instructions
3             helpful
4           effective
             ...     
80719         zykadia
80720         zymaxid
80721         zyprexa
80722          zytiga
80723     abiraterone
Name: last_word, Length: 80724, dtype: object

In [88]:
# Preview both columns: the shortened sentence and the word removed from its end.
df.head()

Unnamed: 0,sentence,last_word
0,effectiveness treatment lack knowledge consequ...,effects
1,patients seem better able take prescribed medi...,home
2,shortterm regimens adherence medications impro...,instructions
3,writing advice patients including changes medi...,helpful
4,low functional health literacy common almost h...,effective


In [97]:
# Count the distinct last words.
df["last_word"].nunique(dropna=False)

10455

In [98]:
# Frequency of each last word, most common first.
# NOTE(review): the output contains a blank entry, presumably from sentences
# that were empty after cleaning; consider filtering those rows out.
# NOTE(review): the top entry "al" looks like sentences split after "et al." -
# confirm against the tokenizer output.
df["last_word"].value_counts()

last_word
al                2972
                  1412
disease           1210
table              977
patients           768
                  ... 
aciduria             1
alkalemia            1
formaldehyde         1
normochloremic       1
abiraterone          1
Name: count, Length: 10455, dtype: int64