In [17]:
## load text
enc='utf-8'
with open("African_helmeted_turtle.txt","r",encoding=enc) as f:
    book=f.read()
    print(book)

The African helmeted turtle #$%!!! (Pelomedusa subrufa) is a species of side-necked terrapin in the family Pelomedusidae. The species naturally occurs in fresh and stagnant water bodies throughout much of sub-Saharan Africa, and in southern Yemen. It is omnivorous, with its diet consisting mainly of aquatic invertebrates, small fish, and vegetation. It is typically a small turtle, with most individuals being less than 20 centimetres (7.9 inches) in straight carapace length. The female lays two to ten eggs on average, normally during late spring and early summer. The eggs are placed in a flask-shaped nest about 4 to 7 inches (10 to 18 centimetres) deep and hatch in 75 to 90 days. This African helmeted turtle was photographed in Phinda Private Game Reserve, South Africa.


In [18]:
!pip install nltk



In [19]:
import re
import nltk

## TEXT PREPROCESSING

### Clean text

In [20]:
def clean_text(text):
    pattern=r'[^a-zA-Z0-9\s]'
    cleaned_text=re.sub(pattern,'',text)
    return cleaned_text

### Tokenize text

In [21]:
def nltk_tokenizer(text):
    from nltk.tokenize import sent_tokenize, word_tokenize
    nltk.download('punkt_tab')

    #token=sent_tokenize(text)
    token=word_tokenize(text)
    return token

### Remove Stopwords

In [22]:
def nltk_remove_stopwords(text):
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    en=stopwords.words("english")
    stopword_filtered_token=[word for word in text if word.casefold() not in en]
    return stopword_filtered_token

### Stem the filtered list

In [26]:
#  use PorterStemmer
def nltk_text_stem(token):
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    stemmed_token=[stemmer.stem(word) for word in token]
    #print(stemmed_token)
    return stemmed_token

### Lemmatize the text

In [29]:
def nltk_text_lemmatize(token):
    from nltk.stem import WordNetLemmatizer
    nltk.download('wordnet')
    return [WordNetLemmatizer().lemmatize(word) for word in token]

### Parts of speech tagging

In [33]:
def nltk_pos_tag(token):
    nltk.download('averaged_perceptron_tagger_eng')
    pos_tag=nltk.pos_tag(token)
    return pos_tag

### Named Entity Recognition

In [34]:
def nltk_extract_ne(token):
    nltk.download('maxent_ne_chunker_tab')
    tree = nltk.ne_chunk(token,binary=True)
    return set(
        " ".join(i[0] for i in t)
        for t in tree
        if hasattr(t, "label") and t.label() == "NE"
    )

### Code

In [35]:
cleaned_text=clean_text(book)
print("CLEANED TEXT:",cleaned_text)
print("----------")
token=nltk_tokenizer(cleaned_text)
print("TOKEN:",token)
print("----------")
stopword_filtered_list=nltk_remove_stopwords(token)
print("STOPWORD FILTERED LIST:",stopword_filtered_list)
print("----------")
stemmed_list=nltk_text_stem(stopword_filtered_list)
print("STEMMED LIST:",stemmed_list)
print("----------")
lemmatized_list=nltk_text_lemmatize(stopword_filtered_list)
print("LEMMATISED LIST:",lemmatized_list)
print("----------")
pos_tagged=nltk_pos_tag(lemmatized_list)
print("POS TAG:",pos_tagged)
print("----------")
ner_tagged=nltk_extract_ne(pos_tagged)
print("NER TAGGED:",ner_tagged)

CLEANED TEXT: The African helmeted turtle  Pelomedusa subrufa is a species of sidenecked terrapin in the family Pelomedusidae The species naturally occurs in fresh and stagnant water bodies throughout much of subSaharan Africa and in southern Yemen It is omnivorous with its diet consisting mainly of aquatic invertebrates small fish and vegetation It is typically a small turtle with most individuals being less than 20 centimetres 79 inches in straight carapace length The female lays two to ten eggs on average normally during late spring and early summer The eggs are placed in a flaskshaped nest about 4 to 7 inches 10 to 18 centimetres deep and hatch in 75 to 90 days This African helmeted turtle was photographed in Phinda Private Game Reserve South Africa
----------
TOKEN: ['The', 'African', 'helmeted', 'turtle', 'Pelomedusa', 'subrufa', 'is', 'a', 'species', 'of', 'sidenecked', 'terrapin', 'in', 'the', 'family', 'Pelomedusidae', 'The', 'species', 'naturally', 'occurs', 'in', 'fresh', 'a

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rajes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rajes\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\rajes\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\rajes\AppData\Roaming\nltk_data...


POS TAG: [('African', 'JJ'), ('helmeted', 'VBD'), ('turtle', 'JJ'), ('Pelomedusa', 'NNP'), ('subrufa', 'NN'), ('specie', 'NN'), ('sidenecked', 'VBD'), ('terrapin', 'JJ'), ('family', 'NN'), ('Pelomedusidae', 'NNP'), ('specie', 'NN'), ('naturally', 'RB'), ('occurs', 'VBZ'), ('fresh', 'JJ'), ('stagnant', 'JJ'), ('water', 'NN'), ('body', 'NN'), ('throughout', 'IN'), ('much', 'JJ'), ('subSaharan', 'NN'), ('Africa', 'NNP'), ('southern', 'JJ'), ('Yemen', 'NNP'), ('omnivorous', 'JJ'), ('diet', 'JJ'), ('consisting', 'VBG'), ('mainly', 'RB'), ('aquatic', 'JJ'), ('invertebrate', 'NN'), ('small', 'JJ'), ('fish', 'JJ'), ('vegetation', 'NN'), ('typically', 'RB'), ('small', 'JJ'), ('turtle', 'JJ'), ('individual', 'JJ'), ('less', 'JJR'), ('20', 'CD'), ('centimetre', 'NN'), ('79', 'CD'), ('inch', 'NN'), ('straight', 'RB'), ('carapace', 'NN'), ('length', 'JJ'), ('female', 'JJ'), ('lay', 'NN'), ('two', 'CD'), ('ten', 'JJ'), ('egg', 'NN'), ('average', 'JJ'), ('normally', 'RB'), ('late', 'JJ'), ('spring', 

[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!


NER TAGGED: {'Phinda Private Game Reserve South Africa', 'African', 'subSaharan Africa'}


In [16]:
t=token[0:7]
print(t)

['The', 'African', 'helmeted', 'turtle', 'Pelomedusa', 'subrufa', 'is']
