**Tokenization**

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# Fetch the Punkt sentence-tokenizer data used by sent_tokenize/word_tokenize.
# Newer NLTK releases look up the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so request both: the notebook then survives a fresh
# "Restart & Run All" on old and new NLTK versions alike. nltk.download()
# reports a missing/up-to-date package instead of raising, so the extra call is harmless.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Sample English passage tokenized by the following cells.
text = ("Earth and the Moon are part of the universe, as are the other planets and their many dozens of moons. "
        "Along with asteroids and comets, the planets orbit the Sun. The Sun is one among hundreds of billions "
        "of stars in the Milky Way galaxy, and most of those stars have their own planets, known as exoplanets.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Split the passage into sentences and show the resulting list.
sentences = sent_tokenize(text)
print(sentences)

['Earth and the Moon are part of the universe, as are the other planets and their many dozens of moons.', 'Along with asteroids and comets, the planets orbit the Sun.', 'The Sun is one among hundreds of billions of stars in the Milky Way galaxy, and most of those stars have their own planets, known as exoplanets.']


In [None]:
# Break the passage into word and punctuation tokens and show them.
word_tokens = word_tokenize(text)
print(word_tokens)

['Earth', 'and', 'the', 'Moon', 'are', 'part', 'of', 'the', 'universe', ',', 'as', 'are', 'the', 'other', 'planets', 'and', 'their', 'many', 'dozens', 'of', 'moons', '.', 'Along', 'with', 'asteroids', 'and', 'comets', ',', 'the', 'planets', 'orbit', 'the', 'Sun', '.', 'The', 'Sun', 'is', 'one', 'among', 'hundreds', 'of', 'billions', 'of', 'stars', 'in', 'the', 'Milky', 'Way', 'galaxy', ',', 'and', 'most', 'of', 'those', 'stars', 'have', 'their', 'own', 'planets', ',', 'known', 'as', 'exoplanets', '.']


**Filtration**

In [None]:
import nltk
from nltk.corpus import stopwords
import re

# Make sure the NLTK stopword corpus is available locally
nltk.download('stopwords')

# English stopwords held in a set for fast membership checks
english_stopwords = set(stopwords.words('english'))

# Passage to filter
text = ("Earth and the Moon are part of the universe, as are the other planets and their many dozens of moons. "
        "Along with asteroids and comets, the planets orbit the Sun. The Sun is one among hundreds of billions "
        "of stars in the Milky Way galaxy, and most of those stars have their own planets, known as exoplanets.")

# Lowercase first so the comparison against the (lowercase) stopword set is uniform,
# then pull out every run of word characters; punctuation is dropped by the regex.
tokens = re.findall(r'\b\w+\b', text.lower())

# Mask each stopword with the placeholder 'W'; every other token passes through unchanged
replaced_tokens = []
for token in tokens:
    replaced_tokens.append('W' if token in english_stopwords else token)

# Rebuild a single space-separated string and display it
replaced_text = ' '.join(replaced_tokens)
print("Replaced Text:", replaced_text)


Replaced Text: earth W W moon W part W W universe W W W W planets W W many dozens W moons along W asteroids W comets W planets orbit W sun W sun W one among hundreds W billions W stars W W milky way galaxy W W W W stars W W W planets known W exoplanets


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Stemming**

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Porter stemmer instance reused for every token
stemmer = PorterStemmer()

# Passage to stem
text = ("Earth and the Moon are part of the universe, as are the other planets and their many dozens of moons. "
        "Along with asteroids and comets, the planets orbit the Sun. The Sun is one among hundreds of billions "
        "of stars in the Milky Way galaxy, and most of those stars have their own planets, known as exoplanets.")

# Tokenize, then stem token by token (punctuation tokens go through the stemmer too)
words = word_tokenize(text)
stemmed_words = list(map(stemmer.stem, words))

# Re-assemble the stems into one space-separated string
stemmed_text = ' '.join(stemmed_words)

# Show the passage before and after stemming
print("Original Text:\n", text)
print("\nStemmed Text:\n", stemmed_text)

Original Text:
 Earth and the Moon are part of the universe, as are the other planets and their many dozens of moons. Along with asteroids and comets, the planets orbit the Sun. The Sun is one among hundreds of billions of stars in the Milky Way galaxy, and most of those stars have their own planets, known as exoplanets.

Stemmed Text:
 earth and the moon are part of the univers , as are the other planet and their mani dozen of moon . along with asteroid and comet , the planet orbit the sun . the sun is one among hundr of billion of star in the milki way galaxi , and most of those star have their own planet , known as exoplanet .


**Stopword Removal**

In [None]:
import nltk
from nltk.corpus import stopwords
import re

# Ensure the stopword corpus has been fetched
nltk.download('stopwords')

# Stopword lookup set for English
english_stopwords = set(stopwords.words('english'))

# Input passage
text = ("Earth and the Moon are part of the universe, as are the other planets and their many dozens of moons. "
        "Along with asteroids and comets, the planets orbit the Sun. The Sun is one among hundreds of billions "
        "of stars in the Milky Way galaxy, and most of those stars have their own planets, known as exoplanets.")

# Word tokens of the lowercased passage (runs of word characters only)
word_pattern = re.compile(r'\b\w+\b')
tokens = word_pattern.findall(text.lower())

# Keep non-stopwords as they are; substitute the marker 'W' for each stopword
replaced_tokens = [token if token not in english_stopwords else 'W' for token in tokens]

# Space-join the result and print it
replaced_text = ' '.join(replaced_tokens)

print("Replaced Text:", replaced_text)


Replaced Text: earth W W moon W part W W universe W W W W planets W W many dozens W moons along W asteroids W comets W planets orbit W sun W sun W one among hundreds W billions W stars W W milky way galaxy W W W W stars W W W planets known W exoplanets


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!pip install stanza




**Tokenization**

In [None]:
import stanza

# Fetch the Hindi models for Stanza
stanza.download('hi')

# Build a Hindi pipeline that runs only the tokenizer
nlp = stanza.Pipeline(lang='hi', processors='tokenize')

# Short Hindi passage to tokenize
hindi_text_short = "धरती और चाँद ब्रह्मांड का हिस्सा हैं। ऐस्टेरॉइड्स और धूमकेतु भी हैं। सूर्य मिल्की वे में कई तारों में से एक है।"

# Run the pipeline over the passage
doc = nlp(hindi_text_short)

# Flatten the per-sentence word lists into one list of token strings
tokens = []
for sentence in doc.sentences:
    tokens.extend(word.text for word in sentence.words)

# Display the tokens
print("Tokens:", tokens)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...
INFO:stanza:File exists: /root/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package |
-----------------------
| tokenize  | hdtb    |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Done loading processors!


Tokens: ['धरती', 'और', 'चाँद', 'ब्रह्मांड', 'का', 'हिस्सा', 'हैं', '।', 'ऐस्टेरॉइड्स', 'और', 'धूमकेतु', 'भी', 'हैं', '।', 'सूर्य', 'मिल्की', 'वे', 'में', 'कई', 'तारों', 'में', 'से', 'एक', 'है', '।']


In [None]:
!pip install stanza nltk snowballstemmer



**Stemming**

In [None]:
import stanza
from nltk.stem import SnowballStemmer
import nltk

# The Hindi Snowball algorithm is provided by the standalone `snowballstemmer`
# package (installed by the pip cell above); NLTK's SnowballStemmer wrapper does
# not offer a Hindi stemmer.
import snowballstemmer

# Download the Hindi models and build a tokenizer-only Stanza pipeline
stanza.download('hi')
nlp = stanza.Pipeline(lang='hi', processors='tokenize')

# Define Hindi text
hindi_text = "धरती और चाँद ब्रह्मांड का हिस्सा हैं। ऐस्टेरॉइड्स और धूमकेतु भी हैं। सूर्य मिल्की वे में कई तारों में से एक है।"

# Process the text
doc = nlp(hindi_text)

# Extract tokens
tokens = [word.text for sentence in doc.sentences for word in sentence.words]

# Initialize a Hindi stemmer. The previous version built an *English* Snowball
# stemmer, which returned every Devanagari token unchanged, so no stemming
# actually happened.
stemmer = snowballstemmer.stemmer('hindi')

# Apply stemming to the whole token list (stemWords returns a list of stems
# in the same order as the input)
stems = stemmer.stemWords(tokens)

# Print tokens and their stems
print("Original Tokens:", tokens)
print("Stemmed Tokens:", stems)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: hi (Hindi) ...
INFO:stanza:File exists: /root/stanza_resources/hi/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package |
-----------------------
| tokenize  | hdtb    |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Done loading processors!


Original Tokens: ['धरती', 'और', 'चाँद', 'ब्रह्मांड', 'का', 'हिस्सा', 'हैं', '।', 'ऐस्टेरॉइड्स', 'और', 'धूमकेतु', 'भी', 'हैं', '।', 'सूर्य', 'मिल्की', 'वे', 'में', 'कई', 'तारों', 'में', 'से', 'एक', 'है', '।']
Stemmed Tokens: ['धरती', 'और', 'चाँद', 'ब्रह्मांड', 'का', 'हिस्सा', 'हैं', '।', 'ऐस्टेरॉइड्स', 'और', 'धूमकेतु', 'भी', 'हैं', '।', 'सूर्य', 'मिल्की', 'वे', 'में', 'कई', 'तारों', 'में', 'से', 'एक', 'है', '।']


**Filtration**

In [None]:
import stanza
from nltk.corpus import stopwords
from collections import Counter
import nltk


import unicodedata  # stdlib; used to recognise punctuation-only tokens such as the danda '।'

# Hand-maintained Hindi stopword list. The NLTK stopwords corpus is never read in
# this cell, so the nltk.download('stopwords') call that used to sit here was
# dropped to avoid a pointless network round-trip.
hindi_stopwords = set([
    'और', 'का', 'है', 'हैं', 'के', 'भी', 'यह', 'को', 'से', 'हम', 'आप', 'पर', 'आ', 'में', 'कर', 'कि', 'उन', 'स', 'न', 'उस', 'तीन', 'मैं'
])

# Initialize the Stanza pipeline for Hindi. Only token text is used below, so the
# tokenizer alone is loaded (the POS and lemma models are not needed).
nlp = stanza.Pipeline('hi', processors='tokenize')

# Your Hindi text
text = "धरती और चाँद ब्रह्मांड का हिस्सा हैं। ऐस्टेरॉइड्स और धूमकेतु भी हैं। सूर्य मिल्की वे में कई तारों में से एक है।"

# Process the text
doc = nlp(text)


def _is_punctuation(token):
    """Return True when every character of `token` is Unicode punctuation (category P*)."""
    return bool(token) and all(unicodedata.category(ch).startswith('P') for ch in token)


# Extract tokens, then drop stopwords and punctuation-only tokens. Without the
# punctuation check the sentence terminator '।' survives filtering and ends up
# as the most frequent "word" in the counts.
tokens = [word.text for sent in doc.sentences for word in sent.words]
filtered_tokens = [
    word for word in tokens
    if word not in hindi_stopwords and not _is_punctuation(word)
]

# Count word frequencies
word_counts = Counter(filtered_tokens)

print("Filtered Tokens:", filtered_tokens)
print("Word Counts:", word_counts)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


Filtered Tokens: ['धरती', 'चाँद', 'ब्रह्मांड', 'हिस्सा', '।', 'ऐस्टेरॉइड्स', 'धूमकेतु', '।', 'सूर्य', 'मिल्की', 'वे', 'कई', 'तारों', 'एक', '।']
Word Counts: Counter({'।': 3, 'धरती': 1, 'चाँद': 1, 'ब्रह्मांड': 1, 'हिस्सा': 1, 'ऐस्टेरॉइड्स': 1, 'धूमकेतु': 1, 'सूर्य': 1, 'मिल्की': 1, 'वे': 1, 'कई': 1, 'तारों': 1, 'एक': 1})


**Stopword Removal**

In [None]:
import stanza
from nltk.corpus import stopwords
import nltk

# Hindi stopwords are defined manually below (swap in a more comprehensive list
# if available). The NLTK stopwords corpus is never read in this cell, so the
# nltk.download('stopwords') call that used to precede this list was dropped.
hindi_stopwords = set([
    'और', 'का', 'है', 'हैं', 'के', 'भी', 'यह', 'को', 'से', 'हम', 'आप', 'पर', 'आ', 'में', 'कर', 'कि', 'उन', 'स', 'न', 'उस', 'तीन', 'मैं'
])

# Initialize the Stanza pipeline for Hindi. Only token text is used below, so the
# tokenizer alone is loaded (the POS and lemma models are not needed).
nlp = stanza.Pipeline('hi', processors='tokenize')

# Your Hindi text
text = "धरती और चाँद ब्रह्मांड का हिस्सा हैं। ऐस्टेरॉइड्स और धूमकेतु भी हैं। सूर्य मिल्की वे में कई तारों में से एक है।"

# Process the text
doc = nlp(text)

# Walk every token of every sentence, substituting the placeholder 'W' for each
# stopword, and join the result into one space-separated string
replaced_text = ' '.join(
    'W' if word.text in hindi_stopwords else word.text
    for sent in doc.sentences
    for word in sent.words
)

print("Replaced Text:", replaced_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: hi (Hindi):
| Processor | Package       |
-----------------------------
| tokenize  | hdtb          |
| pos       | hdtb_charlm   |
| lemma     | hdtb_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


Replaced Text: धरती W चाँद ब्रह्मांड W हिस्सा W । ऐस्टेरॉइड्स W धूमकेतु W W । सूर्य मिल्की वे W कई तारों W W एक W ।
