<a href="https://colab.research.google.com/github/Swayamg21/DAV_Experiments_17/blob/main/DAV_EXP_7_17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Aim : Perform the steps involved in Text Analytics in Python & R**

# **Text Analytics in Python**

**1. Tokenization (Sentence & Word)**


In [None]:
import nltk

# Sentence/word tokenizers need the Punkt model. NLTK >= 3.8.2 loads it from the
# 'punkt_tab' resource, older releases from 'punkt'; requesting both keeps this
# cell runnable on a fresh kernel regardless of the installed NLTK version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

text = "This is a sample sentence. Tokenization is important for NLP."

# Split the raw text into sentences, then into word/punctuation tokens.
sentences = nltk.sent_tokenize(text)
words = nltk.word_tokenize(text)

print("Sentences:", sentences)
print("Words:", words)


Sentences: ['This is a sample sentence.', 'Tokenization is important for NLP.']
Words: ['This', 'is', 'a', 'sample', 'sentence', '.', 'Tokenization', 'is', 'important', 'for', 'NLP', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**2. Frequency Distribution**

In [None]:
from nltk.probability import FreqDist

# Count how many times each token from the tokenization step occurs.
freq_dist = FreqDist(words)

# Report the five highest-count tokens, one "token: count" pair per line.
top_tokens = freq_dist.most_common(5)
for token, count in top_tokens:
    print(f"{token}: {count}")


is: 2
.: 2
This: 1
a: 1
sample: 1


In [None]:
from nltk.probability import FreqDist

# Build the distribution and print its summary form
# (number of distinct samples and total outcomes).
word_freq = FreqDist(words)
print(f"Word frequency: {word_freq}")


Word frequency: <FreqDist with 10 samples and 12 outcomes>


**3. Remove stopwords & punctuations**

In [None]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# 'punkt_tab' is the tokenizer resource name used by NLTK >= 3.8.2; older
# releases use 'punkt'. Requesting both works with either version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

text = "This is a sample sentence. Tokenization is important for NLP."

# Tokenize the text
tokens = word_tokenize(text)

# Remove stopwords (compared in lower case so "This" matches "this")
stop_words = set(stopwords.words('english'))
content_tokens = [token for token in tokens if token.lower() not in stop_words]

# Remove punctuation: drop tokens made up entirely of punctuation characters.
# A set of characters is used instead of a substring test against one long
# string, so multi-character tokens such as "..." or "``" are handled and no
# escape sequences are needed in the literal.
punctuations = set(string.punctuation)
words = [
    token for token in content_tokens
    if not all(char in punctuations for char in token)
]

print(words)


['sample', 'sentence', 'Tokenization', 'important', 'NLP']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**4. Lexicon Normalization (Stemming, Lemmatization)**

In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

# WordNetLemmatizer reads the WordNet corpus, which is not bundled with NLTK.
# Without this download the first lemmatize() call raises LookupError on a
# fresh kernel. 'omw-1.4' is additionally required by some NLTK releases.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

# Stemming: rule-based suffix stripping (result may not be a dictionary word).
ps = PorterStemmer()
stemmed_words = [ps.stem(word) for word in words]

# Lemmatization: dictionary lookup; with no POS given, words are treated as nouns.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("Stemmed words:", stemmed_words)
print("Lemmatized words:", lemmatized_words)


Stemmed words: ['thi', 'is', 'a', 'sampl', 'sentenc', '.', 'token', 'is', 'import', 'for', 'nlp', '.']
Lemmatized words: ['This', 'is', 'a', 'sample', 'sentence', '.', 'Tokenization', 'is', 'important', 'for', 'NLP', '.']


**5. Part of Speech tagging**

In [None]:
# pos_tag needs the averaged-perceptron tagger model. Download it here rather
# than relying on a later cell, so the notebook survives Restart & Run All.
# NLTK >= 3.9 uses the '_eng' resource name; older releases use the plain one.
for resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource, quiet=True)

# Tag each token in `words` with its part-of-speech label.
tagged_words = nltk.pos_tag(words)
print("Part Of Speech Tagging:", tagged_words)


Part Of Speech Tagging: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('sentence', 'NN'), ('.', '.')]


**6. Named Entity Recognition**

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# ne_chunk needs the POS tagger model, the NE chunker model and the 'words'
# corpus. The '_eng' / '_tab' names are the resource ids used by NLTK >= 3.9;
# the plain names are used by older releases, so both variants are requested.
for resource in (
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
):
    nltk.download(resource, quiet=True)

text = "Barack Obama was born in Honolulu, Hawaii."

# Tokenize and POS-tag first: the chunker works on (token, tag) pairs.
words = word_tokenize(text)
tagged_words = nltk.pos_tag(words)

# Group tagged tokens into a tree whose subtrees are labelled entities.
ne_tree = nltk.ne_chunk(tagged_words)
print(ne_tree)


(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Honolulu/NNP)
  ,/,
  (GPE Hawaii/NNP)
  ./.)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


**7. Scrape data from a website**

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://example.com'

# A timeout keeps the cell from hanging indefinitely on an unresponsive host,
# and raise_for_status() stops us from parsing an HTTP error page as content.
response = requests.get(url, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# Drop <script>/<style> elements so their source code does not end up in the
# extracted text.
for tag in soup(['script', 'style']):
    tag.decompose()

text_data = soup.get_text()
print("Text data from website:", text_data)


Text data from website: 


Example Domain







Example Domain
This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.
More information...






# **Text Analytics in R**

**1. Tokenization (Sentence & Word)**


In [None]:
# Tokenization (Sentence & Word) using base R only
text <- "This is a sample sentence. Tokenization is important for NLP."

# Sentences: split on the literal period (the period itself is dropped and
# the space that followed it stays at the start of the next piece).
sentences <- unlist(strsplit(text, ".", fixed = TRUE))

# Words: split on runs of whitespace (punctuation stays attached to the word).
words <- strsplit(text, "\\s+")[[1]]

print("Sentences:")
print(sentences)
print("Words:")
print(words)


[1] "Sentences:"
[1] "This is a sample sentence"          " Tokenization is important for NLP"
[1] "Words:"
 [1] "This"         "is"           "a"            "sample"       "sentence."   
 [6] "Tokenization" "is"           "important"    "for"          "NLP."        


In [None]:
# Install tokenizers only when it is missing, so re-running the cell does not
# reinstall the package (and hit the network) every time.
if (!requireNamespace("tokenizers", quietly = TRUE)) {
  install.packages("tokenizers")
}
library(tokenizers)

text <- "This is a sample sentence. Tokenization is important for NLP."

# Both tokenizers return a list with one element per input document; each
# element is a character vector of tokens.
sentences <- tokenize_sentences(text)
words <- tokenize_words(text)

print("Sentences:")
print(sentences)
print("Words:")
print(words)


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘Rcpp’, ‘SnowballC’




[1] "Sentences:"
[[1]]
[1] "This is a sample sentence."         "Tokenization is important for NLP."

[1] "Words:"
[[1]]
 [1] "this"         "is"           "a"            "sample"       "sentence"    
 [6] "tokenization" "is"           "important"    "for"          "nlp"         



**2. Frequency Distribution**

In [None]:
# Frequency Distribution
# Count occurrences of each distinct token in `words` (defined in an earlier
# tokenization cell). table() labels the result with the argument's name,
# which is why the printed header reads "words".
# NOTE(review): the recorded output below contains "sentence." and "NLP.",
# i.e. it was produced from the whitespace-split `words`, not from the
# tokenizers result - re-run after Restart & Run All to confirm.
word_freq <- table(words)
print("Word frequency:")
print(word_freq)

[1] "Word frequency:"
words
           a          for    important           is         NLP.       sample 
           1            1            1            2            1            1 
   sentence.         This Tokenization 
           1            1            1 


**3. Remove stopwords & punctuations**

In [None]:
# Remove stopwords & punctuations
stop_words <- c("is", "a", "for")  # Example list of stopwords

# `words` may be a plain character vector (base-R split) or the one-element
# list returned by tokenize_words(); flatten it so both cases work.
word_vec <- unlist(words)

# Strip punctuation characters from each token instead of discarding every
# token that contains one (otherwise "sentence." and "NLP." are lost entirely).
word_vec <- gsub("[[:punct:]]", "", word_vec)

# Keep tokens that are non-empty after stripping and are not stopwords
# (stopword comparison is case-insensitive).
filtered_words <- word_vec[nzchar(word_vec) & !(tolower(word_vec) %in% stop_words)]

print("Filtered words:")
print(filtered_words)

[1] "Filtered words:"
[1] "This"         "sample"       "Tokenization" "important"   


**4. Lexicon Normalization (Stemming, Lemmatization)**

In [None]:
# Lexicon Normalization (Stemming, Lemmatization)

# For Stemming
# wordStem() lives in SnowballC, which no cell attaches with library(), so it
# is called through its namespace (and installed first if missing).
if (!requireNamespace("SnowballC", quietly = TRUE)) {
  install.packages("SnowballC")
}
stemmed_words <- SnowballC::wordStem(filtered_words, language = "en")

# For Lemmatization (using udpipe)
if (!requireNamespace("udpipe", quietly = TRUE)) {
  install.packages("udpipe")
}
library(udpipe)

# Download the English model, then load it; `ud_model` ends up holding the
# loaded model, which the POS-tagging cell reuses.
model_info <- udpipe_download_model(language = "english")
ud_model <- udpipe_load_model(model_info$file_model)

# udpipe_annotate() takes the texts in its `x` argument. seq_along() is used
# for the ids because 1:length(v) yields c(1, 0) when `v` is empty.
x <- udpipe_annotate(
  ud_model,
  x = filtered_words,
  doc_id = as.character(seq_along(filtered_words))
)
lemmatized_words <- as.data.frame(x)$lemma

print("Stemmed words:")
print(stemmed_words)
print("Lemmatized words:")
print(lemmatized_words)


**5. Part of Speech tagging**

In [None]:
# Part of Speech tagging
# Using udpipe library; relies on `ud_model` (the loaded udpipe model) and
# `filtered_words` created by the earlier cells.
# The texts go in udpipe_annotate()'s `x` argument, and seq_along() avoids the
# 1:length(v) pitfall (c(1, 0)) when the input is empty.
x <- udpipe_annotate(
  ud_model,
  x = filtered_words,
  doc_id = as.character(seq_along(filtered_words))
)

# `upos` holds the universal part-of-speech tag for every annotated token.
pos_tags <- as.data.frame(x)$upos

print("POS tagging:")
print(pos_tags)


**6. Named Entity Recognition**

In [None]:
# Named entity recognition attempt using the NLP / openNLP packages.
# Both packages are installed on every run and then attached.
install.packages("NLP")
install.packages("openNLP")
library(openNLP)
library(NLP)

# Uses `filtered_words` and `pos_tags` created by earlier cells.
# NOTE(review): `maxent_tagger_chunker` does not appear to be a function
# provided by openNLP or NLP (openNLP's entity API is built around
# Maxent_Entity_Annotator() used with NLP::annotate(), and needs the separate
# English model package) - confirm; this call is expected to fail with
# "could not find function". No output is recorded for this cell.
ner_tags <- maxent_tagger_chunker(filtered_words, pos_tags)
print("Named Entities:")
print( ner_tags)

**7. Scrape data from a website**

In [None]:
# Install rvest only when it is missing, so re-running the cell does not
# reinstall the package every time.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
library(rvest)

url <- "https://example.com"
page <- read_html(url)

# Extract text from <body> only: html_text() on the whole document also
# returns the contents of the <style> element in <head>, i.e. raw CSS.
# html_text2() additionally normalises whitespace the way a browser would.
text_data <- page %>%
  html_element("body") %>%
  html_text2()

print("Text data from website:")
print(text_data)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



[1] "Text data from website:"
[1] "Example Domain\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, \"Segoe UI\", \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    \n\n    Example Domain\n    This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.\n    More information...\n\n"


# **Outcome :**
1. Identified the Text Analytics Libraries in Python and R
2. Performed simple experiments with these libraries in Python and R