<a href="https://colab.research.google.com/github/ra1111/Text-Analytics/blob/main/Text_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def preprocess_text(text):
    """Normalize raw English text into a string of stemmed content words.

    The text is cleaned in four steps: every character that is not an ASCII
    letter or whitespace is replaced by a space, the result is lowercased,
    NLTK's English stop words are dropped, and the remaining words are
    reduced to their Porter stems.

    Args:
        text (str): The raw input text.

    Returns:
        str: The surviving stems joined by single spaces; an empty string
        when no word survives the filtering.
    """
    # Substitute a space rather than the empty string so that words joined
    # only by punctuation or digits ("hello,world", "state-of-the-art") are
    # split apart instead of being fused into a single token.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Lowercase before filtering so the stop-word membership test matches.
    text = text.lower()

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Filter stop words and stem the survivors in one pass.
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

# preprocess_text reads NLTK's English stop-word list, which is not bundled
# with the library. Fetch it here so this cell also runs on a fresh kernel;
# quiet=True keeps the cell output limited to the printed result.
import nltk
nltk.download('stopwords', quiet=True)

text = "Text preprocessing involves removing special characters and stopwords."
preprocessed_text = preprocess_text(text)
print(preprocessed_text)


text preprocess involv remov special charact stopword


In [9]:
import nltk

# word_tokenize depends on the Punkt tokenizer models, which must be
# downloaded once per environment. NLTK releases differ in which package
# name they look up ('punkt' vs. 'punkt_tab'), so request both; quiet=True
# keeps the cell output limited to the printed tokens.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

text = "Text tokenization breaks down sentences into words for analysis."
tokens = nltk.word_tokenize(text)
print(tokens)


['Text', 'tokenization', 'breaks', 'down', 'sentences', 'into', 'words', 'for', 'analysis', '.']


In [13]:
import nltk

# Tokenizing and tagging both rely on downloadable NLTK models. Fetch them
# here so the cell runs on a fresh kernel; older and newer NLTK releases use
# different package names, so both spellings are requested. quiet=True keeps
# the cell output limited to the printed tags.
for resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource, quiet=True)

text = "Part-of-speech tagging categorizes words by their grammatical roles."
tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)


[('Part-of-speech', 'JJ'), ('tagging', 'NN'), ('categorizes', 'NNS'), ('words', 'NNS'), ('by', 'IN'), ('their', 'PRP$'), ('grammatical', 'JJ'), ('roles', 'NNS'), ('.', '.')]


In [19]:
import nltk

# Named-entity chunking needs the tokenizer, the POS tagger, the chunker
# model and the 'words' corpus. Download them before use so the cell does not
# depend on a later cell having been run first; older and newer NLTK releases
# use different package names, so both spellings are requested.
for resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "maxent_ne_chunker",
    "maxent_ne_chunker_tab",
    "words",
):
    nltk.download(resource, quiet=True)

text = "Apple Inc. was founded by Steve Jobs in Cupertino, California in 1976."
words = nltk.word_tokenize(text)

# Tag each token with its part of speech, then group tagged tokens into
# named-entity subtrees (PERSON, ORGANIZATION, GPE, ...).
entities = nltk.ne_chunk(nltk.pos_tag(words))
print(entities)


(S
  (PERSON Apple/NNP)
  (ORGANIZATION Inc./NNP)
  was/VBD
  founded/VBN
  by/IN
  (PERSON Steve/NNP Jobs/NNP)
  in/IN
  (GPE Cupertino/NNP)
  ,/,
  (GPE California/NNP)
  in/IN
  1976/CD
  ./.)


In [18]:
import nltk
# Download the 'words' corpus package into the local nltk_data directory.
# NOTE(review): the execution counts show this cell (In [18]) was run before
# the ne_chunk cell placed above it (In [19]) -- presumably that cell depends
# on this corpus. On a top-to-bottom run this download comes too late;
# confirm and move it ahead of the first use.
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [20]:
from textblob import TextBlob

# Score a sample review with TextBlob and report the sign of its polarity.
text = "I love this product! It's amazing."
analysis = TextBlob(text)
sentiment = analysis.sentiment.polarity

# Strictly positive -> positive, strictly negative -> negative,
# anything else falls through to neutral.
label = (
    "Positive sentiment" if sentiment > 0
    else "Negative sentiment" if sentiment < 0
    else "Neutral sentiment"
)
print(label)


Positive sentiment


In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Toy corpus: three one-sentence documents.
documents = [
    "Topic modeling is an important technique.",
    "It helps in uncovering hidden themes.",
    "These themes are present in a collection of texts.",
]

# Turn the documents into a term-count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Fit a two-topic LDA model; the fixed random_state keeps the fit repeatable.
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)

# Get the most representative keywords for each topic
def get_top_words_per_topic(model, feature_names, n_top_words):
    """Return the highest-weighted feature names for every topic of a model.

    Args:
        model: A fitted topic model exposing ``components_``, an iterable of
            per-topic weight rows that each provide ``argsort()``.
        feature_names: Indexable collection mapping a column index to its
            feature name.
        n_top_words (int): How many names to return per topic. Values larger
            than the vocabulary simply return every feature.

    Returns:
        list[list]: One list per topic, holding its feature names ordered
        from highest to lowest weight.

    Raises:
        ValueError: If ``n_top_words`` is negative.
    """
    # A negative count would turn the slice below into an unrelated range
    # instead of "the first n", so reject it explicitly.
    if n_top_words < 0:
        raise ValueError("n_top_words must be non-negative")

    top_words = []
    for topic in model.components_:
        # argsort is ascending: reverse it, then keep the leading n indices.
        ranked_idx = topic.argsort()[::-1][:n_top_words]
        top_words.append([feature_names[i] for i in ranked_idx])
    return top_words

# Show the three highest-weighted vocabulary terms of each fitted topic.
n_top_words = 3
feature_names = vectorizer.get_feature_names_out()
top_words = get_top_words_per_topic(lda, feature_names, n_top_words)

# Topics are numbered from 1 in the printed report.
for topic_number, topic_terms in enumerate(top_words, start=1):
    joined_terms = ', '.join(topic_terms)
    print(f"Topic {topic_number}: {joined_terms}")


Topic 1: themes, in, are
Topic 2: technique, modeling, is
