<a href="https://colab.research.google.com/github/VaishaliChauhanCCTB/Demo/blob/main/Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Analysis

In [None]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Tokenization: split a sentence into word and punctuation tokens.
import nltk

# word_tokenize relies on the Punkt tokenizer data, which is not shipped with
# the nltk package itself. Recent NLTK releases look for "punkt_tab" while
# older ones look for "punkt"; fetching both keeps this cell working on a
# fresh kernel with either. The calls are no-ops once the data is installed.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

text = "This is a Big Data course in CCTB."
tokens = nltk.word_tokenize(text)
print(tokens)

['This', 'is', 'a', 'Big', 'Data', 'course', 'in', 'CCTB', '.']


In [None]:
# Token frequencies: tally how many times each token occurs in `tokens`
# (the list produced by the tokenization cell).
from collections import Counter

word_counts = Counter()
word_counts.update(tokens)
print(word_counts)

Counter({'This': 1, 'is': 1, 'a': 1, 'Big': 1, 'Data': 1, 'course': 1, 'in': 1, 'CCTB': 1, '.': 1})


In [None]:
# Stop-word removal: drop very common words ("the", "an", "or", etc.)

import nltk
from nltk.corpus import stopwords

# The stop-word lists are a separate NLTK data package; without this download
# stopwords.words() raises LookupError on a fresh environment.
nltk.download("stopwords", quiet=True)

stop_words = set(stopwords.words('english'))
# Compare in lower case so capitalised tokens such as "This" are filtered too;
# the surviving tokens keep their original casing.
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print(filtered_tokens)

['Big', 'Data', 'course', 'CCTB', '.']


In [None]:
# Stemming and Lemmatizing (reduce words back to a root form)

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Stemming is rule-based and needs no extra data.
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens]
print(stemmed_words)

# The lemmatizer looks words up in WordNet, which is a separate NLTK data
# package ("omw-1.4" is additionally required by some NLTK releases).
# Without these downloads lemmatize() raises LookupError on a fresh kernel.
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument, so each token is
# looked up using the library default (noun, per the NLTK docs); tokens that
# are already in base form come back unchanged.
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
print(lemmatized_words)

['thi', 'is', 'a', 'big', 'data', 'cours', 'in', 'cctb', '.']
['This', 'is', 'a', 'Big', 'Data', 'course', 'in', 'CCTB', '.']


In [None]:
# Sentiment Analysis (positive example)

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer loads the VADER lexicon, a separate NLTK data
# package; constructing it without this download raises LookupError.
nltk.download("vader_lexicon", quiet=True)

text = "I love this course! This is very exciting and amazing!"
sia = SentimentIntensityAnalyzer()
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' keys.
sentiment_scores = sia.polarity_scores(text)
print(sentiment_scores)

{'neg': 0.0, 'neu': 0.327, 'pos': 0.673, 'compound': 0.9237}


In [None]:
# Sentiment Analysis (negative example)

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Make sure the VADER lexicon is available so this cell also runs on a fresh
# kernel; the call does nothing further once the data is installed.
nltk.download("vader_lexicon", quiet=True)

text = "I Hate this course! Its very hard and confusing"
sia = SentimentIntensityAnalyzer()
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' keys.
sentiment_scores = sia.polarity_scores(text)
print(sentiment_scores)

{'neg': 0.611, 'neu': 0.389, 'pos': 0.0, 'compound': -0.7813}


In [None]:
# Text Classification: bag-of-words features fed to a Naive Bayes classifier

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Tiny labelled corpus of (sentence, sentiment label) pairs
documents = [
    ("I love this course", "positive"),
    ("I hate this program", "negative"),
    ("This was an awesome movie", "positive"),
    ("The course was terrible", "negative")
]

# Split the pairs into parallel lists of sentences and labels
texts, labels = (list(column) for column in zip(*documents))

# Learn the vocabulary and convert each sentence into word counts
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(texts)

# Fit the Naive Bayes model on the count vectors
classifier = MultinomialNB().fit(features, labels)

# Classify an unseen sentence, encoded with the vocabulary learned above
new_example = vectorizer.transform(["I really enjoyed watching this film"])
prediction = classifier.predict(new_example)
print(prediction)

['positive']
