<a href="https://colab.research.google.com/github/sora11023539/cctb/blob/main/Text_Mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Analysis

In [2]:
pip install nltk



In [3]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [4]:
# Tokenization
text = "This is a Big Data course in CCTB."
tokens = nltk.word_tokenize(text)
print(tokens)

['This', 'is', 'a', 'Big', 'Data', 'course', 'in', 'CCTB', '.']


In [5]:
paragraph_text = "this is a big dta course in cctb. this is our second semester. we are loving it"
sent_tokens = nltk.sent_tokenize(paragraph_text)
print(sent_tokens)

['this is a big dta course in cctb.', 'this is our second semester.', 'we are loving it']


In [None]:
# Counter Tokens
from collections import Counter

word_counts = Counter(tokens)
print(word_counts)

Counter({'This': 1, 'is': 1, 'a': 1, 'Big': 1, 'Data': 1, 'course': 1, 'in': 1, 'CCTB': 1, '.': 1})


In [None]:
# StopWords (The, an , or ,etc.)

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print(filtered_tokens)

['Big', 'Data', 'course', 'CCTB', '.']


In [None]:
# Stemming and Lemitizing (Back to root form)

from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens]
print(stemmed_words)

lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
print(lemmatized_words)

['thi', 'is', 'a', 'big', 'data', 'cours', 'in', 'cctb', '.']
['This', 'is', 'a', 'Big', 'Data', 'course', 'in', 'CCTB', '.']


In [None]:
# Sentiment Analysis

from nltk.sentiment import SentimentIntensityAnalyzer

text = "I love this course! This is very exciting and amazing!"
sia = SentimentIntensityAnalyzer()
sentiment_scores = sia.polarity_scores(text)
print(sentiment_scores)

{'neg': 0.0, 'neu': 0.327, 'pos': 0.673, 'compound': 0.9237}


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

text = "I Hate this course! Its very hard and confusing"
sia = SentimentIntensityAnalyzer()
sentiment_scores = sia.polarity_scores(text)
print(sentiment_scores)

{'neg': 0.611, 'neu': 0.389, 'pos': 0.0, 'compound': -0.7813}


In [None]:
# Text Classification

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Training data
documents = [
    ("I love this course", "positive"),
    ("I hate this program", "negative"),
    ("This was an awesome movie", "positive"),
    ("The course was terrible", "negative")
]

# Prepare features and labels
vectorizer = CountVectorizer()
features = vectorizer.fit_transform([doc[0] for doc in documents])
labels = [doc[1] for doc in documents]

# Train a classifier (Naive Bayes)
classifier = MultinomialNB()
classifier.fit(features, labels)

# Test with a new example
new_example = vectorizer.transform(["I really enjoyed watching this film"])
prediction = classifier.predict(new_example)
print(prediction)

['positive']


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def process_paragraph(paragraph):
    sentences = nltk.sent_tokenize(paragraph)
    words = []
    for sentence in sentences:
        words.extend(nltk.word_tokenize(sentence))

    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]

    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    return sentences, filtered_words, stemmed_words, lemmatized_words

paragraph = "This is a sample paragraph. It contains multiple sentences. We will process it."
sentences, filtered_words, stemmed_words, lemmatized_words = process_paragraph(paragraph)

print("Sentences:", sentences)
print("Words after removing stop words:", filtered_words)
print("Stemmed words:", stemmed_words)
print("Lemmatized words:", lemmatized_words)


Sentences: ['This is a sample paragraph.', 'It contains multiple sentences.', 'We will process it.']
Words after removing stop words: ['sample', 'paragraph', '.', 'contains', 'multiple', 'sentences', '.', 'process', '.']
Stemmed words: ['sampl', 'paragraph', '.', 'contain', 'multipl', 'sentenc', '.', 'process', '.']
Lemmatized words: ['sample', 'paragraph', '.', 'contains', 'multiple', 'sentence', '.', 'process', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
