## Stopwords and Punctuation Removal

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stopwords_eng = set(stopwords.words('english'))
txt = "What to do now ,it is free time"

In [None]:
tokens = word_tokenize(txt)
tokens = [token for token in tokens if token not in string.punctuation]
tokens

['What', 'to', 'do', 'now', 'it', 'is', 'free', 'time']

In [None]:
filtered_txt = " ".join([token for token in tokens if token not in stopwords_eng])
filtered_txt

'What free time'

## Stemmetization

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')

stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
txt = "Running runners run quickly towards the finishing line."

In [None]:
def stem_txt(text: str):
    tokens = text.lower().split()
    return " ".join([stemmer.stem(token) for token in tokens])

In [None]:
stem_txt(txt)

'run runner run quickli toward the finish line.'

## Lemmatization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
lemm = WordNetLemmatizer()
txt = "The leaves are falling from the trees, and the children are running happily."

In [None]:
def lemm_txt(text: str):
    tokens = word_tokenize(text)

    return " ".join([lemm.lemmatize(token) for token in tokens])

lemm_txt(txt)

'The leaf are falling from the tree , and the child are running happily .'

## Bag of Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

In [None]:
corpus = [
    "Hello world! This is a sample text.",
    "Bag of words model is simple.",
    "This is another example of text processing."
]

In [None]:
X = vectorizer.fit_transform(corpus)

print(f"Vocabulary: {vectorizer.get_feature_names_out()}")

Vocabulary: ['another' 'bag' 'example' 'hello' 'is' 'model' 'of' 'processing' 'sample'
 'simple' 'text' 'this' 'words' 'world']


In [None]:
print(X.toarray())

[[0 0 0 1 1 0 0 0 1 0 1 1 0 1]
 [0 1 0 0 1 1 1 0 0 1 0 0 1 0]
 [1 0 1 0 1 0 1 1 0 0 1 1 0 0]]


In [None]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 19 stored elements and shape (3, 14)>
  Coords	Values
  (0, 3)	1
  (0, 13)	1
  (0, 11)	1
  (0, 4)	1
  (0, 8)	1
  (0, 10)	1
  (1, 4)	1
  (1, 1)	1
  (1, 6)	1
  (1, 12)	1
  (1, 5)	1
  (1, 9)	1
  (2, 11)	1
  (2, 4)	1
  (2, 10)	1
  (2, 6)	1
  (2, 0)	1
  (2, 2)	1
  (2, 7)	1


## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

In [None]:
corpus = [
    "This is a sample document.",
    "This document is another example.",
    "We are learning TF-IDF in NLP."
]

In [None]:
X = vectorizer.fit_transform(corpus)

print(X.toarray())

[[0.         0.         0.45985353 0.         0.         0.
  0.45985353 0.         0.         0.60465213 0.         0.45985353
  0.        ]
 [0.51741994 0.         0.3935112  0.51741994 0.         0.
  0.3935112  0.         0.         0.         0.         0.3935112
  0.        ]
 [0.         0.37796447 0.         0.         0.37796447 0.37796447
  0.         0.37796447 0.37796447 0.         0.37796447 0.
  0.37796447]]


In [None]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (3, 13)>
  Coords	Values
  (0, 11)	0.4598535287588349
  (0, 6)	0.4598535287588349
  (0, 9)	0.6046521283053111
  (0, 2)	0.4598535287588349
  (1, 11)	0.39351120409397233
  (1, 6)	0.39351120409397233
  (1, 2)	0.39351120409397233
  (1, 0)	0.5174199439321682
  (1, 3)	0.5174199439321682
  (2, 12)	0.37796447300922725
  (2, 1)	0.37796447300922725
  (2, 7)	0.37796447300922725
  (2, 10)	0.37796447300922725
  (2, 4)	0.37796447300922725
  (2, 5)	0.37796447300922725
  (2, 8)	0.37796447300922725


In [None]:
vectorizer.get_feature_names_out()

array(['another', 'are', 'document', 'example', 'idf', 'in', 'is',
       'learning', 'nlp', 'sample', 'tf', 'this', 'we'], dtype=object)

## Sentiment Analysis

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
sentences = [
    "I absolutely love this product! It's amazing. 😊",
    "This is the worst experience I've ever had. Terrible service!",
    "The movie was okay, not great but not bad either.",
    "I'm not sure how I feel about this.",
    "What a fantastic experience! I'll definitely come back."
]

In [None]:
sia.polarity_scores(sentences[0])

{'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.862}

In [None]:
for sentence in sentences:
    score = sia.polarity_scores(sentence)
    print(f"{sentence} | Score: {score['compound']}")

    category = "Positive" if score['compound'] >= 0.05 else "Negative" if score['compound'] <= -0.05 else "Neutral"
    print(f"Category: {category}\n")

I absolutely love this product! It's amazing. 😊 | Score: 0.862
Category: Positive

This is the worst experience I've ever had. Terrible service! | Score: -0.8172
Category: Negative

The movie was okay, not great but not bad either. | Score: 0.4728
Category: Positive

I'm not sure how I feel about this. | Score: -0.2411
Category: Negative

What a fantastic experience! I'll definitely come back. | Score: 0.7644
Category: Positive



## Sentiment Model

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.metrics import accuracy_score, classification_report

from typing import Optional

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
data = {
    "text": [
        "I love this product! It's amazing.",
        "This is the worst experience ever.",
        "The food was okay, nothing special.",
        "I am extremely happy with the service!",
        "I hate this, it’s terrible.",
        "It was an average experience."
    ],
    "sentiment": ["positive", "negative", "neutral", "positive", "negative", "neutral"]
}

In [None]:
df = pd.DataFrame(data)
df

Unnamed: 0,text,sentiment
0,I love this product! It's amazing.,positive
1,This is the worst experience ever.,negative
2,"The food was okay, nothing special.",neutral
3,I am extremely happy with the service!,positive
4,"I hate this, it’s terrible.",negative
5,It was an average experience.,neutral


In [None]:
def preprocess_text(text: str):
    txt = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

df['processed_text'] = df['text'].apply(preprocess_text)
df

Unnamed: 0,text,sentiment,processed_text
0,I love this product! It's amazing.,positive,I love product Its amazing
1,This is the worst experience ever.,negative,This worst experience ever
2,"The food was okay, nothing special.",neutral,The food okay nothing special
3,I am extremely happy with the service!,positive,I extremely happy service
4,"I hate this, it’s terrible.",negative,I hate terrible
5,It was an average experience.,neutral,It average experience


In [None]:
le = LabelEncoder()
tfid = TfidfVectorizer()

X = tfid.fit_transform(df['processed_text'])
y = le.fit_transform(df['sentiment'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = MultinomialNB().fit(X, y)

y_pred = model.predict(X_test)

y_test, y_pred

(array([2, 0]), array([2, 0]))

In [None]:
def train(data: pd.DataFrame, split: Optional[float]=None):
    """
    Train a NB Model using Tf-IDF Vectorizer and Label Encoder.

    Returns:
    - model: Trained model
    - tfid: Tf-IDF Vectorizer
    - le: Label Encoder
    """
    le = LabelEncoder()
    tfid = TfidfVectorizer()
    model = MultinomialNB()

    X = tfid.fit_transform(data['processed_text'])
    y = le.fit_transform(data['sentiment'])

    if split:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=42)
        model.fit(X_train, y_train)

        return model, tfid, le

    model.fit(X, y)

    return model, tfid, le


def predict(text, model, tfid, le):
    preds = model.predict(tfid.transform(text))
    return le.inverse_transform(preds)

def metrics(X_test, y_test, model, tfid, le):
    y_pred = predict(X_test, model, tfid, le)
    y_pred = le.transform(y_pred)
    y_test = le.transform(y_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))

In [None]:
model, tfid, le = train(df)
model1, tfid1, le1 = train(df, split=0.2)

In [None]:
print(predict(df['processed_text'], model, tfid, le))
print(predict(df['processed_text'], model1, tfid1, le1))

metrics(df['processed_text'], df['sentiment'], model, tfid, le)
metrics(df['processed_text'], df['sentiment'], model1, tfid1, le1)

['positive' 'negative' 'neutral' 'positive' 'negative' 'neutral']
['neutral' 'neutral' 'neutral' 'positive' 'negative' 'neutral']
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         2

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      1.00      0.67         2
           2       1.00      0.50      0.67         2

    accuracy                           0.67         6
   macro avg       0.83      0.67      0.67         6
weighted avg       0.83      0.67      0.67         6



## Spacy Lemmatization

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [None]:
def get_pos_lemm(text: str):
    doc = nlp(text)
    pos = [(token.text, token.lemma_, token.pos_) for token in doc]
    # pos = [(f"Original: {token.text}, Lemma: {token.lemma_}, POS: {token.pos_}") for token in doc]
    lemm_txt = " ".join([token.lemma_ for token in doc])

    return pos, lemm_txt

In [None]:
txt = "Running is my great habit"

pos, lemm_txt = get_pos_lemm(txt)

print(f"POS: {pos}")
print(f"Lemmatized text: {lemm_txt}")

POS: [('Running', 'run', 'VERB'), ('is', 'be', 'AUX'), ('my', 'my', 'PRON'), ('great', 'great', 'ADJ'), ('habit', 'habit', 'NOUN')]
Lemmatized text: run be my great habit
