# Shivam Bomble
3MSAIM 2448510 NLP Lab 5

In [52]:
import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [2]:
def preprocess_text(text):
    nltk.download('stopwords')
    nltk.download('punkt')
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return tokens


In [4]:
def cosine_sim(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

In [5]:
def jaccard_sim(doc1, doc2):
    set1, set2 = set(preprocess_text(doc1)), set(preprocess_text(doc2))
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

In [12]:
def euclidean_sim(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    return euclidean_distances(tfidf_matrix[0], tfidf_matrix[1])[0][0]

In [13]:
# Sample documents
doc1 = "Artificial intelligence is transforming industries worldwide."
doc2 = "AI is revolutionizing global industries with automation."

cos_sim = cosine_sim(doc1, doc2)
jac_sim = jaccard_sim(doc1, doc2)
euc_sim = euclidean_sim(doc1, doc2)

print(f"Cosine Similarity: {cos_sim:.4f}")
print(f"Jaccard Similarity: {jac_sim:.4f}")
print(f"Euclidean Distance: {euc_sim:.4f}")

Cosine Similarity: 0.1844
Jaccard Similarity: 0.1111
Euclidean Distance: 1.2772


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


1. Cosine Similarity is more effective for longer documents with meaningful term frequency variations.
2. Jaccard Similarity is best when exact word overlap matters.
3. Euclidean Distance is useful for measuring absolute differences, but it can be sensitive to document length.

# Q2

In [15]:
def analyze_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity
    return polarity, subjectivity

In [22]:
def analyze_sentiment_vader(text):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(text)
    polarity = scores['compound']
    subjectivity = (scores['pos'] + scores['neg']) / 2  # Approximation
    return polarity, subjectivity

In [23]:
polarity1, subjectivity1 = analyze_sentiment(doc1)
polarity2, subjectivity2 = analyze_sentiment(doc2)

polarity1_vader, subjectivity1_vader = analyze_sentiment_vader(doc1)
polarity2_vader, subjectivity2_vader = analyze_sentiment_vader(doc2)

In [17]:
print(f"Document 1 - Polarity: {polarity1:.4f}, Subjectivity: {subjectivity1:.4f}")
print(f"Document 2 - Polarity: {polarity2:.4f}, Subjectivity: {subjectivity2:.4f}")

Document 1 - Polarity: -0.6000, Subjectivity: 1.0000
Document 2 - Polarity: 0.0000, Subjectivity: 0.0000


In [24]:
print(f"Document 1 - VADER Polarity: {polarity1_vader:.4f}, Subjectivity: {subjectivity1_vader:.4f}")
print(f"Document 2 - VADER Polarity: {polarity2_vader:.4f}, Subjectivity: {subjectivity2_vader:.4f}")

Document 1 - VADER Polarity: 0.4767, Subjectivity: 0.1915
Document 2 - VADER Polarity: 0.0000, Subjectivity: 0.0000


### **VADER vs TextBlob (Short Comparison)**

| Feature         | **VADER** | **TextBlob** |
|---------------|----------|-------------|
| **Approach** | Lexicon & rule-based | Statistical & ML-based |
| **Best for** | Short, social media texts (e.g., tweets) | General text, long documents |
| **Polarity Range** | -1 (negative) to +1 (positive) | -1 (negative) to +1 (positive) |
| **Subjectivity** | Not directly available (approximated) | 0 (objective) to 1 (subjective) |
| **Handles Emojis & Slang?** | Yes | No |
| **Context Awareness** | Limited, but considers intensifiers (e.g., "very good") | Less context-aware, relies on word-level analysis |
| **Speed** | Faster | Slower |

### **Which One to Use?**
- **Use VADER** for short, informal texts (tweets, reviews, chat messages).
- **Use TextBlob** for formal documents, news articles, or general sentiment analysis.

Since you're analyzing documents, **TextBlob is generally better**, but **VADER** can be useful if documents contain conversational or social media-style text.

# Q3

In [51]:
# --- Sentiment Analysis with Naïve Bayes ---
data = pd.read_csv('https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv')
data = data[['tweet', 'label']]
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle the dataset to make it more complex
X_train, X_test, y_train, y_test = train_test_split(data['tweet'], data['label'], test_size=0.2, random_state=42)
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Naïve Bayes Accuracy: 0.9399343031440638
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5962
           1       1.00      0.11      0.20       431

    accuracy                           0.94      6393
   macro avg       0.97      0.55      0.58      6393
weighted avg       0.94      0.94      0.92      6393



It loads a dataset containing tweets and their sentiment labels (0 = negative, 1 = positive).
The dataset is filtered to keep only the tweet text and label.

High accuracy (93.9%) suggests the model performs well overall.
Class imbalance problem:
Negative tweets (label 0) are well classified (94% precision, 100% recall).
Positive tweets (label 1) have poor recall (only 11%), meaning most positive tweets are misclassified.

The dataset is imbalanced (more negative tweets than positive ones).

# Q4

In [53]:
# --- Sentiment Analysis Using RNN ---
tokenizer = Tokenizer(num_words=20000)  # Increase vocabulary size for complexity
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=150)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=150)

y_train, y_test = np.array(y_train), np.array(y_test)
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=150),  # Larger embedding dimension
    SimpleRNN(128, return_sequences=False),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_seq, y_train, epochs=10, batch_size=64, validation_data=(X_test_seq, y_test))  # More epochs and batch size
print("RNN Model Evaluation:", model.evaluate(X_test_seq, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
RNN Model Evaluation: [0.2844448983669281, 0.9555764198303223]


RNN Model:
Embedding layer (128-dimensional word representations).
SimpleRNN layer (128 units).
Dense output layer (Sigmoid activation for binary classification).
Training: Runs for 10 epochs, with batch size 64.
Evaluation: Computes loss and accuracy.
Example Output:

Training accuracy: 99.9%
Validation accuracy: 95.1%
Model generalizes well but may be overfitting.