In [1]:
# Load the tokenized data from files

import pickle
import os
import math
from collections import Counter
from collections import defaultdict, Counter


data_folder = "C:/Users/Sanika Salunke/Desktop/Applied_ML_Assigmnt/Tokenizer_Result"

train_data = pickle.load(open(os.path.join(data_folder, "train_data.pkl"), "rb"))
test_data = pickle.load(open(os.path.join(data_folder, "test_data.pkl"), "rb"))
train_labels = pickle.load(open(os.path.join(data_folder, "train_labels.pkl"), "rb"))
test_labels = pickle.load(open(os.path.join(data_folder, "test_labels.pkl"), "rb"))
vocab = pickle.load(open(os.path.join(data_folder, "vocab.pkl"), "rb"))


print("Example from training data:", train_data[0] if train_data else "No training data found")
print("Example from testing data:", test_data[0] if test_data else "No testing data found")
print("Number of words in vocabulary:", len(vocab))

Example from training data: ['s', 'pretty', 'good', 'cast', 'film', 'has', 'nowhere', 'near', 'grace', 'original', 'italian', 'comedy', 'big', 'deal', 'madonna', 'street', 'anyone', 'looking', 'entertaining', 'caper', 'film', 'should', 'visit', 'original', 'william', 'macy', 'may', 'one', 'our', 'greatest', 'living', 'actors', 'here', 'he', 's', 'put', 'little', 'use', 'his', 'role', 'original', 'played', 'marcello', 'mastroianni', 'i', 'sort', 'feel', 'sorry', 'him', 'trying', 'fill', 'those', 'shoes', 'might', 'well', 'try', 'imitate', 'bogart', 'young', 'de', 'niro', 'art', 'direction', 'rich', 'textured', 'brings', 'nothing', 'story', 'extra', 'bits', 'add', 'story', 'feel', 'completely', 'unnecessary', 'things', 'take', 'away', 'missed', 'even', 'starting', 'way', 'do', 'seems', 'bizarrely', 'gratuitous', 'takes', 'away', 'surprise', 'original', 'sam', 'rockwell', 'has', 'his', 'odd', 'genial', 'charm', 'luis', 'guzman', 'has', 'odd', 'charisma', 'love', 'story', 'part', 'movie', 

In [2]:

# loading bigram only data
train_data_bigrams = pickle.load(open(os.path.join(data_folder, "train_data_bigrams.pkl"), "rb"))
test_data_bigrams = pickle.load(open(os.path.join(data_folder, "test_data_bigrams.pkl"), "rb"))
vocab_bigrams = pickle.load(open(os.path.join(data_folder, "vocab_bigrams.pkl"), "rb"))


print("Train data (bigrams) sample:", train_data_bigrams[0] if train_data_bigrams else "No data found")
print("Test data (bigrams) sample:", test_data_bigrams[0] if test_data_bigrams else "No data found")
print("Bigram vocabulary size:", len(vocab_bigrams))

Train data (bigrams) sample: ['s_pretty', 'pretty_good', 'good_cast', 'cast_film', 'film_has', 'has_nowhere', 'nowhere_near', 'near_grace', 'grace_original', 'original_italian', 'italian_comedy', 'comedy_big', 'big_deal', 'deal_madonna', 'madonna_street', 'street_anyone', 'anyone_looking', 'looking_entertaining', 'entertaining_caper', 'caper_film', 'film_should', 'should_visit', 'visit_original', 'original_william', 'william_macy', 'macy_may', 'may_one', 'one_our', 'our_greatest', 'greatest_living', 'living_actors', 'actors_here', 'here_he', 'he_s', 's_put', 'put_little', 'little_use', 'use_his', 'his_role', 'role_original', 'original_played', 'played_marcello', 'marcello_mastroianni', 'mastroianni_i', 'i_sort', 'sort_feel', 'feel_sorry', 'sorry_him', 'him_trying', 'trying_fill', 'fill_those', 'those_shoes', 'shoes_might', 'might_well', 'well_try', 'try_imitate', 'imitate_bogart', 'bogart_young', 'young_de', 'de_niro', 'niro_art', 'art_direction', 'direction_rich', 'rich_textured', 'te

## Task 2: Feature Extraction with TF-IDF 

In [3]:

# Additional feature selection  
min_percent = 0.05
total_docs = len(train_data)
min_count = math.ceil(min_percent * total_docs)

doc_count = Counter()
for review in train_data:
    doc_count.update(set(review))

selected_tokens = {token for token, count in doc_count.items() if count >= min_count}
print("Before filtering token count:", len(doc_count))
print("After filtering token count:", len(selected_tokens))

idf = {}
for token in selected_tokens:
    df = doc_count[token]
    idf[token] = math.log((total_docs + 1) / (df + 1)) + 1

# Manual Implementation of feature extraction using tf - idf values

def calcTfidfval(review, selected_tokens, idf):
    token_counts = Counter(token for token in review if token in selected_tokens)
    tfidf = {}
    total_tokens = sum(token_counts.values())
    if total_tokens == 0:
        return tfidf
    for token, count in token_counts.items():
        tf = count / total_tokens
        tfidf[token] = tf * idf[token]
    return tfidf

trainedTfidval =  [calcTfidfval(review, selected_tokens, idf) for review in train_data]
testTfidfval =  [calcTfidfval(review, selected_tokens, idf) for review in test_data]

print("First training review tf idf values:")
print(trainedTfidval[0])

Before filtering token count: 5251541
After filtering token count: 434
First training review tf idf values:
{'s': 0.06140163146933402, 'pretty': 0.03664876979641259, 'good': 0.022769708548521807, 'cast': 0.035427248049907954, 'film': 0.0368357728803567, 'has': 0.08883031465006824, 'original': 0.15666686677020691, 'comedy': 0.03910232343290857, 'big': 0.03705580844986601, 'anyone': 0.03921150646023228, 'looking': 0.04010779851785525, 'entertaining': 0.04578492284453531, 'should': 0.03267873086062223, 'may': 0.03756817123217647, 'one': 0.018329820990682647, 'our': 0.04187245497684678, 'actors': 0.03410458647785022, 'here': 0.03195140536344031, 'he': 0.02187166961903355, 'put': 0.040221453621690335, 'little': 0.030706537228766137, 'use': 0.043797292210014035, 'his': 0.043062028319731493, 'role': 0.03859962573167883, 'played': 0.0403692332000237, 'i': 0.014276676314789703, 'sort': 0.04623220517974224, 'feel': 0.11586990902762337, 'him': 0.030055643012136625, 'trying': 0.04021058133761002, 

In [4]:
# Computing TF-IDF features for bigrams


doc_freq_bigrams = Counter()
for review in train_data_bigrams:
    doc_freq_bigrams.update(set(review))


selected_bigrams = {bigram for bigram, count in doc_freq_bigrams.items() if count >= min_count}
print("Number of bigrams before filtering:", len(doc_freq_bigrams))
print("Number of bigrams after filtering:", len(selected_bigrams))


idf_bigrams = {}
for bigram in selected_bigrams:
    df = doc_freq_bigrams[bigram]
    idf_bigrams[bigram] = math.log((total_docs + 1) / (df + 1)) + 1


train_tfidf_bigrams = [calcTfidfval(review, selected_bigrams, idf_bigrams) for review in train_data_bigrams]
test_tfidf_bigrams = [calcTfidfval(review, selected_bigrams, idf_bigrams) for review in test_data_bigrams]

print("\nSample TF-IDF features for the first training review (bigrams):")
print(train_tfidf_bigrams[0])

Number of bigrams before filtering: 1805104
Number of bigrams after filtering: 30

Sample TF-IDF features for the first training review (bigrams):
{'he_s': 3.5010760309179054}


## Task 3: Bayesian Classification - Theoretical Justification  

Bayesian Theorem is one of the most renowned and fundamental concepts in statistics. It is based on calculating the probabilities (posterior and prior) and is used to describe how we can modify our belief based on new situations. 
Mathematically it is expressed as:
	P(C∣X) =P(X)P(X∣C) P(C) 
where:
•	P(C∣X) P(C|X) P(C∣X) is the posterior probability, which is the probability of class C given the observed data X.
•	P(X∣C) P(X|C) P(X∣C) is the likelihood, which is the probability of observing X given that class C is true.
•	P(C)P(C)P(C) is the prior probability, which is our initial belief about class C before seeing any data.
•	P(X)P(X)P(X) is the marginal probability of the observed data X across all possible classes.
The Naïve Bayes assumption states that the features (tokens w.r.t text classification) are conditionally independent given the class label. That is:
P(X∣C)=P(x1,x2,...,xn∣ C)=P(x1∣C)P(x2∣C)...P(xn ∣C)P(X | C) = P(x_1, x_2, ..., x_n | C) = P(x_1 | C) P(x_2 | C) ... P(x_n | C)P(X∣C)=P(x1,x2,...,xn∣C)=P(x1∣C)P(x2∣C)...P(xn∣C) 

But this assumption helps to simplify the computation making naïve bayes a better model for high dimensional data. Still, it is unrealistic since words in text data are contextually related.

1.	Prior Probability:
P(C)=count of documents in class C / total number of documents 
2.	Likelihood with Laplace Smoothing:
Since some tokens may not appear in each class, Laplace smoothing ensures that probabilities never become zero. The likelihood of a word w in class C is:
P(w∣C) = count of w in documents of class C+1 / total word count in class C+∣V∣ 
where ∣V∣ is the vocabulary size.
3.	Final Classification Formula:
Using Bayes’ rule and the Naïve Bayes assumption, we classify a new document based on:
 
To avoid numerical underflow, we take the logarithm of probabilities:
		 



## Task 4: Implementing Bayesian Classification 

In [5]:



def train_bayes_classifier(data_tfidf, data_labels):
    #Getting all unique classes
    classes = set(data_labels)
    class_counts = Counter(data_labels)
    total_labels = len(data_labels)
    
  # Calculating prior prob for each classes
    priors = {c: class_counts[c] / total_labels for c in classes}
    
    #initializing token sums and total weights for each classes
    token_sums = {c: defaultdict(float) for c in classes}
    total_weights = {c: 0.0 for c in classes}
    
    # Summing up the tfidf weights for each token in each class
    for tfidf, label in zip(data_tfidf, data_labels):
        for token, weight in tfidf.items():
            token_sums[label][token] += weight
            total_weights[label] += weight
    
    # Calculating token probabilities for each class
    vocab_size = len(selected_tokens)
    # Laplace Smoothing to avoid zero prob scenarios
    token_probs = {c: {} for c in classes}
    for c in classes:
        for token in selected_tokens:
            token_sum = token_sums[c].get(token, 0.0)
            token_probs[c][token] = (token_sum + 1) / (total_weights[c] + vocab_size)
    
   
    def predict(review_tfidf):
        scores = {}
        #Using logarithms for numerical stability
        for c in classes:
            score = math.log(priors[c])  
            for token, weight in review_tfidf.items():
                if token in selected_tokens:
                    score += weight * math.log(token_probs[c][token])
                else:
                    score += weight * math.log(1 / (total_weights[c] + vocab_size))
            scores[c] = score
        return max(scores, key=scores.get) 
    
    return predict

# Training the classifier
clf_baseline = train_bayes_classifier(trainedTfidval, train_labels)

# Making predictions for training and testing data
train_preds = [clf_baseline(x) for x in trainedTfidval]
test_preds = [clf_baseline(x) for x in testTfidfval]

# calculating accuracy
train_acc = sum(1 for p, t in zip(train_preds, train_labels) if p == t) / len(train_labels)
test_acc = sum(1 for p, t in zip(test_preds, test_labels) if p == t) / len(test_labels)

print("Baseline Train Accuracy:", train_acc)
print("Baseline Test Accuracy:", test_acc)


# Count TP, FP, and FN for the "positive" class
tp = sum(1 for p, t in zip(test_preds, test_labels) if p == "positive" and t == "positive")
fp = sum(1 for p, t in zip(test_preds, test_labels) if p == "positive" and t == "negative")
fn = sum(1 for p, t in zip(test_preds, test_labels) if p == "negative" and t == "positive")

# Calculate precision and recall
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

Baseline Train Accuracy: 0.80184
Baseline Test Accuracy: 0.8003520140805632


In [6]:
# Training the classifier for bigram data

clf_bigrams = train_bayes_classifier(train_tfidf_bigrams, train_labels)

# making predictions for bigram only model
train_preds_bigrams = [clf_bigrams(x) for x in train_tfidf_bigrams]
test_preds_bigrams = [clf_bigrams(x) for x in test_tfidf_bigrams]

# calculating accuracy for bigram only model
train_acc_bigrams = sum(1 for p, t in zip(train_preds_bigrams, train_labels) if p == t) / len(train_labels)
test_acc_bigrams = sum(1 for p, t in zip(test_preds_bigrams, test_labels) if p == t) / len(test_labels)

print("Bigram Model Train Accuracy:", train_acc_bigrams)
print("Bigram Model Test Accuracy:", test_acc_bigrams)

Bigram Model Train Accuracy: 0.58724
Bigram Model Test Accuracy: 0.5849033961358454


## Task 5: Handling Class Imbalances & Evaluating Performance 

In [7]:
import random

# Since the model 1 which is trained on all 3 types ok tokens is performing better than the bigram model, that model is used for further analysis



def undersample(data_tfidf, data_labels):
    counts = Counter(data_labels)
    min_count = min(counts.values())
    indices_by_class = {label: [] for label in counts}
    for i, label in enumerate(data_labels):
        indices_by_class[label].append(i)
    new_indices = []
    for label in counts:
        new_indices.extend(random.sample(indices_by_class[label], min_count))
    new_indices.sort()
    return [data_tfidf[i] for i in new_indices], [data_labels[i] for i in new_indices]

# K - fold cross validation using 5 folds

k = 5
indices = list(range(len(trainedTfidval)))
random.shuffle(indices)
fold_size = len(indices) // k
fold_accuracies = []

for i in range(k):
    val_indices = indices[i * fold_size: (i + 1) * fold_size] if i < k - 1 else indices[i * fold_size:]
    train_indices = [j for j in indices if j not in val_indices]
    fold_trainedTfidval = [trainedTfidval[j] for j in train_indices]
    fold_train_labels = [train_labels[j] for j in train_indices]
    fold_trainedTfidval_us, fold_train_labels_us = undersample(fold_trainedTfidval, fold_train_labels)
    clf = train_bayes_classifier(fold_trainedTfidval_us, fold_train_labels_us)
    val_preds = [clf(trainedTfidval[j]) for j in val_indices]
    fold_true = [train_labels[j] for j in val_indices]
    acc = sum(1 for p, t in zip(val_preds, fold_true) if p == t) / len(val_preds)
    fold_accuracies.append(acc)

print("5-Fold CV Average Accuracy:", sum(fold_accuracies) / len(fold_accuracies))

# Under Sampling 
trainedTfidval_us_full, train_labels_us_full = undersample(trainedTfidval, train_labels)
clf_final = train_bayes_classifier(trainedTfidval_us_full, train_labels_us_full)
test_preds_final = [clf_final(x) for x in testTfidfval]

# Confusion Matrix
confusion = {}
for true, pred in zip(test_labels, test_preds_final):
    confusion.setdefault(true, {})
    confusion[true][pred] = confusion[true].get(pred, 0) + 1
print("Confusion Matrix:", confusion)

# Precision, Recall, F1-Score, Accuracy
precision = {}
recall = {}
f1 = {}
all_classes = set(train_labels)
for c in all_classes:
    tp = confusion.get(c, {}).get(c, 0)
    fp = sum(confusion.get(other, {}).get(c, 0) for other in all_classes if other != c)
    fn = sum(confusion.get(c, {}).get(other, 0) for other in all_classes if other != c)
    precision[c] = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall[c] = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1[c] = 2 * precision[c] * recall[c] / (precision[c] + recall[c]) if (precision[c] + recall[c]) > 0 else 0

accuracy_final = sum(1 for p, t in zip(test_preds_final, test_labels) if p == t) / len(test_labels)
print("Test Accuracy:", accuracy_final)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

5-Fold CV Average Accuracy: 0.79928
Confusion Matrix: {'negative': {'negative': 10081, 'positive': 2419}, 'positive': {'positive': 9927, 'negative': 2572}}
Test Accuracy: 0.8003520140805632
Precision: {'positive': 0.8040660942815486, 'negative': 0.7967280486841065}
Recall: {'positive': 0.7942235378830307, 'negative': 0.80648}
F1-Score: {'positive': 0.7991145099617629, 'negative': 0.8015743648868922}


## Task 6: Analysis of Errors & Manual Evaluation 

In [8]:
from Tokenizer import preprocess_text
# preprocess_text is defined in Tokenizer.py

# Find 5 misclassified reviews
misclassified = []
for i, (pred, true) in enumerate(zip(test_preds_final, test_labels)):
    if pred != true:
        misclassified.append((i, pred, true, test_data[i]))
    if len(misclassified) >= 5:
        break

print("5 Misclassified Reviews (index, predicted, true, review):")
for idx, pred, true, tokens in misclassified:
    print("Index:", idx, "| Predicted:", pred, "| True:", true, "| Review:", " ".join(tokens))

# Testing  two new movie reviews
new_reviews = [
    "I liked this movie. It was very enetertaining.",
    "I didn't enjoy this movie, it was terrible. The plot made no sense and the acting was wooden."
]

# Preprocess and compute TF-IDF for the new reviews
new_reviews_tokens = [preprocess_text(review, vocab=vocab) for review in new_reviews]
new_reviews_tfidf = [calcTfidfval(tokens, selected_tokens, idf) for tokens in new_reviews_tokens]

print("\nNew Reviews Predictions:")
for review, tfidf in zip(new_reviews, new_reviews_tfidf):
    pred = clf_final(tfidf)
    print("Review:", review)
    print("Predicted Sentiment:", pred)

# Comparinhg performance with different TF-IDF thresholds
def calcTfidfval_features(min_doc_freq_percent):
    num_docs = len(train_data)
    min_count = math.ceil(min_doc_freq_percent * num_docs)
    df_counter = Counter()
    for review in train_data:
        df_counter.update(set(review))
    sel_tokens_new = {token for token, count in df_counter.items() if count >= min_count}
    idf_new = {}
    for token in sel_tokens_new:
        idf_new[token] = math.log((num_docs + 1) / (df_counter[token] + 1)) + 1
    train_tfidf_new = [calcTfidfval(review, sel_tokens_new, idf_new) for review in train_data]
    test_tfidf_new = [calcTfidfval(review, sel_tokens_new, idf_new) for review in test_data]
    return sel_tokens_new, idf_new, train_tfidf_new, test_tfidf_new

for thresh in [0.05, 0.1]:
    sel_tokens_new, idf_new, train_tfidf_new, test_tfidf_new = calcTfidfval_features(thresh)
    train_tfidf_us_new, train_labels_us_new = undersample(train_tfidf_new, train_labels)
    clf_new = train_bayes_classifier(train_tfidf_us_new, train_labels_us_new)
    test_preds_new = [clf_new(x) for x in test_tfidf_new]
    acc_new = sum(1 for p, t in zip(test_preds_new, test_labels) if p == t) / len(test_labels)
    print("\nTF-IDF min_doc_freq_percent =", thresh, "-> Test Accuracy:", acc_new)

All done! Processed data saved in Tokenizer_Result.
Bigram data saved too. You're good to go!
5 Misclassified Reviews (index, predicted, true, review):
Index: 1 | Predicted: positive | True: negative | Review: do not_take film seriously rent some folks who want play mystery science 3000 will probably laugh butts off evil guys not_scary s funny s like some dude 7th grade sickle scarecrow get up acting hilarious i love occasional self torture poor horror film really had me giggling i recommend basis course recreational drugs will enhance experience oh there naked group swimming scene will allow some star dust 5 star system token black male gets injured badly continues his joking well using injured body part quite readily throughout enjoy complete utter disgrace films do_not_take not_take_film film_seriously seriously_rent rent_some some_folks folks_who who_want want_play play_mystery mystery_science science_3000 3000_will will_probably probably_laugh laugh_butts butts_off off_evil evil_g

The proposed work first evaluated the predicted sentiment labels against the true actual labels in the test data set. If the prediction did not match the true label, then the corresponding review was marked as misclassified. This process was stopped after identifying 5 misclassified reviews for manual reviews. These misclassified reviews were printed along with their true and predicted sentiment labels. They were analyzed to detect any patterns in the errors (e.g. sarcasm, ambiguous language or unseen vocabulary). Two new movie reviews were manually written one positive, and one negative, and were processed using same TF-IDF feature extraction as the test data. The trained classifier predicted the sentiment for these new reviews. The model’s performance was analyzed under different TF-IDF thresholds for minimum document frequency (0.05 and 0.1). For each threshold new feature sets were calculated, and a classifier was trained using the under sampled data. A lower TF-IDF threshold (0.05) retained more words and achieved a test accuracy of 80.03 %. This captured finer details and potentially more noise too, leading to overfitting.  The higher threshold model (0.1) achieved 73.72 % test accuracy. Fewer features in this case led to loss of information. This improved generalization but decreased accuracy. With manual evaluation the study found out where the models could be improved such as handling sarcasm, negation, long tailed words, and domain specific references.

## Task 7 : Implement an Improved Feature Selection Method 

In [9]:
# Improved feature selection by removing highly frequently occuring words

# defing the minimum and maximum percentage of doc a word should appear in
min_percent = 0.05  # min 5 %
max_percent = 0.8   # max 80 %

# calculating the minimum and maximum number of doc a word should appear in
total_docs = len(train_data)
min_count = math.ceil(min_percent * total_docs)
max_count = math.floor(max_percent * total_docs)


word_doc_count = Counter()
for review in train_data:
    word_doc_count.update(set(review))  

# Filter out words that don't meet the doc frequency threshold
filtered_words = {word for word, count in word_doc_count.items() if min_count <= count <= max_count}

print("Tokens before filtering:", len(word_doc_count))
print("Tokens after filtering:", len(filtered_words))

# Calculating the IDF 
idf_values = {}
for word in filtered_words:
    doc_count = word_doc_count[word]
    idf_values[word] = math.log((total_docs + 1) / (doc_count + 1)) + 1


train_tfidf_filtered = [calcTfidfval(review, filtered_words, idf_values) for review in train_data]
test_tfidf_filtered = [calcTfidfval(review, filtered_words, idf_values) for review in test_data]

# Training the classifier with the filtered data
train_tfidf_filtered_us, train_labels_filtered_us = undersample(train_tfidf_filtered, train_labels)
clf_filtered = train_bayes_classifier(train_tfidf_filtered_us, train_labels_filtered_us)
test_preds_filtered = [clf_filtered(x) for x in test_tfidf_filtered]
accuracy_filtered = sum(1 for p, t in zip(test_preds_filtered, test_labels) if p == t) / len(test_labels)

print("Improved Model Test Accuracy:", accuracy_filtered)

Tokens before filtering: 5251541
Tokens after filtering: 434
Improved Model Test Accuracy: 0.8003520140805632
