In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jsana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jsana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jsana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load dataset
df = pd.read_csv("modified_dataset.csv")  

# 1. Convert to Lowercase
df['review'] = df['review'].astype(str).str.lower()

# 2. Remove URLs
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)
df['review'] = df['review'].apply(remove_urls)

# 3. Remove HTML Tags
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)
df['review'] = df['review'].apply(remove_html_tags)

# 4. Remove Punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))
df['review'] = df['review'].apply(remove_punctuation)

# 5. Handle Contractions (Optional Improvement)
from contractions import fix
df['review'] = df['review'].apply(fix)  # Example: "don't" → "do not"

# 6. Remove Stopwords (But Keep Negation Words)
stop_words = set(stopwords.words('english'))
negation_words = {"not", "no", "never", "none", "nothing", "nowhere", "neither", "hardly", "barely"}

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words or word in negation_words]
    return ' '.join(filtered_words)

df['review'] = df['review'].apply(remove_stopwords)

# 7. Tokenization
df['tokenized_review'] = df['review'].apply(word_tokenize)

# 8. Lemmatization (Preferred over Stemming)
lemmatizer = WordNetLemmatizer()
def lemmatize_words(tokens):
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens])
df['lemmatized_review'] = df['tokenized_review'].apply(lemmatize_words)

# 9. Drop duplicates & missing values (Final cleanup)
df.drop_duplicates(subset=['lemmatized_review'], inplace=True)
df.dropna(subset=['lemmatized_review'], inplace=True)

# Save cleaned dataset
df.to_csv("cleaned_dataset.csv", index=False)

In [3]:
df.head()

Unnamed: 0,review,sentiment,tokenized_review,lemmatized_review
0,wow loved place,positive,"[wow, loved, place]",wow loved place
1,crust not good,negative,"[crust, not, good]",crust not good
2,not tasty texture nasty,negative,"[not, tasty, texture, nasty]",not tasty texture nasty
3,stopped late may bank holiday rick steve recom...,positive,"[stopped, late, may, bank, holiday, rick, stev...",stopped late may bank holiday rick steve recom...
4,selection menu great prices,positive,"[selection, menu, great, prices]",selection menu great price


In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("cleaned_dataset.csv") 

# Convert labels to numerical format (assuming binary classification: positive/negative)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})  # Modify if more classes exist

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))  # Includes unigrams & bigrams
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make Predictions
y_pred = model.predict(X_test_tfidf)

# Save predictions to a CSV file
predictions_df = pd.DataFrame({'review': X_test, 'actual_sentiment': y_test, 'predicted_sentiment': y_pred})
predictions_df.to_csv("tf_idf_predictions.csv", index=False)

# Analyze Misclassified Data
misclassified_df = predictions_df[predictions_df['actual_sentiment'] != predictions_df['predicted_sentiment']].copy()

# Identify possible misclassification reasons
def analyze_misclassification(review):
    if re.search(r'\b(not|never|no|none|hardly|barely|scarcely)\b', review):
        return "Possible Negation Issue"
    elif len(review.split()) < 5:
        return "Too Short for Meaningful Classification"
    elif re.search(r'(:-\)|:-\()', review):  # Detect emoticons
        return "Possible Sarcasm"
    else:
        return "Unclear"

misclassified_df['misclassification_reason'] = misclassified_df['review'].apply(analyze_misclassification)

# Save misclassified reviews
misclassified_df.to_csv("tf_idf_misclassified_data.csv", index=False)

# Print classification report
print("Model Performance:")
print(classification_report(y_test, y_pred))


Model Performance:
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       100
           1       0.86      0.78      0.82        98

    accuracy                           0.83       198
   macro avg       0.83      0.83      0.83       198
weighted avg       0.83      0.83      0.83       198



In [5]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK tokenizer
nltk.download('punkt')

# Load dataset
df = pd.read_csv("cleaned_dataset.csv")  # Ensure preprocessed dataset is used

# 1. Tokenization
df['tokenized_review'] = df['review'].apply(word_tokenize)

# 2. Train Word2Vec Model
word2vec_model = Word2Vec(sentences=df['tokenized_review'], vector_size=100, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv

# 3. One-Hot Encode Words (Fixed)
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")  # ✅ Fix applied
unique_words = list(word_vectors.index_to_key)  # Unique words from Word2Vec vocabulary
word_array = np.array(unique_words).reshape(-1, 1)
encoder.fit(word_array)

# 4. Convert Reviews into Word2Vec Averages
def review_to_vector(tokens):
    vectors = [word_vectors[word] for word in tokens if word in word_vectors]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)  # Handle empty cases

df['review_vector'] = df['tokenized_review'].apply(review_to_vector)

# 5. Prepare Data
X = np.vstack(df['review_vector'])
y = df['sentiment']

# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 7. Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# 8. Make Predictions
y_pred = model.predict(X_test)

# 9. Save Predictions
df_test = df.iloc[y_test.index].copy()
df_test['predicted_sentiment'] = y_pred
df_test.to_csv("word2vec_predictions.csv", index=False)

# 10. Identify Misclassified Samples
misclassified = df_test[df_test['sentiment'] != df_test['predicted_sentiment']].copy()

# Function to determine misclassification reasons
def get_misclassification_reason(row):
    true_sentiment = row['sentiment']
    predicted_sentiment = row['predicted_sentiment']
    review_text = row['review']

    if true_sentiment == 'positive' and predicted_sentiment == 'negative':
        if "not" in review_text or "but" in review_text:
            return "Misinterpreted contrast or negation."
        elif any(word in review_text for word in ['sarcasm', 'joke', 'irony']):
            return "Possible sarcasm detection failure."
        else:
            return "Failed to recognize positive sentiment words."

    elif true_sentiment == 'negative' and predicted_sentiment == 'positive':
        if any(word in review_text for word in ['disappointed', 'worst', 'awful']):
            return "Negative sentiment possibly diluted by surrounding words."
        elif len(review_text.split()) < 5:
            return "Short negative review misclassified."
        else:
            return "Failed to capture intensity of negative sentiment."

    return "Unclear misclassification reason."

# Apply the function
misclassified['reason'] = misclassified.apply(get_misclassification_reason, axis=1)

# 11. Save Misclassified Data
misclassified.to_csv("misclassified_word2vec.csv", index=False)

# 12. Print Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jsana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.5202020202020202
              precision    recall  f1-score   support

    negative       0.70      0.07      0.13        99
    positive       0.51      0.97      0.67        99

    accuracy                           0.52       198
   macro avg       0.61      0.52      0.40       198
weighted avg       0.61      0.52      0.40       198



In [6]:
import numpy as np

# Path to the downloaded GloVe file
glove_path = "glove.6B.100d.txt"

# Load GloVe embeddings
glove_embeddings = {}

with open(glove_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]  # First value is the word
        vector = np.array(values[1:], dtype="float32")  # Remaining values are vector components
        glove_embeddings[word] = vector

print("GloVe embeddings loaded successfully!")


GloVe embeddings loaded successfully!


In [7]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download tokenizer
nltk.download("punkt")

# Load dataset
df = pd.read_csv("cleaned_dataset.csv")  # Make sure this file is preprocessed
df["tokenized_review"] = df["review"].apply(word_tokenize)

# Function to convert a review into a GloVe vector (average of word embeddings)
def review_to_glove_vector(tokens):
    vectors = [glove_embeddings[word] for word in tokens if word in glove_embeddings]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)  # Handle empty cases

# Convert each review to a vector
df["review_vector"] = df["tokenized_review"].apply(review_to_glove_vector)

# Prepare Data for Model
X = np.vstack(df["review_vector"])
y = df["sentiment"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)

# Save Predictions
df_test = df.iloc[y_test.index].copy()
df_test["predicted_sentiment"] = y_pred
df_test.to_csv("glove_predictions.csv", index=False)

# Identify Misclassified Samples
misclassified = df_test[df_test["sentiment"] != df_test["predicted_sentiment"]]
misclassified["reason"] = np.where(
    misclassified["sentiment"] == "positive",
    "Misclassified positive review: Model failed to detect positive sentiment.",
    "Misclassified negative review: Model failed to detect negative sentiment."
)

# Save Misclassified Data
misclassified.to_csv("misclassified_glove.csv", index=False)

# Print Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jsana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.8080808080808081
              precision    recall  f1-score   support

    negative       0.84      0.77      0.80        99
    positive       0.79      0.85      0.82        99

    accuracy                           0.81       198
   macro avg       0.81      0.81      0.81       198
weighted avg       0.81      0.81      0.81       198



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclassified["reason"] = np.where(


In [8]:
import urllib.request
import os

# Define URL and filenames
fasttext_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz"
fasttext_file = "cc.en.300.vec"

# Download FastText embeddings if not already present
if not os.path.exists(fasttext_file):
    print("Downloading FastText embeddings... (This may take a while)")
    urllib.request.urlretrieve(fasttext_url, fasttext_file + ".gz")
    print("Download complete!")

    # Extract the file
    import gzip
    import shutil

    with gzip.open(fasttext_file + ".gz", "rb") as f_in, open(fasttext_file, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    
    print("Extraction complete!")

print("FastText embeddings ready for use!")


FastText embeddings ready for use!


In [9]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download tokenizer
nltk.download("punkt")

# Load dataset
df = pd.read_csv("cleaned_dataset.csv")  # Make sure this file is preprocessed
df["tokenized_review"] = df["review"].apply(word_tokenize)

# Load FastText Embeddings
fasttext_embeddings = {}

print("Loading FastText embeddings... This may take a while.")
with open("cc.en.300.vec", "r", encoding="utf-8") as f:
    next(f)  # Skip first line (metadata)
    for line in f:
        values = line.strip().split()
        word = values[0]  # First value is the word
        vector = np.array(values[1:], dtype="float32")  # Remaining values are vector components
        fasttext_embeddings[word] = vector
print("FastText embeddings loaded successfully!")

# Function to convert a review into a FastText vector (average of word embeddings)
def review_to_fasttext_vector(tokens):
    vectors = [fasttext_embeddings[word] for word in tokens if word in fasttext_embeddings]
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)  # Handle empty cases

# Convert each review to a vector
df["review_vector"] = df["tokenized_review"].apply(review_to_fasttext_vector)

# Prepare Data for Model
X = np.vstack(df["review_vector"])
y = df["sentiment"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)

# Save Predictions
df_test = df.iloc[y_test.index].copy()
df_test["predicted_sentiment"] = y_pred
df_test.to_csv("fasttext_predictions.csv", index=False)

# Identify Misclassified Samples
misclassified = df_test[df_test["sentiment"] != df_test["predicted_sentiment"]]
misclassified["reason"] = np.where(
    misclassified["sentiment"] == "positive",
    "Misclassified positive review: Model failed to detect positivity, possibly due to sarcasm or complex phrasing.",
    "Misclassified negative review: Model failed to capture negativity, likely due to subtle sentiment or domain-specific words."
)

# Save Misclassified Data
misclassified.to_csv("misclassified_fasttext.csv", index=False)

# Print Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jsana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading FastText embeddings... This may take a while.
FastText embeddings loaded successfully!
Accuracy: 0.8484848484848485
              precision    recall  f1-score   support

    negative       0.86      0.84      0.85        99
    positive       0.84      0.86      0.85        99

    accuracy                           0.85       198
   macro avg       0.85      0.85      0.85       198
weighted avg       0.85      0.85      0.85       198



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclassified["reason"] = np.where(


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("modified_dataset.csv")

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class SentimentDataset(Dataset):
    def __init__(self, reviews, sentiments, tokenizer, max_len=128):
        self.reviews = reviews
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.reviews[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(self.sentiments[idx], dtype=torch.long)
        }

# Prepare dataset
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0})
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2, random_state=42)
train_dataset = SentimentDataset(X_train.tolist(), y_train.tolist(), tokenizer)
test_dataset = SentimentDataset(X_test.tolist(), y_test.tolist(), tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define Model
class BERT_NN(nn.Module):
    def __init__(self, bert_model):
        super(BERT_NN, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(768, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(256, 2)  # Output layer for classification

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        x = self.fc(cls_embedding)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.out(x)
        return x, cls_embedding

# Load Pretrained BERT
bert_model = BertModel.from_pretrained("bert-base-uncased")
model = BERT_NN(bert_model).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Training
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, train_loader, criterion, optimizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["label"].to(device)
            optimizer.zero_grad()
            outputs, _ = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

train_model(model, train_loader, criterion, optimizer)

# Extract BERT Embeddings for SVM
model.eval()
all_embeddings = []
all_labels = []
with torch.no_grad():
    for batch in train_loader:
        input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["label"].to(device)
        _, cls_embeddings = model(input_ids, attention_mask)
        all_embeddings.append(cls_embeddings.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

X_train_svm = np.concatenate(all_embeddings, axis=0)
y_train_svm = np.concatenate(all_labels, axis=0)

# Train SVM
svm = SVC(kernel='linear')
svm.fit(X_train_svm, y_train_svm)

# Evaluate SVM
all_embeddings = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch["input_ids"].to(device), batch["attention_mask"].to(device), batch["label"].to(device)
        _, cls_embeddings = model(input_ids, attention_mask)
        all_embeddings.append(cls_embeddings.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

X_test_svm = np.concatenate(all_embeddings, axis=0)
y_test_svm = np.concatenate(all_labels, axis=0)
y_pred_svm = svm.predict(X_test_svm)
accuracy = accuracy_score(y_test_svm, y_pred_svm)
print(f"SVM Accuracy: {accuracy:.4f}")

  from .autonotebook import tqdm as notebook_tqdm


Epoch 1, Loss: 0.4980
Epoch 2, Loss: 0.1467
Epoch 3, Loss: 0.0528
SVM Accuracy: 0.9750


In [11]:
# Save predicted and misclassified reviews
df_test = pd.DataFrame({"review": X_test.tolist(), "actual": y_test_svm, "predicted": y_pred_svm})
df_test.to_csv("bert_svm_predictions.csv", index=False)

misclassified = df_test[df_test["actual"] != df_test["predicted"]]
misclassified.to_csv("bert_svm_misclassified_reviews.csv", index=False)