# Simple Sentiment Analyser

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from tqdm import tqdm
import nltk
import spacy

In [None]:
# Make sure the NLTK movie_reviews corpus is available locally
nltk.download('movie_reviews')

# spaCy English pipeline, used further down to vectorise the reviews
nlp = spacy.load("en_core_web_sm")

# Pair the raw text of every review with the category it is filed under
movie_reviews_data = [
    (movie_reviews.raw(review_id), sentiment)
    for sentiment in movie_reviews.categories()
    for review_id in movie_reviews.fileids(sentiment)
]

# Unzip the (text, category) pairs into two parallel sequences
texts, labels = zip(*movie_reviews_data)
# Binary targets: 'pos' -> 1, any other category -> 0
labels = [int(sentiment == 'pos') for sentiment in labels]

# Hold out 20% of the reviews for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [None]:
def calculate_embeddings(texts):
    """Embed each text as the mean of its spaCy token vectors.

    Every text is run through the module-level ``nlp`` pipeline and the
    per-token ``token.vector`` values are averaged into one document vector.

    A text that yields no tokens (e.g. an empty string) has nothing to
    average: ``np.mean`` of an empty list returns a scalar NaN, which would
    not line up with the vectors of the other documents. Such texts are
    given an all-zero vector of the same shape and dtype as the other
    documents instead.

    Parameters
    ----------
    texts : iterable of str
        Documents to embed; iterated once, wrapped in a ``tqdm`` progress bar.

    Returns
    -------
    numpy.ndarray
        One row per input text, in input order. If *every* text is
        token-less the vector width cannot be inferred and the result has
        shape ``(len(texts), 0)``.
    """
    embeddings = []
    for text in tqdm(texts):
        doc = nlp(text)
        if len(doc) > 0:
            embeddings.append(np.mean([token.vector for token in doc], axis=0))
        else:
            # Placeholder; replaced below once the vector width is known
            embeddings.append(None)

    # Use the first real vector as a template for shape/dtype of the filler
    reference = next((vec for vec in embeddings if vec is not None), None)
    if reference is None:
        fill = np.zeros(0, dtype=np.float32)
    else:
        fill = np.zeros_like(reference)

    return np.array([fill if vec is None else vec for vec in embeddings])

# Embed the training split first, then the test split, with the same function
X_train_embeddings, X_test_embeddings = map(calculate_embeddings, (X_train, X_test))

100%|██████████| 1600/1600 [02:54<00:00,  9.18it/s]
100%|██████████| 400/400 [00:54<00:00,  7.30it/s]


In [None]:
# Sweep n_components for a TruncatedSVD -> SVC pipeline and report accuracy.
# NOTE: each setting is scored on the held-out test split, so picking the
# "best" n_components from these numbers gives an optimistic estimate.
for n in [5, 10, 20, 50, 96]:
    # TruncatedSVD's default solver is randomized; seed it (same seed as the
    # train/test split) so repeated runs print the same accuracies.
    model = make_pipeline(TruncatedSVD(n_components=n, random_state=42), SVC())

    # Fit the model on training data
    model.fit(X_train_embeddings, y_train)

    # Make predictions on test data
    predictions = model.predict(X_test_embeddings)

    # Evaluate the accuracy
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy} with n_components: {n}")

Accuracy: 0.585 with n_components: 5
Accuracy: 0.6475 with n_components: 10
Accuracy: 0.66 with n_components: 20
Accuracy: 0.68 with n_components: 50
Accuracy: 0.6925 with n_components: 96


# A more accurate approach: BERT embeddings

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Tokenizer and encoder both come from the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Instantiate the encoder first, then place it on the selected device
bert_model = AutoModel.from_pretrained("bert-base-uncased")
bert_model = bert_model.to(device)

In [None]:
def calculate_embeddings(texts):
    """Embed each text as the mean of BERT's last-layer token states.

    Texts are processed one at a time: tokenized with the module-level
    ``tokenizer``, passed through ``bert_model`` on ``device`` with gradients
    disabled, and mean-pooled over the token axis of ``last_hidden_state``.

    Note: this definition reuses the name of the earlier spaCy-based
    ``calculate_embeddings`` and replaces it from this cell onwards.

    Parameters
    ----------
    texts : iterable of str
        Documents to embed; iterated once, wrapped in a ``tqdm`` progress bar.

    Returns
    -------
    numpy.ndarray
        One entry per input text, in input order. Each entry keeps the
        leading batch axis of the model output (size 1, since one text is
        encoded per call), so callers are expected to flatten that axis.
    """
    embeddings = []
    for text in tqdm(texts):
        # Tokenize a single text; truncation=True lets the tokenizer cut
        # reviews that are too long for the model.
        tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

        # Forward pass only; no gradients are needed for feature extraction
        with torch.no_grad():
            model_output = bert_model(**tokens)

        # Mean-pool over the token axis to get one vector per text
        doc_vector = model_output.last_hidden_state.mean(dim=1)

        # Convert to NumPy explicitly instead of relying on np.array() to
        # coerce a list of torch tensors when building the result.
        embeddings.append(doc_vector.detach().cpu().numpy())

    return np.array(embeddings)


# Embed the training split first, then the test split, with the same function
X_train_embeddings, X_test_embeddings = map(calculate_embeddings, (X_train, X_test))

100%|██████████| 1600/1600 [01:01<00:00, 26.20it/s]
100%|██████████| 400/400 [00:14<00:00, 27.67it/s]


In [None]:
# Collapse the per-text batch axis so each split is a 2-D (samples, features)
# matrix. An explicit reshape is used instead of a bare .squeeze(): squeeze()
# removes *every* singleton axis, so a split holding exactly one sample would
# lose its sample axis. The reshape is also a no-op on already-2-D input, so
# re-running this cell is safe.
X_train_embeddings = X_train_embeddings.reshape(
    X_train_embeddings.shape[0], X_train_embeddings.shape[-1]
)
X_test_embeddings = X_test_embeddings.reshape(
    X_test_embeddings.shape[0], X_test_embeddings.shape[-1]
)

In [None]:
# Sweep n_components for a TruncatedSVD -> SVC pipeline and report accuracy.
# NOTE: each setting is scored on the held-out test split, so picking the
# "best" n_components from these numbers gives an optimistic estimate.
for n in [50, 100, 200, 300, 400, 500, 600, 700, 768]:
    # TruncatedSVD's default solver is randomized; seed it (same seed as the
    # train/test split) so repeated runs print the same accuracies.
    model = make_pipeline(TruncatedSVD(n_components=n, random_state=42), SVC())

    # Fit the model on training data
    model.fit(X_train_embeddings, y_train)

    # Make predictions on test data
    predictions = model.predict(X_test_embeddings)

    # Evaluate the accuracy
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy} with n_components: {n}")

Accuracy: 0.7775 with n_components: 50
Accuracy: 0.8 with n_components: 100
Accuracy: 0.7975 with n_components: 200
Accuracy: 0.8 with n_components: 300
Accuracy: 0.8025 with n_components: 400
Accuracy: 0.8 with n_components: 500
Accuracy: 0.8 with n_components: 600
Accuracy: 0.8 with n_components: 700
Accuracy: 0.8 with n_components: 768
