# Simple Sentiment Analyser

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from tqdm import tqdm
import nltk
import spacy

In [6]:
# Make sure the movie_reviews corpus is available to NLTK
nltk.download('movie_reviews')

# English spaCy pipeline; used further down to turn reviews into vectors
nlp = spacy.load("en_core_web_sm")

# One (raw review text, category) pair per file in the corpus, grouped by category
movie_reviews_data = [
    (movie_reviews.raw(fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Unzip the pairs into parallel sequences of texts and category names
texts, labels = zip(*movie_reviews_data)

# Encode the categories as integers: 'neg' -> 0, anything else -> 1
labels = [int(label != 'neg') for label in labels]

# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [7]:
# Function to calculate document embeddings using spaCy
def calculate_embeddings(texts):
    """Turn each text into one vector by averaging its spaCy token vectors.

    Every text is run through the module-level ``nlp`` pipeline, and the vectors
    of all tokens reporting ``has_vector`` are averaged element-wise.

    A text with no such tokens (for example an empty string) has nothing to
    average. It is represented by an all-zero vector with the same shape and
    dtype as the other embeddings, so the result stays a regular numeric array
    instead of becoming ragged or containing NaN.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        ``np.ndarray`` with one row per input text, in input order. If no text
        at all produced a vector, the rows have length 0.
    """
    embeddings = []
    for text in tqdm(texts):
        doc = nlp(text)  # run the spaCy pipeline to obtain a Doc of tokens
        token_vectors = [token.vector for token in doc if token.has_vector]
        # None marks "nothing to average"; it is replaced once the vector size is known
        embeddings.append(np.mean(token_vectors, axis=0) if token_vectors else None)

    # Use any successfully computed embedding as the template for the zero fallback
    reference = next((vec for vec in embeddings if vec is not None), np.zeros(0))
    filler = np.zeros_like(reference)
    return np.array([filler if vec is None else vec for vec in embeddings])

# Embed the training and test reviews with calculate_embeddings.
# The two calls are kept as separate statements so that the training embeddings
# remain available in the kernel even if the second call fails.
X_train_embeddings = calculate_embeddings(X_train)
X_test_embeddings = calculate_embeddings(X_test)

100%|██████████| 1600/1600 [03:02<00:00,  8.75it/s]
100%|██████████| 400/400 [00:43<00:00,  9.29it/s]


In [8]:
# Carve a validation set out of the training embeddings so that the test set
# plays no part in choosing n_components.
X_train_tune, X_val, y_train_tune, y_val = train_test_split(X_train_embeddings, y_train, test_size=0.2, random_state=42)

# Candidate numbers of SVD components to try
n_components_values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95]

best_accuracy = 0
best_n_components = None

# Grid search: fit an SVD + SVC pipeline per candidate and score it on the validation set
for n_components in n_components_values:
    # TruncatedSVD uses a randomized solver by default; seeding it (with the same
    # seed as the data splits) makes the search repeatable from run to run.
    model = make_pipeline(TruncatedSVD(n_components=n_components, random_state=42), SVC())

    # Fit on the tuning subset only
    model.fit(X_train_tune, y_train_tune)

    # Score on the held-out validation subset
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)

    print(f"Accuracy for n_components={n_components}: {accuracy}")

    # Strict '>' means that among tied candidates the first (smallest) one is kept
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_n_components = n_components

# Report the winning setting
print(f"Best n_components: {best_n_components}, Best Accuracy: {best_accuracy}")

# Refit with the selected n_components on the full training set (tuning + validation parts)
model = make_pipeline(TruncatedSVD(n_components=best_n_components, random_state=42), SVC())
model.fit(X_train_embeddings, y_train)

# Final evaluation on the untouched test set
predictions = model.predict(X_test_embeddings)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy for n_components=10: 0.64375
Accuracy for n_components=20: 0.65625
Accuracy for n_components=30: 0.68125
Accuracy for n_components=40: 0.678125
Accuracy for n_components=50: 0.675
Accuracy for n_components=60: 0.66875
Accuracy for n_components=70: 0.68125
Accuracy for n_components=80: 0.68125
Accuracy for n_components=90: 0.678125
Accuracy for n_components=95: 0.678125
Best n_components: 30, Best Accuracy: 0.68125
Accuracy: 0.6825


# A more accurate version: BERT embeddings

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_model = AutoModel.from_pretrained(checkpoint)

In [11]:
def calculate_embeddings(texts):
    """Embed each text with BERT and collect the results in one NumPy array.

    Each text is tokenized on its own by the module-level ``tokenizer`` (with
    truncation enabled), passed through ``bert_model`` with gradient tracking
    switched off, and reduced to a single vector by mean pooling: the last
    hidden state is averaged along dimension 1, giving a fixed-size
    representation regardless of the input length.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        ``np.ndarray`` built from the per-text embedding vectors, in input order.
    """
    def embed(text):
        # Convert the text into the tensor inputs the BERT model expects
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Inference only, so no gradients are needed
        with torch.no_grad():
            output = bert_model(**encoded)

        # Mean pooling over dimension 1, then drop the size-1 dimensions
        pooled = output.last_hidden_state.mean(dim=1)
        return pooled.squeeze().numpy()

    return np.array([embed(text) for text in tqdm(texts)])


# Embed the training and test reviews with the BERT-based calculate_embeddings.
# This overwrites X_train_embeddings / X_test_embeddings from the spaCy section.
# The calls stay as separate statements so the (slow) training embeddings are
# kept in the kernel even if the second call fails.
X_train_embeddings = calculate_embeddings(X_train)
X_test_embeddings = calculate_embeddings(X_test)

100%|██████████| 1600/1600 [29:20<00:00,  1.10s/it]
100%|██████████| 400/400 [07:15<00:00,  1.09s/it]


In [12]:
# Carve a validation set out of the training embeddings so that the test set
# plays no part in choosing n_components.
X_train_tune, X_val, y_train_tune, y_val = train_test_split(X_train_embeddings, y_train, test_size=0.2, random_state=42)

# Candidate numbers of SVD components to try.
# NOTE(review): the recorded run selected the largest candidate (95), so the
# best value may lie beyond this grid - consider widening it.
n_components_values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95]

best_accuracy = 0
best_n_components = None

# Grid search: fit an SVD + SVC pipeline per candidate and score it on the validation set
for n_components in n_components_values:
    # TruncatedSVD uses a randomized solver by default; seeding it (with the same
    # seed as the data splits) makes the search repeatable from run to run.
    model = make_pipeline(TruncatedSVD(n_components=n_components, random_state=42), SVC())

    # Fit on the tuning subset only
    model.fit(X_train_tune, y_train_tune)

    # Score on the held-out validation subset
    predictions = model.predict(X_val)
    accuracy = accuracy_score(y_val, predictions)

    print(f"Accuracy for n_components={n_components}: {accuracy}")

    # Strict '>' means that among tied candidates the first (smallest) one is kept
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_n_components = n_components

# Report the winning setting
print(f"Best n_components: {best_n_components}, Best Accuracy: {best_accuracy}")

# Refit with the selected n_components on the full training set (tuning + validation parts)
model = make_pipeline(TruncatedSVD(n_components=best_n_components, random_state=42), SVC())
model.fit(X_train_embeddings, y_train)

# Final evaluation on the untouched test set
predictions = model.predict(X_test_embeddings)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy for n_components=10: 0.65625
Accuracy for n_components=20: 0.7375
Accuracy for n_components=30: 0.753125
Accuracy for n_components=40: 0.78125
Accuracy for n_components=50: 0.775
Accuracy for n_components=60: 0.775
Accuracy for n_components=70: 0.775
Accuracy for n_components=80: 0.775
Accuracy for n_components=90: 0.78125
Accuracy for n_components=95: 0.784375
Best n_components: 95, Best Accuracy: 0.784375
Accuracy: 0.7975
