# Simple Sentiment Analyser

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from tqdm import tqdm
import nltk
import spacy

In [2]:
# Make sure the movie_reviews corpus is available locally
nltk.download('movie_reviews')

# spaCy English pipeline, kept in `nlp` for the embedding step further down
nlp = spacy.load("en_core_web_sm")

# Pair the raw text of every review with the category it is filed under
movie_reviews_data = [
    (movie_reviews.raw(review_id), sentiment)
    for sentiment in movie_reviews.categories()
    for review_id in movie_reviews.fileids(sentiment)
]

# Unzip the pairs into two parallel sequences: review texts and category names
texts, labels = zip(*movie_reviews_data)

# Encode the categories as integers: 'neg' -> 0, anything else -> 1
labels = [int(label != 'neg') for label in labels]

# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [3]:
# Function to calculate document embeddings using spaCy
def calculate_embeddings(texts):
    """Return one averaged spaCy vector per input text.

    Each text is run through the module-level ``nlp`` pipeline and the vectors
    of all tokens that report ``has_vector`` are averaged into a single
    document vector. A text with no such tokens (for example an empty string)
    gets an all-zero vector of the same width and dtype as the other rows,
    instead of the NaN scalar that ``np.mean`` of an empty list would produce
    (which would make the stacked result ragged).

    Note: a later cell in this notebook defines another function with the same
    name; once that cell has run, it replaces this one.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        ``np.ndarray`` with one row per text, in input order. If no text at all
        yields a vector, every row has width 0.
    """
    doc_vectors = []
    for text in tqdm(texts):
        doc = nlp(text)
        token_vectors = [token.vector for token in doc if token.has_vector]
        # None is a placeholder: the vector width is only known once some
        # document has produced a real vector.
        doc_vectors.append(np.mean(token_vectors, axis=0) if token_vectors else None)

    # Use the first real vector as the shape/dtype template for the zero rows.
    template = next(
        (vector for vector in doc_vectors if vector is not None),
        np.zeros(0, dtype=np.float32),
    )
    return np.array(
        [np.zeros_like(template) if vector is None else vector for vector in doc_vectors]
    )

# Embed both splits with the same function: training set first, then test set
X_train_embeddings, X_test_embeddings = map(calculate_embeddings, (X_train, X_test))

100%|██████████| 1600/1600 [03:05<00:00,  8.63it/s]
100%|██████████| 400/400 [00:45<00:00,  8.85it/s]


In [9]:
# Sweep the TruncatedSVD n_components hyperparameter for an SVD + SVM pipeline
# and record the accuracy obtained on the test split for each value.
#
# NOTE(review): every candidate is scored on the test split, so the best score
# found by this sweep is an optimistic estimate. Choosing n_components on a
# validation split (or by cross-validation on the training data) would give an
# unbiased number.

# (n_components, accuracy) pairs, one per value tried
components_array_accuracy = []

# NOTE(review): the upper bound presumably tracks the embedding width, since
# TruncatedSVD cannot extract more components than there are features - confirm.
for num in range(10 , 97):
    # TruncatedSVD's default solver is randomized; fixing random_state (same
    # seed as the train/test split) makes the sweep repeatable across runs.
    model = make_pipeline(TruncatedSVD(n_components=num, random_state=42), SVC())

    # Fit the model on training data
    model.fit(X_train_embeddings, y_train)

    # Make predictions on test data
    predictions = model.predict(X_test_embeddings)

    # Evaluate the accuracy
    accuracy = accuracy_score(y_test, predictions)
    components_array_accuracy.append((num , accuracy))
    print(f"n_components={num}  =>  Accuracy: {accuracy}")

n_components=10  =>  Accuracy: 0.6475
n_components=11  =>  Accuracy: 0.6525
n_components=12  =>  Accuracy: 0.6575
n_components=13  =>  Accuracy: 0.6625
n_components=14  =>  Accuracy: 0.65
n_components=15  =>  Accuracy: 0.65
n_components=16  =>  Accuracy: 0.6625
n_components=17  =>  Accuracy: 0.655
n_components=18  =>  Accuracy: 0.65
n_components=19  =>  Accuracy: 0.66
n_components=20  =>  Accuracy: 0.66
n_components=21  =>  Accuracy: 0.655
n_components=22  =>  Accuracy: 0.6575
n_components=23  =>  Accuracy: 0.6625
n_components=24  =>  Accuracy: 0.665
n_components=25  =>  Accuracy: 0.6675
n_components=26  =>  Accuracy: 0.6675
n_components=27  =>  Accuracy: 0.6675
n_components=28  =>  Accuracy: 0.6675
n_components=29  =>  Accuracy: 0.675
n_components=30  =>  Accuracy: 0.6825
n_components=31  =>  Accuracy: 0.685
n_components=32  =>  Accuracy: 0.68
n_components=33  =>  Accuracy: 0.685
n_components=34  =>  Accuracy: 0.68
n_components=35  =>  Accuracy: 0.67
n_components=36  =>  Accuracy: 0.6

In [12]:
# Position of the highest accuracy in the sweep (np.argmax keeps the first one on ties)
inx = np.argmax([score for _, score in components_array_accuracy])

# Read back the n_components value and the accuracy stored at that position
the_best_index_of_component = components_array_accuracy[inx][0]
max_accuracy = components_array_accuracy[inx][1]
print(f"the_best_index_of_component: {the_best_index_of_component} => Maximum_Accuracy: {max_accuracy}")

the_best_index_of_component: 72 => Maximum_Accuracy: 0.7


# A more accurate approach: BERT embeddings

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from transformers import AutoTokenizer, AutoModel
import torch

In [7]:
# Name of the pretrained checkpoint shared by the tokenizer and the encoder
BERT_CHECKPOINT = "bert-base-uncased"

# Tokenizer first, then the encoder itself
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [13]:
def calculate_embeddings(texts, batch_size=1):
    """Embed each text as the mean of its BERT last-layer token states.

    Texts are tokenized with the module-level ``tokenizer`` and encoded with
    the module-level ``bert_model`` under ``torch.no_grad()``. For every text,
    the rows of ``last_hidden_state`` at positions where ``attention_mask`` is
    set are averaged, so pad positions added when several texts share a batch
    do not affect the result. With the default ``batch_size=1`` each text gets
    its own forward pass, as before.

    ``truncation=True`` is passed to the tokenizer, so text beyond the
    tokenizer's maximum length is dropped before encoding
    (NOTE(review): presumably 512 tokens for bert-base-uncased - confirm).

    This definition reuses the name of the spaCy-based function defined in an
    earlier cell and replaces it once this cell has run.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts encoded per forward pass; must be >= 1.

    Returns:
        ``np.ndarray`` with one row per text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise so any iterable can be sliced into batches
    texts = list(texts)
    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]

        # Tokenize the batch; padding=True pads to the longest text in it
        tokens = tokenizer(batch, return_tensors="pt", truncation=True, padding=True)

        # Get BERT model output
        with torch.no_grad():
            model_output = bert_model(**tokens)

        hidden_states = model_output['last_hidden_state'].numpy()
        keep = tokens['attention_mask'].numpy().astype(bool)

        # Mean pooling over the real (unpadded) tokens of each text
        for text_states, text_keep in zip(hidden_states, keep):
            embeddings.append(np.mean(text_states[text_keep], axis=0))

    return np.array(embeddings)


# Embed both splits with the same function: training set first, then test set
X_train_embeddings, X_test_embeddings = map(calculate_embeddings, (X_train, X_test))

# Report the shape of each embedding matrix
print(f"X_train_embeddings= {X_train_embeddings.shape} , X_test_embeddings= {X_test_embeddings.shape}")



100%|██████████| 1600/1600 [48:15<00:00,  1.81s/it]
100%|██████████| 400/400 [10:46<00:00,  1.62s/it]

X_train_embeddings= (1600, 768) , X_test_embeddings= (400, 768)





In [15]:
# Sweep the TruncatedSVD n_components hyperparameter for an SVD + SVM pipeline
# on the BERT embeddings and record the test-split accuracy for each value.
#
# NOTE(review): every candidate is scored on the test split, so the best score
# found by this sweep is an optimistic estimate. Choosing n_components on a
# validation split (or by cross-validation on the training data) would give an
# unbiased number.

# (n_components, accuracy) pairs, one per value tried
bert_accuracy = []

for num in range(10 , 257):
    # TruncatedSVD's default solver is randomized; fixing random_state (same
    # seed as the train/test split) makes the sweep repeatable across runs.
    model = make_pipeline(TruncatedSVD(n_components=num, random_state=42), SVC())

    # Fit the model on training data
    model.fit(X_train_embeddings, y_train)

    # Make predictions on test data
    predictions = model.predict(X_test_embeddings)

    # Evaluate the accuracy
    accuracy = accuracy_score(y_test, predictions)
    bert_accuracy.append((num , accuracy))
    print(f"n_components= {num}  =>  Accuracy: {accuracy}")

n_components= 10  =>  Accuracy: 0.6475
n_components= 11  =>  Accuracy: 0.64
n_components= 12  =>  Accuracy: 0.655
n_components= 13  =>  Accuracy: 0.645
n_components= 14  =>  Accuracy: 0.645
n_components= 15  =>  Accuracy: 0.6525
n_components= 16  =>  Accuracy: 0.665
n_components= 17  =>  Accuracy: 0.6625
n_components= 18  =>  Accuracy: 0.6775
n_components= 19  =>  Accuracy: 0.68
n_components= 20  =>  Accuracy: 0.6925
n_components= 21  =>  Accuracy: 0.7075
n_components= 22  =>  Accuracy: 0.72
n_components= 23  =>  Accuracy: 0.725
n_components= 24  =>  Accuracy: 0.7375
n_components= 25  =>  Accuracy: 0.7325
n_components= 26  =>  Accuracy: 0.7475
n_components= 27  =>  Accuracy: 0.745
n_components= 28  =>  Accuracy: 0.7425
n_components= 29  =>  Accuracy: 0.77
n_components= 30  =>  Accuracy: 0.765
n_components= 31  =>  Accuracy: 0.7675
n_components= 32  =>  Accuracy: 0.765
n_components= 33  =>  Accuracy: 0.765
n_components= 34  =>  Accuracy: 0.76
n_components= 35  =>  Accuracy: 0.7725
n_com

In [16]:
# Position of the highest accuracy in the BERT sweep (np.argmax keeps the first one on ties)
inx = np.argmax([score for _, score in bert_accuracy])

# Read back the n_components value and the accuracy stored at that position
the_best_index_of_component = bert_accuracy[inx][0]
max_accuracy = bert_accuracy[inx][1]

print(f"the_best_index_of_component: {the_best_index_of_component} => Maximum_Accuracy: {max_accuracy}")


the_best_index_of_component: 87 => Maximum_Accuracy: 0.8075
