# Sentiment Analysis for Movie Reviews Using spaCy

## Preprocess and Clean Text Data

### Tokenization

In [1]:
import spacy

text = """Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.
"""

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
token_list = [token for token in doc]
len(token_list), token_list[:10]

(66, [Dave, watched, as, the, forest, burned, up, on, the, hill])

### Stop Words Removal

In [2]:
filtered_tokens = [token for token in doc if not token.is_stop]
len(filtered_tokens), filtered_tokens[:10]

(33, [Dave, watched, forest, burned, hill, ,, , miles, house, .])

### Normalization

In [3]:
lemmas = [
    f"Token: {token}, lemma: {token.lemma_}"
    for token in filtered_tokens
]
lemmas[:10]

['Token: Dave, lemma: Dave',
 'Token: watched, lemma: watch',
 'Token: forest, lemma: forest',
 'Token: burned, lemma: burn',
 'Token: hill, lemma: hill',
 'Token: ,, lemma: ,',
 'Token: \n, lemma: \n',
 'Token: miles, lemma: mile',
 'Token: house, lemma: house',
 'Token: ., lemma: .']

**Note**: Notice the underscore on the .lemma_ attribute. That’s not a typo. It’s a convention in spaCy that gets the human-readable version of the attribute.

### Vectorization

In [4]:
filtered_tokens[1].vector

array([ 0.20839304,  0.5856737 , -0.3835841 , -0.3657956 , -0.5178656 ,
       -0.34037828, -1.2620096 , -0.91556275,  0.21835369, -0.16563462,
        0.20612994,  0.05726175, -0.21191174, -0.92969424, -0.4303094 ,
        0.51758033, -0.889028  ,  0.07933406,  0.68066394,  0.89442044,
       -0.6108595 , -0.277177  , -0.11096446, -0.3628238 , -0.94987136,
       -0.21182631, -0.14455077,  1.0258864 , -0.81838334, -0.60359097,
       -0.8117833 , -0.7980415 ,  0.01387411,  0.6208161 , -0.38254207,
       -0.05336958, -0.2067923 , -0.43317574,  0.12423342, -1.3077857 ,
        1.9532588 ,  0.16160944,  0.62362003, -0.7017921 ,  0.03761351,
       -0.41422376,  0.2448186 , -0.5293729 ,  1.0770528 ,  0.9272019 ,
       -1.1789931 ,  0.41565514, -0.5178951 , -0.87256116,  0.5209959 ,
        1.0314604 ,  2.2510653 ,  0.64014685, -0.15179029, -0.11628479,
        0.20546085, -0.31134105, -0.5156218 ,  1.4576002 ,  0.74580956,
       -0.5377724 , -0.4383081 , -1.0885042 ,  0.01137999,  1.26

## Sentiment Prediction with a Machine Learning Classifier

### How Classification Works

1. Split your data into training and evaluation sets.
2. Select a model architecture.
3. Use training data to train your model.
4. Use test data to evaluate the performance of your model.
5. Use your trained model on new data to generate predictions, which in this case will be a number between -1.0 and 1.0.

### How to Use spaCy for Classification

1. Add the `textcat` component to the existing pipeline.
2. Add valid labels to the `textcat` component.
3. Load, shuffle, and split your data.
4. Train the model, evaluating on each training loop.
5. Use the trained model to predict the sentiment of non-training data.
6. Optionally, save the trained model.

## Building an NLP Sentiment Analyzer

In [5]:
import os
import random

import spacy
from spacy.training import Example
from spacy.util import compounding, minibatch


def load_training_data(
    data_directory: str = "data/aclImdb/train",
    split: float = 0.8,
    limit: int = 0,
) -> tuple:
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = f"{data_directory}/{label}"
        for review in os.listdir(labeled_directory):
            if not review.endswith(".txt"):
                continue
            with open(f"{labeled_directory}/{review}", encoding="utf-8") as f:
                text = f.read()
                text = text.replace("<br />", "\n\n")
                if text.strip():
                    spacy_label = {
                        "cats": {
                            "pos": "pos" == label,
                            "neg": "neg" == label,
                        }
                    }
                    reviews.append((text, spacy_label))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    return reviews[:split], reviews[split:]


def evaluate_model(
    tokenizer,
    textcat,
    test_data: list,
) -> dict:
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]
        for predicted_label, score in review.cats.items():
            # Every cats dictionary includes both labels. You can get all
            # the info you need with just the pos label.
            if predicted_label == "neg":
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}


def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20,
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.add_pipe("textcat", last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]

    with nlp.disable_pipes(training_excluded_pipes):
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")

        optimizer = nlp.create_optimizer()

        # Training loop
        for i in range(iterations):
            print(f"Training iteration {i}")
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(
                training_data,
                size=compounding(4.0, 32.0, 1.001),
            )

            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), labels)
                    for text, labels in batch
                ]
                nlp.update(examples, drop=0.2, sgd=optimizer, losses=loss)

            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data,
                )
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )

    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")


def test_model(input_data: str) -> None:
    #  Load saved trained model
    loaded_model = spacy.load("model_artifacts")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}"
    )


if __name__ == "__main__":
    train, test = load_training_data(limit=2500)
    train_model(train, test)

    test_review = """Transcendently beautiful in moments outside the office,
    it seems almost sitcom-like in those scenes. When Toni Colette walks out
    and ponders life silently, it's gorgeous.<br /><br />The movie doesn't
    seem to decide whether it's slapstick, farce, magical realism, or drama,
    but the best of it doesn't matter. (The worst is sort of tedious - like
    Office Space with less humor.)
    """
    test_model(test_review)

Beginning training
Loss	Precision	Recall	F-score
Training iteration 0


ValueError: Cannot get dimension 'nO' for model 'sparse_linear': value unset