# Install and import required packages

In [4]:
! pip install spacy
! pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
! spacy download en
import spacy
import pandas as pd
import random
from spacy.util import minibatch, compounding
import en_core_web_sm

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 528 kB/s eta 0:00:01
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-py3-none-any.whl size=12019121 sha256=a543f9d4f610b985f962860f4edb169e5926ca69894ce5a00538635fc5a50db5
  Stored in directory: /home/jupyter/.cache/pip/wheels/64/69/41/6f820cf1d7488a0381a2059f66ec9f8f23116f7c67d18f3d8d
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.3.1
    Uninstalling en-core-web-sm-2.3.1:
      Successfully uninstalled en-core-web-sm-2.3.1
Succ

# Load in the dataset which is located on a Google Cloud Bucket

In [5]:
# load in dataset and separate by the __label__ classifier in the text file
data = pd.read_csv('gs://nlp_amazon_data/train.ft.txt', sep="__label__", header = None)

  


# Data cleaning and formatting

View dataframe

In [6]:
data

Unnamed: 0,0,1
0,,2 Stuning even for the non-gamer: This sound t...
1,,2 The best soundtrack ever to anything.: I'm r...
2,,2 Amazing!: This soundtrack is my favorite mus...
3,,2 Excellent Soundtrack: I truly like this soun...
4,,"2 Remember, Pull Your Jaw Off The Floor After ..."
...,...,...
3599995,,1 Don't do it!!: The high chair looks great wh...
3599996,,"1 Looks nice, low functionality: I have used t..."
3599997,,"1 compact, but hard to clean: We have a small ..."
3599998,,1 what is it saying?: not sure what this book ...


Drop the column with null values

In [7]:
data.drop(0, inplace=True, axis=1)

Create a new column called sentiment that uses the sentiment number that is currently in the first column (the label)

In [8]:
data['sentiment'] = data[1].str[0]

Remove the first two characters in the first column, they are no longer needed

In [9]:
data[1] = data[1].str[2:]

Rename the first column

In [10]:
data = data.rename(columns={1: 'review'})

View the data again to ensure it is properly cleaned

In [11]:
data

Unnamed: 0,review,sentiment
0,Stuning even for the non-gamer: This sound tra...,2
1,The best soundtrack ever to anything.: I'm rea...,2
2,Amazing!: This soundtrack is my favorite music...,2
3,Excellent Soundtrack: I truly like this soundt...,2
4,"Remember, Pull Your Jaw Off The Floor After He...",2
...,...,...
3599995,Don't do it!!: The high chair looks great when...,1
3599996,"Looks nice, low functionality: I have used thi...",1
3599997,"compact, but hard to clean: We have a small ho...",1
3599998,what is it saying?: not sure what this book is...,1


# Preparation for NLP

Create a labeled dictionary for use later \
This is the format that will be expected when we train the model

In [12]:
data['cats'] = data['sentiment'].apply(lambda x: {'cats': {'pos': False, 'neg': True}} if x == '1' else {'cats':{'pos': True, 'neg': False}})

In [13]:
data

Unnamed: 0,review,sentiment,cats
0,Stuning even for the non-gamer: This sound tra...,2,"{'cats': {'pos': True, 'neg': False}}"
1,The best soundtrack ever to anything.: I'm rea...,2,"{'cats': {'pos': True, 'neg': False}}"
2,Amazing!: This soundtrack is my favorite music...,2,"{'cats': {'pos': True, 'neg': False}}"
3,Excellent Soundtrack: I truly like this soundt...,2,"{'cats': {'pos': True, 'neg': False}}"
4,"Remember, Pull Your Jaw Off The Floor After He...",2,"{'cats': {'pos': True, 'neg': False}}"
...,...,...,...
3599995,Don't do it!!: The high chair looks great when...,1,"{'cats': {'pos': False, 'neg': True}}"
3599996,"Looks nice, low functionality: I have used thi...",1,"{'cats': {'pos': False, 'neg': True}}"
3599997,"compact, but hard to clean: We have a small ho...",1,"{'cats': {'pos': False, 'neg': True}}"
3599998,what is it saying?: not sure what this book is...,1,"{'cats': {'pos': False, 'neg': True}}"


Shuffle the rows to avoid any bias in the order of observations \
Here we take a sample, but get back every row with frac=1

In [14]:
data = data.sample(frac=1)

Split into train and test datasets \
I have chosen an 80:20 split

In [15]:
train_data = data[:2880000]
test_data = data[2880000:]

Now I convert to lists in order to be used to train the model

In [16]:
list_train_data = train_data[['review','cats']].values.tolist()
list_test_data = test_data[['review','cats']].values.tolist()

Load the en pretrained statistical models for English

In [17]:
nlp = spacy.load('en')

Add the textcat component to the pipeline

In [18]:
if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe("textcat", config={"architecture": "simple_cnn"})
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe("textcat")

Add our labels, positive and negative 

In [19]:
textcat.add_label("pos")
textcat.add_label("neg")

1

Here we want to disable other pipes, and just train with the textcat pipe that we added in the previous step

In [20]:
training_excluded_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]

In [21]:
with nlp.disable_pipes(training_excluded_pipes):
    optimizer = nlp.begin_training()
    print("Starting Training")
    batch_sizes = compounding(4.0, 32.0, 1.001)

Starting Training


Limiting the training dataset to 10000 observations to reduce training time

In [22]:
small_list_train_data = list_train_data[:10000]

In [23]:
from datetime import datetime

# Training the model

Training loop that iterates 5 times, taking a new batch of data each time, and printing the system time and iteration number. We set drop to 0.2 in order to exclude some data so that the model does not memorize the training examples.

In [22]:
outer = 0
for i in range(5):
    outer = outer + 1
    loss = {}
    random.shuffle(small_list_train_data)
    batches = minibatch(small_list_train_data, size=batch_sizes)
    
    # tracking the iteration and time
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print('Outer:{}, Current Time: {}'.format(outer, current_time))
    
    for batch in batches:
        text, labels = zip(*batch)
        nlp.update(
            text,
            labels,
            drop=0.2,
            sgd=optimizer,
            losses=loss
        )

Outer:1, Current Time: 23:02:46
Outer:2, Current Time: 23:07:50
Outer:3, Current Time: 23:39:43
Outer:4, Current Time: 00:50:54
Outer:5, Current Time: 02:03:32


Save the model to the disk

In [23]:
with nlp.use_params(optimizer.averages):
    nlp.to_disk("model")

# Evaluating the Model

Load the model from the disk

In [24]:
nlp.from_disk("model")

<spacy.lang.en.English at 0x7fbb27821110>

The below function uses the first 10000 observations in the test dataset and determines if the predicted label matches the actual label.

In [44]:
def evaluate_model(
    tokenizer, textcat, test_data: list
) -> dict:
    reviews, labels = zip(*test_data[:10000])
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]
        for predicted_label, score in review.cats.items():
            if (
                predicted_label == "neg"
            ):
                continue
            if score >= 0.5 and true_label["cats"]["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["cats"]["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["cats"]["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["cats"]["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

Here we implement the above function to calculate a precision, recall, and f-score \
The precision score allows use to say that we accuratly predicted 88.20% of the labels \
The recall score allows us to understand how well the model did in identifiying true positives. We know that of all the positive reviews, we correctly predicted 86.45% of them. \
The f-score is calculated from the precision and recall. Since our precision and recall scores were pretty close, it makes sense that the f-score was also close.

In [46]:
with textcat.model.use_params(optimizer.averages):
    evaluation_results = evaluate_model(tokenizer=nlp.tokenizer,textcat=textcat,test_data=list_test_data)
    print(f"\t{evaluation_results['precision']}"f"\t{evaluation_results['recall']}"f"\t{evaluation_results['f-score']}")

	0.8820315661657587	0.8645378817912247	0.8731971153828662


# Testing a specific review

Below I tested a random review on Amazon which was a one star rating. The model did classify it as negative.

In [51]:
TEST_REVIEW = '''If you are looking at this for its built-in ethernet NIC, look elsewhere. This uses the ASIX AX88179 ethernet chip, which is very finicky and will not work with many configurations. Unfortunately, I did not know this at the time of purchase. Fortunately, my wireless works well, so I don't strictly *need* ethernet support--it's just a nice-to-have feature that sets the Core X Chroma off from its less-expensive competition. This review is more about Razer's support practices:
Razer practically begs you in all of its documentation to contact Razer support instead of returning the product if you have problems. Ignore them and just return it. If you contact Razer, they will spend time "troubleshooting" your problem just long enough for Amazon's return window to close, then find a way to drop you and say it isn't their fault. In my case, Razer "support" ran through the usual troubleshooting steps: Power cycle. Install updates. Reinstall the ethernet driver. Reinstall the thunderbolt driver. Plug the ethernet into a different port on the router. Plug the ethernet directly into the modem. Etc. Finally, just as Amazon's return window closed, they hit me with what I thought was just another troubleshooting step: Try to borrow a different laptop to try it with. Seemed like an innocent troubleshooting step to me: we're just trying to isolate the problem to software or configuration and rule out a hardware problem, right? Nope. As soon as I informed them that the ethernet works well with a borrowed laptop, they dropped me saying the Core X Chroma is working "as advertised", it is not their problem, and I should contact the manufacturer of my laptop. Seriously. Razer is a system manufacturer, so they know as well as anybody that no system manufacturer EVER provides support for third-party peripherals. It is clear all Razer "support" was doing was stringing me along until Amazon's return window closed. So now I'm stuck with a device whose major "feature" setting it off from its cheaper competition doesn't work.
Learn from my experience: Just return it. Never, ever contact Razer "support."'''

def test_model(input_data: str = TEST_REVIEW):
    #  Load saved trained model
    loaded_model = spacy.load("model")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
    else:
        prediction = "Negative"
    print(
        f"Review text: {input_data}\nPredicted sentiment: {prediction}"
    )

In [52]:
test_model()

Review text: If you are looking at this for its built-in ethernet NIC, look elsewhere. This uses the ASIX AX88179 ethernet chip, which is very finicky and will not work with many configurations. Unfortunately, I did not know this at the time of purchase. Fortunately, my wireless works well, so I don't strictly *need* ethernet support--it's just a nice-to-have feature that sets the Core X Chroma off from its less-expensive competition. This review is more about Razer's support practices:
Razer practically begs you in all of its documentation to contact Razer support instead of returning the product if you have problems. Ignore them and just return it. If you contact Razer, they will spend time "troubleshooting" your problem just long enough for Amazon's return window to close, then find a way to drop you and say it isn't their fault. In my case, Razer "support" ran through the usual troubleshooting steps: Power cycle. Install updates. Reinstall the ethernet driver. Reinstall the thunder