In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import spacy
import random

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
    
<h2 class="list-group-item list-group-item-action active" data-toggle="list" style='background:linear-gradient(to right, #1E90FF 0%, #4B0082 100%); border:0; color:#FFF5EE' role="tab" aria-controls="home"><center>Content</center></h2>

* [Exploration](#1)
* [SpaCy training and evaluation](#2)

<a id="1"></a>
<h2 style='background:linear-gradient(to right, #1E90FF 0%, #4B0082 100%); border:0; color:#FFF5EE'><center>Data Exploration</center></h2>

In [None]:
# Read the raw review table from the Kaggle input directory.
data = pd.read_csv(
    "/kaggle/input/amazon-music-reviews/Musical_instruments_reviews.csv"
)

# Peek at the first rows to see which columns are available.
data.head()

In [None]:
# Summarise the frame (columns, dtypes, non-null counts) to spot missing values.
data.info()

In [None]:
# Count how many reviews carry each distinct value of the "overall" column.
data["overall"].value_counts()

In [None]:
# Build the two columns used by the classifier:
#   review - the review body and its summary joined into one string
#   score  - binary label, 1 when "overall" is above 3.5, otherwise 0
#
# Both text columns are NaN-filled before concatenation: `str + NaN` yields NaN,
# and a NaN review would break every later step that treats reviews as strings.
#
# NOTE: `data` is overwritten with the reduced frame, so this cell cannot be
# re-run on its own; re-run the loading cell first.
data = data.assign(
    review=data["reviewText"].fillna("") + ". " + data["summary"].fillna(""),
    # Vectorised comparison instead of a per-row lambda; NaN compares False -> 0.
    score=(data["overall"] > 3.5).astype(int),
)[["review", "score"]]

data.head()

In [None]:
# Class balance of the binary "score" label.
data["score"].value_counts()

In [None]:
# Histogram of review lengths, measured in whitespace-separated words.
sns.set(style="whitegrid", context="notebook", font_scale=1.5)

# Split every review on whitespace and count the resulting tokens.
arr = data["review"].str.split().str.len()

fig, ax = plt.subplots(figsize=(12, 10))
sns.histplot(arr, bins=30, ax=ax)
ax.set(xlabel="words count in a review", ylabel="reviews count")
plt.show()

In [None]:
# Hold out 20% of the rows for the final test; the fixed random_state keeps
# the split identical between runs.
df_train, df_test = train_test_split(
    data,
    test_size=0.2,
    random_state=0,
)

<a id="2"></a>
<h2 style='background:linear-gradient(to right, #1E90FF 0%, #4B0082 100%); border:0; color:#FFF5EE'><center>SpaCy training and evaluation</center></h2>

Text classification with spaCy 2.3.5 (the version available in this environment) is somewhat different from how it works in spaCy 3. Namely, the latter introduces transformer models as part of the pipeline and nudges the user towards using "config" files.

In [None]:
# Display the installed spaCy version; the pipeline and training calls used in
# the following cells are written against this version's API.
spacy.__version__

In [None]:
# Load the small English pipeline and append a text classifier to it.
nlp = spacy.load("en_core_web_sm")

# exclusive_classes=True: the labels are treated as mutually exclusive.
textcat = nlp.create_pipe(
    "textcat",
    config={"exclusive_classes": True, "architecture": "simple_cnn"},
)
nlp.add_pipe(textcat, last=True)

# Show the resulting component order.
nlp.pipe_names

Add the labels to the classifier.

In [None]:
# Register the two output classes. The same label names are used as the keys of
# the "cats" dictionaries in the training, validation and test annotations.
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

In [None]:
# Convert the training frame into (text, annotation) pairs. Each annotation
# holds a "cats" dict with complementary boolean flags for the two labels.
train_prep = [
    (text, {"cats" : {"POSITIVE" : bool(label), "NEGATIVE" : bool(1 - label)}})
    for text, label in zip(df_train["review"], df_train["score"])
]

# Show a few examples to check the structure.
train_prep[:3]

The code below is a modification of the code from this [blog post](https://www.machinelearningplus.com/nlp/custom-text-classification-spacy/)

In [None]:
# Seed Python's RNG so the validation split (and the per-epoch shuffles that
# follow) are the same on every Restart & Run All.
random.seed(0)
random.shuffle(train_prep)

# Carve the first 10% of the shuffled examples off as a validation set.
# NOTE: `train_prep` is overwritten with the remaining 90%, so re-running this
# cell alone shrinks the training set again; rebuild `train_prep` first.
n_valid = int(0.1 * len(train_prep))
valid_prep = train_prep[:n_valid]
train_prep = train_prep[n_valid:]
print(f"train size : {len(train_prep)} | valid size : {len(valid_prep)} | test size : {len(df_test)}")

In [None]:
def evaluate(model, annoted_texts, treshold=0.5):
    """Compute binary classification metrics for the "POSITIVE" label.

    Parameters
    ----------
    model : callable
        Called as ``model(text)``; the returned object must expose a ``cats``
        mapping from label name to score.
    annoted_texts : sequence
        Items of the form ``(text, {"cats": {"POSITIVE": gold, ...}})``.
    treshold : float, default 0.5
        A predicted "POSITIVE" score at or above this value counts as a
        positive prediction.

    Returns
    -------
    dict
        Keys ``"acc"``, ``"prec"``, ``"rec"`` and ``"f1"``. Every denominator
        carries a 1e-8 term, so empty input yields zeros instead of raising.

    Notes
    -----
    Examples whose prediction has no "POSITIVE" entry are skipped and do not
    contribute to any metric.
    """
    # Run the model over every text up front, then compare with the gold labels.
    predictions = [model(sample[0]) for sample in annoted_texts]
    tally = {"TP": 0.0, "FP": 0.0, "FN": 0.0, "TN": 0.0}

    for sample, doc in zip(annoted_texts, predictions):
        gold = sample[1]["cats"]["POSITIVE"]
        try:
            score = doc.cats["POSITIVE"]
        except KeyError:
            continue

        # Gold labels are treated as positive when they are >= 0.5.
        if score >= treshold:
            if gold >= 0.5:
                tally["TP"] += 1.0
            elif gold < 0.5:
                tally["FP"] += 1.0
        elif score < treshold:
            if gold < 0.5:
                tally["TN"] += 1.0
            elif gold >= 0.5:
                tally["FN"] += 1.0

    tp, fp, fn, tn = tally["TP"], tally["FP"], tally["FN"], tally["TN"]
    eps = 1e-8  # keeps the ratios defined when a denominator would be zero

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    accuracy = (tp + tn) / (tp + tn + fp + fn + eps)
    f1 = 2 * (precision * recall) / (precision + recall + eps)
    return {"acc": accuracy, "prec": precision, "rec": recall, "f1": f1}

In [None]:
# Train the text classifier with early stopping on validation accuracy.
#
# After each epoch the model is evaluated with the optimizer's averaged
# parameters. When validation accuracy improves, the classifier is serialised
# while those averaged parameters are active, giving a real snapshot of the
# best epoch. Keeping only a reference to the pipe object would not do this,
# because the object keeps training. The snapshot is loaded back after the
# loop, so later cells evaluate the weights that were validated as best.
n_iter = 10

# Disabling other components, train only classifier
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
best_score = 0
patience = 2          # stop after this many consecutive epochs without improvement
no_improvement = 0
best_weights = None   # serialised textcat from the best epoch so far

with nlp.disable_pipes(*unaffected_pipes):
    optimizer = nlp.begin_training()

    print("Training the model...")
    print('{:^7}\t{:^7}\t{:^7}\t{:^7}\t{:^7}'.format('LOSS', 'Acc', 'Pr', 'Rec', 'F1'))

    for i in range(n_iter):
        random.shuffle(train_prep)

        losses = {}
        batches = spacy.util.minibatch(train_prep, size=spacy.util.compounding(4., 64., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

        # The averaged parameters are only active inside this context, so the
        # snapshot has to be taken here as well.
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp, valid_prep)

            if scores['acc'] > best_score:
                best_score = scores['acc']
                # The vocab is shared with the pipeline; no need to copy it.
                best_weights = textcat.to_bytes(exclude=["vocab"])
                no_improvement = 0
            else:
                no_improvement += 1

        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'.format(losses['textcat'], scores['acc'], scores['prec'], scores['rec'], scores['f1']))
        if no_improvement >= patience:
            break

    # Restore the best validated weights. If no epoch ever beat the initial
    # score, there is nothing to restore and the last weights are kept.
    if best_weights is not None:
        textcat.from_bytes(best_weights, exclude=["vocab"])

# Kept for backward compatibility: the pipe that now holds the best weights.
best_textcat = nlp.get_pipe("textcat")

In [None]:
# Convert the held-out test frame into the same (text, annotation) structure
# used for training, then report the metrics on it.
test_prep = [
    (text, {"cats" : {"POSITIVE" : bool(label), "NEGATIVE" : bool(1 - label)}})
    for text, label in zip(df_test["review"], df_test["score"])
]

print("Test summary...")
print('{:^7}\t{:^7}\t{:^7}\t{:^7}'.format('Acc', 'Pr', 'Rec', 'F1'))
scores = evaluate(nlp, test_prep)
print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'.format(
    *(scores[key] for key in ('acc', 'prec', 'rec', 'f1'))
))