# Classy Classification

Link to the [GitHub repository](https://github.com/davidberenstein1957/classy-classification).

In [None]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [None]:
# Few-shot example sentences, keyed by the class label they illustrate.
data = dict(
    furniture=[
        "This text is about chairs.",
        "Couches, benches and televisions.",
        "I really need to get a new sofa.",
    ],
    kitchen=[
        "There also exist things like fridges.",
        "I hope to be getting a new stove today.",
        "Do you also have some ovens.",
    ],
)

# Blank English pipeline with the classy-classification component attached;
# the component receives the labelled examples, the sentence-transformers
# model name and the target device through its config.
nlp = spacy.blank("en")
nlp.add_pipe(
    "classy_classification",
    config=dict(
        data=data,
        model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        device="gpu",
    ),
)

# Print the `_.cats` extension value for a sentence not in the examples.
print(nlp("I am looking for kitchen appliances.")._.cats)

In [None]:
# Read the IMDB reviews CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="data/imdb-reviews.csv")
df.head(n=5)

In [None]:
# Preprocess the data: replace the literal HTML line-break tag with a space.
# regex=False keeps this a plain substring replacement on every pandas version.
df["review"] = df["review"].str.replace("<br />", " ", regex=False)

# Shuffle the data
df = shuffle(df)

# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2)

# Group the training reviews by sentiment label. Only train_df is used here so
# that the reviews in test_df stay unseen and can serve as a real hold-out set
# (building this from the full df would leak the test reviews into training).
train_data = {
    label: train_df.loc[train_df["sentiment"] == label, "review"].tolist()
    for label in ("positive", "negative")
}

# Initialize the NLP model with the classy-classification pipeline.
# The labelled reviews are handed to the component through its "data" config.
nlp = spacy.blank("en")
nlp.add_pipe(
    "classy_classification",
    config={
        "data": train_data,
        "model": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        "device": "gpu",
    },
)

In [None]:
# No nlp.update(...) training loop is run here: the classy_classification
# component was given its labelled examples through the "data" config when it
# was added to the pipeline.
# NOTE(review): presumably the component fits its classifier at that point and
# exposes no spaCy-trainable update step - confirm against the library docs.

# Evaluate the model on the held-out testing data
test_texts = test_df["review"].tolist()
test_labels = test_df["sentiment"].tolist()

correct_predictions = 0
# nlp.pipe processes the texts as a stream instead of one nlp(...) call each.
for doc, true_sentiment in zip(nlp.pipe(test_texts), test_labels):
    # _.cats is keyed by the training labels ("positive" / "negative"), and the
    # "sentiment" column holds those same strings, so compare label to label.
    prediction = doc._.cats["positive"]
    predicted_sentiment = "positive" if prediction >= 0.5 else "negative"
    if predicted_sentiment == true_sentiment:
        correct_predictions += 1

# Guard against an empty test split to avoid a ZeroDivisionError.
accuracy = correct_predictions / len(test_labels) if test_labels else 0.0
print(f"Accuracy: {accuracy}")

# Use the trained model to predict the sentiment of new reviews
new_review = "This movie was really great. I enjoyed it a lot."
print(nlp(new_review)._.cats)