In [12]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random
import pandas as pd

# Step 1: Start from a blank English pipeline
nlp = spacy.blank("en")

# Step 2: Obtain the text classifier, appending it to the pipeline if it is not there yet
textcat = (
    nlp.get_pipe("textcat")
    if "textcat" in nlp.pipe_names
    else nlp.add_pipe("textcat", last=True)
)

# Step 3: Register the three sentiment classes the classifier will predict
for sentiment in ("POSITIVE", "NEUTRAL", "NEGATIVE"):
    textcat.add_label(sentiment)

# Define a function to map star ratings to sentiment categories
def map_rating_to_label(rating: float) -> dict:
    """Convert a star rating into a one-hot sentiment dictionary.

    Args:
        rating: Numeric star rating. Fractional values are accepted.

    Returns:
        A new dict with the keys "POSITIVE", "NEUTRAL" and "NEGATIVE", where
        exactly one value is 1 and the others are 0:
        rating >= 4 -> POSITIVE, 3 <= rating < 4 -> NEUTRAL, otherwise NEGATIVE.
    """
    if rating >= 4:  # 4 stars and up
        winner = "POSITIVE"
    elif rating >= 3:  # 3 up to (but excluding) 4, so 3.5 is not treated as negative
        winner = "NEUTRAL"
    else:  # anything below 3 stars
        winner = "NEGATIVE"
    # Build a fresh dict per call so callers can mutate the result safely.
    return {label: int(label == winner) for label in ("POSITIVE", "NEUTRAL", "NEGATIVE")}

# Step 4: Load the list of file paths
# 'csv_Files' is read as a plain-text manifest: one CSV path per line.
with open('csv_Files', 'r') as file:
    stripped_lines = (line.strip() for line in file)
    # Skip blank lines so an empty string is never handed to pd.read_csv as a path.
    file_paths = [path for path in stripped_lines if path]

# Accumulates (text, {"cats": {...}}) tuples across every CSV file
train_data = []

# Loop through each file path and process the data
for file_path in file_paths:
    # Load the CSV file
    df = pd.read_csv(file_path)

    # Rows without a review text or a rating cannot be used for training
    df = df.dropna(subset=['Review Text', 'Rating'])
    df['Review Text'] = df['Review Text'].astype(str)  # Ensure all review texts are strings

    # Walk the two columns in lockstep instead of using iterrows(), which
    # builds a Series object for every single row.
    train_data.extend(
        (text, {"cats": map_rating_to_label(rating)})
        for text, rating in zip(df['Review Text'], df['Rating'])
    )

# Step 5: Train the model with the combined data
# Fail early with a clear message; otherwise an empty dataset only surfaces
# later as a KeyError when the per-epoch loss is printed.
if not train_data:
    raise ValueError("No training examples were loaded; check the CSV inputs.")

# Restrict training to the textcat component. select_pipes() is the spaCy v3
# replacement for the deprecated disable_pipes().
with nlp.select_pipes(enable=["textcat"]):
    # initialize() is the spaCy v3 replacement for the deprecated begin_training();
    # it returns the optimizer that is passed to every update below.
    optimizer = nlp.initialize()
    for epoch in range(10):  # Train for 10 epochs
        random.shuffle(train_data)
        losses = {}  # nlp.update accumulates this epoch's loss per component here
        # Batch size follows a compounding schedule from 4 up to 32
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)
        print(f"Epoch {epoch + 1} Loss: {losses['textcat']}")

# Step 6: Smoke-test the trained classifier on a few hand-written sentences
test_texts = [
    "I absolutely love this!",
    "This is terrible and I hate it.",
    "I am not sure how I feel about this.",
]

for sample in test_texts:
    # .cats holds one score per label (POSITIVE/NEUTRAL/NEGATIVE)
    scores = nlp(sample).cats
    print(sample, scores)

# Step 7: Persist the trained pipeline to the "sentiment_model" directory
nlp.to_disk("sentiment_model")

# The saved pipeline can be restored later with:
# nlp2 = spacy.load("sentiment_model")

Epoch 1 Loss: 301.1163737233728
Epoch 2 Loss: 278.5386562105268
Epoch 3 Loss: 268.6856988798827
Epoch 4 Loss: 257.68537754862336
Epoch 5 Loss: 252.8323963517323
Epoch 6 Loss: 241.64122542715631
Epoch 7 Loss: 233.75642627407797
Epoch 8 Loss: 223.726341615431
Epoch 9 Loss: 217.585253006313
Epoch 10 Loss: 209.12853974406607
I absolutely love this! {'POSITIVE': 0.9106165766716003, 'NEUTRAL': 0.08803308010101318, 'NEGATIVE': 0.001350310631096363}
This is terrible and I hate it. {'POSITIVE': 0.7780562043190002, 'NEUTRAL': 0.2100001871585846, 'NEGATIVE': 0.01194359827786684}
I am not sure how I feel about this. {'POSITIVE': 0.2442249208688736, 'NEUTRAL': 0.30260375142097473, 'NEGATIVE': 0.4531712532043457}
