### Setup & Imports

In [83]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from scipy.spatial.distance import euclidean

# Fetch the NLTK data this notebook relies on: 'stopwords' backs
# stopwords.words('english') and 'wordnet' backs WordNetLemmatizer.
# 'omw-1.4' is presumably needed by the WordNet reader on some NLTK
# versions -- TODO confirm it is still required.
# The last call is the cell's final expression, so its return value is what
# the cell displays.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

### Load pre-trained objects

In [84]:
# NOTE(security): joblib.load unpickles its input, which can execute arbitrary
# code. Only load artifacts that come from a trusted source.
# The paths are relative, so they resolve against the current working directory.
vectorizer = joblib.load("tfidf_vectorizer.pkl")            # Fitted TF-IDF vectorizer
kmeans_final = joblib.load("kmeans_model.pkl")              # Fitted MiniBatchKMeans model
threshold = joblib.load("anomaly_distance_threshold.pkl")   # Distance cutoff: predict_review flags distances above it as anomalous

# English stopword set and WordNet lemmatizer, both used by preprocess_review
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

### Preprocessing function

In [85]:
def preprocess_review(review_text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is deleted, the remainder is split on whitespace, tokens found
    in the module-level ``stop_words`` set are dropped, and each surviving
    token is lemmatized with the module-level ``lemmatizer``.

    Note:
        Unwanted characters are deleted, not replaced by a space, so words
        separated only by punctuation fuse into a single token
        (``"it!Very"`` becomes ``"itvery"``, ``"I've"`` becomes ``"ive"``).
        Presumably the TF-IDF vectorizer was fitted on text cleaned the same
        way, so this must stay in sync with training -- TODO confirm.

    Args:
        review_text: Raw review string.

    Returns:
        The processed tokens joined by single spaces; an empty string when no
        token survives.
    """
    letters_only = re.sub(r'[^a-z\s]', '', review_text.lower())
    kept_lemmas = [
        lemmatizer.lemmatize(token, pos='v')  # every token is lemmatized as a verb
        for token in letters_only.split()
        if token not in stop_words
    ]
    return ' '.join(kept_lemmas)

### Prediction function

In [86]:
def predict_review(review_text):
    """Classify one review as 'Normal' or 'Anomalous' by centroid distance.

    The review is cleaned with ``preprocess_review``, vectorized with the
    module-level ``vectorizer``, and extended with one trailing column holding
    the number of processed tokens. The module-level ``kmeans_final`` model
    assigns a cluster, and the Euclidean distance from the feature row to that
    cluster's centroid is compared against the module-level ``threshold``.

    Note:
        The feature layout (TF-IDF columns followed by the raw token count) is
        presumably the layout the KMeans model was fitted on -- TODO confirm.

    Args:
        review_text: Raw review string.

    Returns:
        A 4-tuple ``(review_type, cluster_label, distance, processed_text)``:
        ``review_type`` is ``'Anomalous'`` when ``distance > threshold`` and
        ``'Normal'`` otherwise; ``cluster_label`` is the predicted cluster;
        ``distance`` is the Euclidean distance to that cluster's centroid;
        ``processed_text`` is the cleaned review that was vectorized.
    """
    cleaned = preprocess_review(review_text)

    # One-row feature matrix: TF-IDF columns first, token count as the last column.
    token_count = len(cleaned.split())
    feature_row = hstack([
        vectorizer.transform([cleaned]),
        csr_matrix([[token_count]]),
    ])

    # Assign the nearest cluster, then measure how far the row sits from its centroid.
    cluster_label = kmeans_final.predict(feature_row)[0]
    distance = euclidean(
        feature_row.toarray().ravel(),
        kmeans_final.cluster_centers_[cluster_label],
    )

    # The threshold was loaded once at module level; here it is only applied.
    if distance > threshold:
        review_type = 'Anomalous'
    else:
        review_type = 'Normal'

    return review_type, cluster_label, distance, cleaned


### Test with sample reviews

In [87]:
if __name__ == "__main__":
    # Smoke-test the prediction pipeline on a few hand-picked reviews,
    # including a one-word review.
    test_reviews = [
        "Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty",
        "love it, a great upgrade from the original.  I've had mine for a couple of years",
        "Panget",
        "Not impossible to put together by yourself. Only scratched one place in a not very noticeable place. Get many compliments on it and has lots of storage.",
    ]

    print("Review Prediction Results:")
    print("=" * 80)

    for i, review in enumerate(test_reviews, 1):
        prediction, cluster, distance, processed = predict_review(review)

        # Heuristic marker: '✓' only when the review is predicted anomalous AND
        # contains an obviously negative keyword; '?' otherwise.
        # The keyword test is grouped explicitly: the previous one-liner
        # `A and B or C` parsed as `(A and B) or C`, so any review containing
        # "horrible" was ticked even when predicted Normal.
        review_lower = review.lower()
        has_negative_keyword = 'terrible' in review_lower or 'horrible' in review_lower
        marker = '✓' if 'Anomalous' in prediction and has_negative_keyword else '?'

        print(f"\nReview {i}:")
        print(f"Original: {review}")
        print(f"Processed: {processed}")
        print(f"Cluster: {cluster}, Distance: {distance:.4f}")
        print(f"Prediction: {prediction} ({marker})")
        print("-" * 40)

Review Prediction Results:

Review 1:
Original: Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty
Processed: love well make sturdy comfortable love itvery pretty
Cluster: 0, Distance: 11.9406
Prediction: Normal (?)
----------------------------------------

Review 2:
Original: love it, a great upgrade from the original.  I've had mine for a couple of years
Processed: love great upgrade original ive mine couple years
Cluster: 0, Distance: 11.9410
Prediction: Normal (?)
----------------------------------------

Review 3:
Original: Panget
Processed: panget
Cluster: 0, Distance: 18.8998
Prediction: Normal (?)
----------------------------------------

Review 4:
Original: Not impossible to put together by yourself. Only scratched one place in a not very noticeable place. Get many compliments on it and has lots of storage.
Processed: impossible put together scratch one place noticeable place get many compliment lot storage
Cluster: 0, Distance: 6.9711
Prediction: Norm