<h1>Machine Learning Fundamental:<br>
Sentiment Analysis</h1>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

<h3>Prepare a Dataset</h3>

In [4]:
data = {
    'review': [
        "This product is amazing! I love it so much.",
        "The shipping was slow and the quality is terrible.",
        "Good value for the price, but not outstanding.",
        "I'm very happy with my purchase.",
        "Would not recommend this at all, total disappointment.",
        "It's okay, nothing special.",
        "Absolutely fantastic. Exceeded my expectations.",
        "Worst experience ever. It broke after one use.",
        "Pleasantly surprised by the performance.",
        "The instructions were confusing and incomplete.",
        "Highly satisfied with the customer service.",
        "This is a total scam, don't buy it.",
    ],
    'sentiment': [
        'positive', 'negative', 'neutral', 'positive', 'negative', 'neutral',
        'positive', 'negative', 'positive', 'negative', 'positive', 'negative'
    ]
}
df = pd.DataFrame(data)
df

Unnamed: 0,review,sentiment
0,This product is amazing! I love it so much.,positive
1,The shipping was slow and the quality is terri...,negative
2,"Good value for the price, but not outstanding.",neutral
3,I'm very happy with my purchase.,positive
4,"Would not recommend this at all, total disappo...",negative
5,"It's okay, nothing special.",neutral
6,Absolutely fantastic. Exceeded my expectations.,positive
7,Worst experience ever. It broke after one use.,negative
8,Pleasantly surprised by the performance.,positive
9,The instructions were confusing and incomplete.,negative


<h3>Preprocessing and Feature Extraction using Bag-of-Words</h3>

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.3, random_state=42)

# Feature Extraction with TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=1000)

# 'fit_transform' on training data
X_train_vec = vectorizer.fit_transform(X_train)
# 'transform' on test data using the vocabulary learned from the training data
X_test_vec = vectorizer.transform(X_test)

print("TF-IDF matrix created.")
print(f"Shape of training data matrix: {X_train_vec.shape}")

TF-IDF matrix created.
Shape of training data matrix: (8, 26)


<h3>Model Training</h3>

In [7]:
model = LogisticRegression(max_iter=1000)

# 'fit' the model with our vectorized training data and labels
model.fit(X_train_vec, y_train)

<h3>Prediction and Evaluation using Precision, Recall and F1-Score</h3>

In [8]:
# Use the trained model to predict on the test data.
y_pred = model.predict(X_test_vec)

# The classification_report provides precision, recall, and F1-score for each class.
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

    negative       0.25      1.00      0.40         1
    positive       1.00      0.00      0.00         3

    accuracy                           0.25         4
   macro avg       0.62      0.50      0.20         4
weighted avg       0.81      0.25      0.10         4



<h3>Testing with Different Data</h3>

In [9]:
new_review = ["The movie was a complete failure."]
new_review_vec = vectorizer.transform(new_review)
new_review_prediction = model.predict(new_review_vec)
print(f"Prediction for '{new_review[0]}': {new_review_prediction[0]}")

Prediction for 'The movie was a complete failure.': negative
