In [3]:
# 📦 Install dependencies
!pip install scikit-learn pandas numpy



In [5]:
import kagglehub

# Fetch the most recent published version of the dataset and report the
# directory that kagglehub hands back for it.
DATASET_HANDLE = "snap/amazon-fine-food-reviews"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'amazon-fine-food-reviews' dataset.
Path to dataset files: /kaggle/input/amazon-fine-food-reviews


In [7]:


# 📘 Import libraries
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ⚙️ Tunable settings for this experiment
NEUTRAL_SCORE = 3      # reviews with exactly this score are discarded
SAMPLE_SIZE = 6000     # number of reviews kept, for speed
TEST_FRACTION = 0.2    # share of the sample held out for evaluation
SEED = 42              # shared seed for sampling and splitting

# 📂 Load dataset (raw Kaggle CSV; it is cleaned below)
# Build the location from `path`, the directory returned by
# kagglehub.dataset_download() in the previous cell, instead of a hard-coded
# Kaggle/Colab mount point, so the notebook also runs where the dataset is
# cached somewhere else.
url = os.path.join(path, "Reviews.csv")  # a local file path; name kept as-is

# 🧹 Clean and prepare
# Only the two columns that are actually used are read from disk. Rows with a
# missing text or score are dropped, neutral reviews are removed, and a binary
# label is derived: 1 for scores above NEUTRAL_SCORE, 0 for scores below it.
# 🔎 The final step samples a smaller subset for speed.
data = (
    pd.read_csv(url, usecols=['Text', 'Score'])[['Text', 'Score']]
    .dropna()
    .loc[lambda df: df['Score'] != NEUTRAL_SCORE]
    .assign(Sentiment=lambda df: (df['Score'] > NEUTRAL_SCORE).astype(int))
    .sample(SAMPLE_SIZE, random_state=SEED)
)

# 🧠 Split data
X_train, X_test, y_train, y_test = train_test_split(
    data['Text'], data['Sentiment'], test_size=TEST_FRACTION, random_state=SEED
)

# ✨ TF-IDF Vectorization
# The vocabulary is fitted on the training texts only; the test texts are
# merely transformed, so no test information reaches the features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 🧩 Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Naive Bayes": MultinomialNB(),
    "SVM": LinearSVC()
}

# 📊 Evaluate models
# Every model is trained on the same TF-IDF features and scored on the same
# held-out split. NOTE: precision/recall/F1 use scikit-learn's default
# pos_label=1, so they describe the positive class only. In the recorded run
# positives are the large majority (Naive Bayes reaches recall 1.0), so
# accuracy alone can look better than the minority-class performance is.
results = []
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-score": f1_score(y_test, y_pred)
    })

# 🧾 Show results
results_df = pd.DataFrame(results)
print(results_df)


                 Model  Accuracy  Precision    Recall  F1-score
0  Logistic Regression  0.865000   0.865004  0.995054  0.925483
1          Naive Bayes  0.843333   0.843203  1.000000  0.914932
2                  SVM  0.887500   0.908582  0.963403  0.935190
