In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the dataset through kagglehub and keep the value it returns
# (presumably the local directory holding the dataset files - TODO confirm).
# NOTE(review): the data-loading cell below reads from a hard-coded
# '/kaggle/input/...' path and does not use this variable.
organizations_snap_amazon_fine_food_reviews_path = kagglehub.dataset_download('organizations/snap/amazon-fine-food-reviews')

print('Data source import complete.')


In [None]:
import os
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from tqdm import tqdm

In [None]:
import spacy

# Load the spaCy pipeline named "en_core_web_sm"; the resulting `nlp` object is
# what NER() calls to extract entities. The model package must already be
# installed in the environment for this load to succeed.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load dataset
# Prefer the directory returned by kagglehub in the first cell so the notebook
# also runs outside Kaggle. If that cell was deleted, or the file is not found
# there, fall back to Kaggle's fixed input mount.
_download_dir = globals().get('organizations_snap_amazon_fine_food_reviews_path')
_downloaded_csv = os.path.join(_download_dir, 'Reviews.csv') if _download_dir else None
if _downloaded_csv and os.path.exists(_downloaded_csv):
    REVIEWS_CSV = _downloaded_csv
else:
    REVIEWS_CSV = '/kaggle/input/amazon-fine-food-reviews/Reviews.csv'

df = pd.read_csv(REVIEWS_CSV, encoding='latin-1')

In [None]:
# Keep only the star rating ('Score') and the review body ('Text').
# NOTE(review): 'Score' is retained here but the label the model is trained on
# ('target') is derived later from TextBlob polarity, not from 'Score'.
df = df[['Score', 'Text']]


In [None]:
# Preview the first rows of the reduced frame
df.head()

In [None]:
# (rows, columns) of the reduced frame
df.shape

In [None]:
# Non-null entries per column; a value below the row count means missing data
df.count()

**Preprocessing**


In [None]:
# Fetch the NLTK resources this cell relies on. On a fresh environment they are
# not installed and stopwords.words() / word_tokenize() raise LookupError.
# 'punkt_tab' is the tokenizer resource name used by newer NLTK releases.
# A failed download (e.g. no network) is reported by NLTK but does not raise.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

stop_words = set(stopwords.words('english'))

# Compiled once because clean_text is applied to every review in the frame.
_URL_RE = re.compile(r"http\S+|www\S+|https\S+")
_NON_LETTER_RE = re.compile(r'[^a-z\s]')

def clean_text(text):
    """Normalise a raw review into a space-separated string of content words.

    Steps: lowercase, strip URLs, drop every character that is not a-z or
    whitespace, tokenize with NLTK, and remove English stop words.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (None, NaN, numbers) is treated
        as missing text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' for missing text or when
        nothing survives the cleaning.
    """
    # Missing values reach this function as float NaN / None via Series.apply;
    # return an empty document instead of failing on .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Run the cleaner over every review and show raw vs. cleaned text side by side
df['clean_text'] = df['Text'].map(clean_text)
print(df[['Text', 'clean_text']].iloc[:5])


In [None]:
def NER(text):
    """Return the named entities found in `text` as (entity text, label) tuples.

    The text is processed by the module-level spaCy pipeline `nlp`; the result
    is a list with one tuple per entity in document order.
    """
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

# Entity extraction is only run on the first five reviews; every other row of
# the 'entities' column is left as NaN by index alignment.
df['entities'] = df['Text'].iloc[:5].apply(NER)
print(df[['entities', 'Text']].iloc[:5])

In [None]:
def get_sent(text):
    """Return the TextBlob polarity score of `text`."""
    blob = TextBlob(text)
    return blob.sentiment.polarity
# Score the raw (uncleaned) review text and preview the result
df['sentiment'] = df['Text'].map(get_sent)
print(df[['Text', 'sentiment']].iloc[:5])

In [None]:
# Binarise the TextBlob polarity: > 0 → 1 (positive), <= 0 → 0.
# Note that neutral reviews (polarity exactly 0) therefore land in class 0
# together with the negative ones.
df['target'] = (df['sentiment'] > 0).astype('int64')
print (df[['Text','target']].head())

In [None]:
# Features are the cleaned review strings, labels the binarised polarity
X, y = df['clean_text'], df['target']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vocabulary (capped at 20,000 terms) is learned from the training
# split only, then reused unchanged to encode the test split
vectorizer = TfidfVectorizer(max_features=20000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
# Fit a logistic-regression classifier on the TF-IDF training matrix
model = LogisticRegression(max_iter=2000).fit(X_train_vec, y_train)

# Predicted labels for the held-out split
y_pred = model.predict(X_test_vec)

# Raw confusion matrix first, then the per-class precision / recall / F1 report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the held-out predictions, drawn on an explicit Axes
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['Negative', 'Positive'],
)
disp.plot(cmap=plt.cm.Blues, ax=ax)
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# # Train a RandomForestClassifier (alternative model, currently disabled)
# model = RandomForestClassifier(n_estimators=200, random_state=42)
# model.fit(X_train_vec, y_train)

# # Predict
# y_pred = model.predict(X_test_vec)

# # Evaluation
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [None]:
# # Confusion Matrix
# cm = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive'])
# disp.plot(cmap=plt.cm.Blues)
# plt.title("Confusion Matrix")
# plt.show()


In [None]:
import joblib

# Save model and vectorizer
# Both are written to the current working directory. predict_sentiment() uses
# the classifier and the fitted TF-IDF vectorizer together, so both are saved.
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


In [None]:
def predict_sentiment(text):
    """Classify a raw review string as 'Positive' or 'Negative'.

    The text is cleaned with clean_text, encoded with the fitted module-level
    `vectorizer`, and scored by the module-level `model`. Class 1 is reported
    as 'Positive'; any other predicted label as 'Negative'.
    """
    features = vectorizer.transform([clean_text(text)])
    label = model.predict(features)[0]
    if label == 1:
        return 'Positive'
    return 'Negative'

# Example: one clearly positive and one clearly negative review
# (first input previously had a lowercase "l" typed in place of "I")
print(predict_sentiment("I love this!"))
print(predict_sentiment("taste food is bad."))
