In [None]:
# Installs the `wget` package from PyPI into the notebook environment.
# NOTE(review): the download cell that follows calls the `wget` shell command
# rather than importing this package -- confirm this install is still needed.
!pip install wget

In [1]:
!wget https://snap.stanford.edu/data/finefoods.txt.gz
!gzip -d finefoods.txt.gz


--2023-12-05 20:58:51--  https://snap.stanford.edu/data/finefoods.txt.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 122104202 (116M) [application/x-gzip]
Saving to: ‘finefoods.txt.gz’


2023-12-05 20:58:57 (18.3 MB/s) - ‘finefoods.txt.gz’ saved [122104202/122104202]



In [7]:
# Preview the raw file: print only its first few lines to see the record layout.
preview_line_count = 10
with open('finefoods.txt', 'r', encoding='latin-1') as file:
    for line_number, line in enumerate(file):
        if line_number >= preview_line_count:
            break
        # Drop the line's own newline: print() adds one, so keeping it would
        # double-space the preview.
        print(line.rstrip('\n'))

product/productId: B001E4KFG0

review/userId: A3SGXH7AUHU8GW

review/profileName: delmartian

review/helpfulness: 1/1

review/score: 5.0

review/time: 1303862400

review/summary: Good Quality Dog Food

review/text: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.



product/productId: B00813GRG4



In [8]:
import pandas as pd

# Parse finefoods.txt into one DataFrame row per review.
# The file is a sequence of "key: value" lines; a blank line ends each review.

# Output column -> key used for that field in the raw file.
field_keys = {
    'productId': 'product/productId',
    'userId': 'review/userId',
    'profileName': 'review/profileName',
    'helpfulness': 'review/helpfulness',
    'score': 'review/score',
    'time': 'review/time',
    'summary': 'review/summary',
    'text': 'review/text',
}

reviews = []
with open('finefoods.txt', 'r', encoding='latin-1') as file:
    current_review = {}
    for line in file:
        line = line.strip()
        if line:
            # Split on the first ': ' only, so a value may itself contain ': '.
            parts = line.split(': ', 1)
            if len(parts) == 2:
                key, value = parts
                current_review[key] = value
            # Lines that do not have the "key: value" shape are ignored.
        elif current_review:
            # Blank line: end of the current review. The guard keeps repeated
            # blank lines from adding all-empty rows.
            reviews.append(current_review)
            current_review = {}
    # Flush the final review when the file does not end with a blank line;
    # otherwise it would be silently dropped.
    if current_review:
        reviews.append(current_review)

# Column-oriented view of the parsed reviews. A field missing from a review
# becomes '' so every column has exactly one entry per review.
data = {
    column: [review.get(raw_key, '') for review in reviews]
    for column, raw_key in field_keys.items()
}

df = pd.DataFrame(data)
print(df.head())


    productId          userId                      profileName helpfulness  \
0  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian         1/1   
1  B00813GRG4  A1D87F6ZCVE5NK                           dll pa         0/0   
2  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"         1/1   
3  B000UA0QIQ  A395BORC6FGVXV                             Karl         3/3   
4  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"         0/0   

  score        time                summary  \
0   5.0  1303862400  Good Quality Dog Food   
1   1.0  1346976000      Not as Advertised   
2   4.0  1219017600  "Delight" says it all   
3   2.0  1307923200         Cough Medicine   
4   5.0  1350777600            Great taffy   

                                                text  
0  I have bought several of the Vitality canned d...  
1  Product arrived labeled as Jumbo Salted Peanut...  
2  This is a confection that has been around a fe...  
3  If you are looking for the secr

 * Use the UTF-8 encoding format if you have a large collection of foreign-language resources, because it can represent a far greater range of characters.


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Baseline: bag-of-words Naive Bayes on the uncleaned summary + review text.

# Keep the untouched review body in its own column the first time this cell
# runs. Re-running the cell then rebuilds 'text' from the original body instead
# of prepending the summary to an already-combined value.
if 'review_text' not in df.columns:
    df['review_text'] = df['text']

# Combine review/summary and review/text columns
df['text'] = df['summary'] + ' ' + df['review_text']

# Features and target variable.
# Only the 'text' column is vectorized and given to the classifier below; the
# other columns travel along in X but are not used for training.
X = df[['userId', 'profileName', 'time', 'text']]
y = df['score']

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data: learn the vocabulary on the training split only, then
# reuse it for the test split.
vectorizer = CountVectorizer()
X_train_text = vectorizer.fit_transform(X_train['text'])
X_test_text = vectorizer.transform(X_test['text'])

# Train Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_text, y_train)

# predictions
y_pred = nb_classifier.predict(X_test_text)

# model evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.7194149053135253
Classification Report:
              precision    recall  f1-score   support

         1.0       0.59      0.69      0.63     10326
         2.0       0.43      0.29      0.35      5855
         3.0       0.44      0.38      0.41      8485
         4.0       0.41      0.41      0.41     16123
         5.0       0.85      0.87      0.86     72902

    accuracy                           0.72    113691
   macro avg       0.54      0.53      0.53    113691
weighted avg       0.71      0.72      0.72    113691



Now clean the data.

In [10]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

import nltk
# Fetch the NLTK resources used by the cleaning functions: the stop-word list,
# the Punkt tokenizer data behind word_tokenize, and WordNet for the lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# fetch both so word_tokenize works with either.
nltk.download('punkt_tab')
nltk.download('wordnet')

# Union of the NLTK and scikit-learn English stop-word lists.
stop_words = set(stopwords.words('english')) | ENGLISH_STOP_WORDS
lemmatizer = WordNetLemmatizer()

def remove_html_tags(text):
    """Return `text` with HTML markup stripped, keeping only the text content.

    Args:
        text: A string that may contain HTML tags.

    Returns:
        The text extracted by BeautifulSoup's ``get_text()``.
    """
    import warnings
    from bs4 import MarkupResemblesLocatorWarning

    with warnings.catch_warnings():
        # BeautifulSoup warns when the input looks like a file name or URL
        # rather than markup. Silence only that category, and only for this
        # call, so other warnings still surface.
        warnings.simplefilter('ignore', category=MarkupResemblesLocatorWarning)
        soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def clean_text(text):
    """Normalize one review string for bag-of-words modelling.

    Steps, in order: lowercase, strip HTML via ``remove_html_tags``, delete
    every character that is not an ASCII letter or whitespace, tokenize, drop
    tokens found in ``stop_words``, lemmatize the rest, and re-join with
    single spaces.

    Args:
        text: The raw review string.

    Returns:
        The cleaned text as a single space-separated string.
    """
    lowered = text.lower()
    without_markup = remove_html_tags(lowered)
    # Removed characters are deleted outright, not replaced by a space.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_markup)

    kept_tokens = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)

# Re-train the same Naive Bayes model on cleaned text so it can be compared
# with the earlier run on uncleaned text.

# Snapshot the uncleaned text the first time this cell runs and always clean
# from that snapshot, so re-running the cell does not clean already-cleaned text.
if 'text_uncleaned' not in df.columns:
    df['text_uncleaned'] = df['text']

# Apply text cleaning; the result replaces the 'text' column in the DataFrame
df['text'] = df['text_uncleaned'].apply(clean_text)

# Only the 'text' column is vectorized and given to the classifier below; the
# other columns travel along in X but are not used for training.
X = df[['userId', 'profileName', 'time', 'text']]
y = df['score']

# Same test_size and random_state as the earlier split, so both runs are
# evaluated on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the vocabulary on the training split only, then reuse it for the test split.
vectorizer = CountVectorizer()
X_train_text = vectorizer.fit_transform(X_train['text'])
X_test_text = vectorizer.transform(X_test['text'])

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_text, y_train)

y_pred = nb_classifier.predict(X_test_text)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  soup = BeautifulSoup(text, 'html.parser')


Accuracy: 0.7342533709792332
Classification Report:
              precision    recall  f1-score   support

         1.0       0.63      0.66      0.65     10326
         2.0       0.64      0.14      0.23      5855
         3.0       0.52      0.26      0.34      8485
         4.0       0.45      0.36      0.40     16123
         5.0       0.80      0.93      0.86     72902

    accuracy                           0.73    113691
   macro avg       0.61      0.47      0.50    113691
weighted avg       0.71      0.73      0.71    113691



The data is extracted from the Amazon Fine Foods reviews dataset. In order to predict review scores, important features were extracted. 'userId' and 'profileName' were selected in order to comprehend score patterns linked to certain individuals or profiles. The 'time' feature, which represents review timestamps, was selected to make it possible to examine temporal trends and determine whether sentiment evolved over time. The 'text' feature, which combines 'review/summary' and 'review/text', gives the classifier both the summary and the detailed review text. Note that in the code above only the 'text' column is vectorized and passed to the Naive Bayes classifier; 'userId', 'profileName', and 'time' are carried in X but do not influence the predictions.


Then I cleaned the data. The text was lowercased so that words are treated consistently, and HTML tags were removed in order to concentrate just on content. Removing punctuation, special characters, and digits made the data simpler to model. Tokenization divided the text into meaningful units, and eliminating common stop words gave priority to more significant terms. Lemmatization reduced words to their base form, which treats inflected variants uniformly and reduces dimensionality.

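Using the cleaned text as the main input for the Naive Bayes classifier increased overall accuracy from 71.9% to 73.4%, indicating that the data cleaning procedure had a beneficial effect on accuracy. The gain is concentrated in the dominant 5-star class (recall 0.87 → 0.93), however: the macro-averaged F1 score fell from 0.53 to 0.50, and recall dropped for the 2-star (0.29 → 0.14) and 3-star (0.38 → 0.26) classes.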