In [17]:
import numpy as np
import pandas as pd
import pickle
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

# NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# CSV file
data = pd.read_csv('/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/A2_Data.csv')

# fetching the reviews from the csv file (3rd column)
rev_text = data.iloc[:, 2].tolist()

# Initialize WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
# Initialize Porter stemmer
stemmer = PorterStemmer()

# preprocessing function
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')

    # handling NaN reviews
    if pd.isnull(text):
        return

    # lowercase reviews
    text = text.lower()

    # parse HTML content and extract text
    text = BeautifulSoup(text,'html.parser').get_text()

    # tokenization
    tokens = nltk.word_tokenize(text)

    # stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    # lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in stemmed_tokens]

    # removing stopwords
    filtered_tokens = [word for word in lemmatized_tokens if word not in stop_words]

    # removing punctuation
    non_punct_tokens = tokenizer.tokenize(' '.join(filtered_tokens))

    # removing blank space tokens
    cleaned_tokens = [word for word in non_punct_tokens if not word.isspace()]

    return cleaned_tokens

# perform preprocessing on each review
preprocessed_reviews = [preprocess(review) for review in rev_text]

# create a new DataFrame with original columns and preprocessed reviews
new_data = data.copy()
new_data['Preprocessed_Review'] = preprocessed_reviews

# save the new DataFrame as a pickle file
fp = "/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/preprocessed_data.pkl"
with open(fp, "wb") as f:
    pickle.dump(new_data, f)

# print the first 10 preprocessed reviews
print("First 5 preprocessed reviews:")
for i in range(5):
    print(preprocessed_reviews[i])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  text = BeautifulSoup(text,'html.parser').get_text()


First 5 preprocessed reviews:
['love', 'vintag', 'spring', 'vintag', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 'spring', 'way', 'go']
['work', 'great', 'guitar', 'bench', 'mat', 'rug', 'enough', 'abus', 'take', 'care', 'take', 'care', 'make', 'organ', 'workspac', 'much', 'easier', 'becaus', 'screw', 'wo', 'n', 't', 'roll', 'around', 'color', 'good']
['use', 'everyth', 'acoust', 'bass', 'ukulel', 'know', 'smaller', 'model', 'avail', 'uke', 'violin', 'etc', 'n', 't', 'yet', 'order', 'work', 'smaller', 'instrument', 'one', 'doe', 'n', 't', 'extend', 'foot', 'maximum', 'width', 're', 'gentl', 'instrument', 'grippi', 'materi', 'keep', 'secur', 'greatest', 'benefit', 'ha', 'write', 'music', 'comput', 'need', 'set', 'guitar', 'use', 'keyboard', 'mous', 'easier', 'hang', 'stand', 'sever', 'gave', 'one', 'friend', 'christma', 'well', 've', 'use', 'mine', 'stage', 'fold', 'small', 'enough', 'fit', 'right', 'gig', 'bag']
['great', 'price', 'good', 'qualiti', 'n', 't

In [18]:
import numpy as np
import pandas as pd
import pickle
import math
from collections import Counter

def create_vocab(preprocessed_docs):
    unique_words = set()
    if isinstance(preprocessed_docs, list):
        for doc in preprocessed_docs:
            if doc is not None:
                # iterate through each word in the document
                for word in doc:
                    # add unique words to the set
                    unique_words.add(word)
        # sort unique words alphabetically and assign index to each word
        unique_words = sorted(list(unique_words))
        vocabulary = {word: idx for idx, word in enumerate(unique_words)}
        return vocabulary
    else:
        print("incorrect format")

def calculate_idf(unique_words, preprocessed_docs):
    idf_dictionary = {}
    num_docs = len(preprocessed_docs)
    for word in unique_words:
        # count the number of docs containing the word
        count = sum(1 for doc in preprocessed_docs if doc and word in doc)
        # compute IDF value for each token
        idf_dictionary[word] = float(1 + math.log((num_docs + 1) / (count + 1)))
    return idf_dictionary

def convert(preprocessed_docs, vocabulary, idf_dictionary):
    if isinstance(preprocessed_docs, list):
        tfidf_matrix = np.zeros((len(preprocessed_docs), len(vocabulary)))
        for idx, doc in enumerate(preprocessed_docs):
            if doc is not None:
                # compute tf for each word
                word_frequency = dict(Counter(doc))
                for word, freq in word_frequency.items():
                    col_index = vocabulary.get(word, -1)
                    if col_index != -1:
                        # calculate TF-IDF value
                        tf = freq / float(len(preprocessed_docs))
                        idf_value = idf_dictionary[word]
                        tfidf_matrix[idx, col_index] = tf * idf_value
        # L2 normalization
        norms = np.linalg.norm(tfidf_matrix, axis=1)[:, np.newaxis]
        # find indices where norms are zero
        zero_indices = np.where(norms == 0)[0]
        # replace zero norms with 1 to avoid division by zero
        norms[zero_indices] = 1
        # replace NaN norms with 0
        norms[np.isnan(norms)] = 0
        tfidf_matrix /= norms
        return tfidf_matrix
    else:
        print("incorrect format")

# load the data from file
data_frame = pd.read_pickle("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/preprocessed_data.pkl")

# fetch the preprocessed reviews from the 'Preprocessed_Review' column
preprocessed_docs = data_frame['Preprocessed_Review'].tolist()

# Step 1: create the vocabulary from a dataset of documents
vocab = create_vocab(preprocessed_docs)

# Step 2: calculate IDF
idf_dict = calculate_idf(list(vocab.keys()), preprocessed_docs)

# Step 3: convert to TF-IDF matrix
tfidf_matrix = convert(preprocessed_docs, vocab, idf_dict)

tfidf_vectors = [','.join(map(str, row)) for row in tfidf_matrix]

# add the merged TF-IDF vectors as a new column in the DataFrame
data_frame['TF-IDF'] = tfidf_vectors

# save the modified DataFrame with TF-IDF vectors as a new pickle file
data_frame.to_pickle("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/tfidf.pkl")


In [19]:

review_idx = 0  # Index of the first review
review = preprocessed_reviews[review_idx]
for word in review:
    if word in vocab:
        idx = vocab[word]
        tf_idf = tfidf_matrix[review_idx, idx]
        print(f"Review {review_idx + 1}, Word: '{word}', TF-IDF: {tf_idf:.4f}")

Review 1, Word: 'love', TF-IDF: 0.1476
Review 1, Word: 'vintag', TF-IDF: 0.4783
Review 1, Word: 'spring', TF-IDF: 0.5616
Review 1, Word: 'vintag', TF-IDF: 0.4783
Review 1, Word: 'strat', TF-IDF: 0.1976
Review 1, Word: 'good', TF-IDF: 0.1241
Review 1, Word: 'tension', TF-IDF: 0.2751
Review 1, Word: 'great', TF-IDF: 0.1028
Review 1, Word: 'stabil', TF-IDF: 0.2751
Review 1, Word: 'float', TF-IDF: 0.3142
Review 1, Word: 'bridg', TF-IDF: 0.2097
Review 1, Word: 'want', TF-IDF: 0.1516
Review 1, Word: 'spring', TF-IDF: 0.5616
Review 1, Word: 'way', TF-IDF: 0.1736
Review 1, Word: 'go', TF-IDF: 0.1476


In [20]:
import pandas as pd1
data = pd1.read_pickle("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/tfidf.pkl")


# print first 5 rows along with column names
print(data.head())

   Unnamed: 0                                              Image  \
0        3452  ['https://images-na.ssl-images-amazon.com/imag...   
1        1205  ['https://images-na.ssl-images-amazon.com/imag...   
2        1708  ['https://images-na.ssl-images-amazon.com/imag...   
3        2078  ['https://images-na.ssl-images-amazon.com/imag...   
4         801  ['https://images-na.ssl-images-amazon.com/imag...   

                                         Review Text  \
0  Loving these vintage springs on my vintage str...   
1  Works great as a guitar bench mat. Not rugged ...   
2  We use these for everything from our acoustic ...   
3  Great price and good quality.  It didn't quite...   
4  I bought this bass to split time as my primary...   

                                 Preprocessed_Review  \
0  [love, vintag, spring, vintag, strat, good, te...   
1  [work, great, guitar, bench, mat, rug, enough,...   
2  [use, everyth, acoust, bass, ukulel, know, sma...   
3  [great, price, good, qualit

In [21]:
import numpy as np
import pandas as pd
import pickle
import nltk
import gzip
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

with open("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/extracted_features.pkl", "rb") as f:
    data = pickle.load(f)

# Convert NumPy array to DataFrame
data_df = pd.DataFrame(data)

reviews = data_df['review']

# Initialize WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
# Initialize Porter stemmer
stemmer = PorterStemmer()

# preprocessing function
def preprocess(text):
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')

    # handling NaN reviews
    if pd.isnull(text):
        return

    # lowercase reviews
    text = text.lower()

    # parse HTML content and extract text
    text = BeautifulSoup(text,'html.parser').get_text()

    # tokenization
    tokens = nltk.word_tokenize(text)

    # stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    # lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in stemmed_tokens]

    # removing stopwords
    filtered_tokens = [word for word in lemmatized_tokens if word not in stop_words]

    # removing punctuation
    non_punct_tokens = tokenizer.tokenize(' '.join(filtered_tokens))

    # removing blank space tokens
    cleaned_tokens = [word for word in non_punct_tokens if not word.isspace()]

    return cleaned_tokens

# perform preprocessing on each review
preprocessed_reviews = [preprocess(review) for review in reviews]

# create a new DataFrame with original columns and preprocessed reviews
data_df['Preprocessed_Review'] = preprocessed_reviews

# save the new DataFrame as a pickle file
fp = "/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/new_preprocessed_data.pkl"
with open(fp, "wb") as f:
    pickle.dump(data_df, f)

# print the first 10 preprocessed reviews
print("First 50 preprocessed reviews:")
for i in range(50):
    print(preprocessed_reviews[i])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  text = BeautifulSoup(text,'html.parser').get_text()


First 50 preprocessed reviews:
['love', 'vintag', 'spring', 'vintag', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 'spring', 'way', 'go']
['work', 'great', 'guitar', 'bench', 'mat', 'rug', 'enough', 'abus', 'take', 'care', 'take', 'care', 'make', 'organ', 'workspac', 'much', 'easier', 'becaus', 'screw', 'wo', 'n', 't', 'roll', 'around', 'color', 'good']
['work', 'great', 'guitar', 'bench', 'mat', 'rug', 'enough', 'abus', 'take', 'care', 'take', 'care', 'make', 'organ', 'workspac', 'much', 'easier', 'becaus', 'screw', 'wo', 'n', 't', 'roll', 'around', 'color', 'good']
['work', 'great', 'guitar', 'bench', 'mat', 'rug', 'enough', 'abus', 'take', 'care', 'take', 'care', 'make', 'organ', 'workspac', 'much', 'easier', 'becaus', 'screw', 'wo', 'n', 't', 'roll', 'around', 'color', 'good']
['use', 'everyth', 'acoust', 'bass', 'ukulel', 'know', 'smaller', 'model', 'avail', 'uke', 'violin', 'etc', 'n', 't', 'yet', 'order', 'work', 'smaller', 'instrument', 'one', 'doe',

In [22]:
import numpy as np
import pandas as pd
import pickle
import math
from collections import Counter

def create_vocab(preprocessed_reviews):
    uniq_words = set()
    if isinstance(preprocessed_reviews, list):
        for doc in preprocessed_reviews:
            if doc is not None:
                # iterate through each word in the document
                for word in doc:
                    # add unique words to the set
                    uniq_words.add(word)
        # sort unique words alphabetically and assign index to each word
        uniq_words = sorted(list(uniq_words))
        vocab = {word: idx for idx, word in enumerate(uniq_words)}
        return vocab
    else:
        print("incorrect format")

def calculate_idf(uniq_words, preprocessed_reviews):
    idf_dict = {}
    num = len(preprocessed_reviews)
    for word in uniq_words:
        # count the number of docs containing the word
        count = sum(1 for doc in preprocessed_reviews if doc and word in doc)
        # compute IDF value for each token
        idf_dict[word] = float(1 + math.log((num + 1) / (count + 1)))
    return idf_dict

def convert(preprocessed_reviews, vocab, idf_dict):
    if isinstance(preprocessed_reviews, list):
        tfidf_matrix = np.zeros((len(preprocessed_reviews), len(vocab)))
        for idx, doc in enumerate(preprocessed_reviews):
            if doc is not None:
                # compute tf for each word
                word_freq = dict(Counter(doc))
                for word, freq in word_freq.items():
                    col_index = vocab.get(word, -1)
                    if col_index != -1:
                        # calculate TF-IDF value
                        tf = freq / float(len(preprocessed_reviews))
                        idf_ = idf_dict[word]
                        tfidf_matrix[idx, col_index] = tf * idf_
        # L2 normalization
        norms = np.linalg.norm(tfidf_matrix, axis=1)[:, np.newaxis]
        # find indices where norms are zero
        zero_indices = np.where(norms == 0)[0]
        # replace zero norms with 1 to avoid division by zero
        norms[zero_indices] = 1
        # replace NaN norms with 0
        norms[np.isnan(norms)] = 0
        tfidf_matrix /= norms
        return tfidf_matrix
    else:
        print("incorrect format")

# load the data from file
data = pd.read_pickle("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/new_preprocessed_data.pkl")

# fetch the preprocessed reviews from the 'Preprocessed_Review' column
preprocessed_reviews = data['Preprocessed_Review'].tolist()

# Step 1: create the vocabulary from a dataset of documents
vocab = create_vocab(preprocessed_reviews)

# Step 2: calculate IDF
idf_dict = calculate_idf(list(vocab.keys()), preprocessed_reviews)

# Step 3: convert to TF-IDF matrix
tfidf_matrix = convert(preprocessed_reviews, vocab, idf_dict)

tfidf_vectors = [','.join(map(str, row)) for row in tfidf_matrix]
print(tfidf_vectors)
# Add the merged TF-IDF vectors as a new column in the DataFrame
data['TF-IDF'] = tfidf_vectors



review_idx = 0  # Index of the first review
review = preprocessed_reviews[review_idx]
for word in review:
    if word in vocab:
        idx = vocab[word]
        tf_idf = tfidf_matrix[review_idx, idx]
        print(f"Review {review_idx + 1}, Word: '{word}', TF-IDF (Normalized): {tf_idf:.4f}")


# Save the modified DataFrame with TF-IDF vectors as a new pickle file
data.to_pickle("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/new_tfidf.pkl")



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
import pandas as pd1
data = pd1.read_pickle("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/new_tfidf.pkl")


# print first 5 rows along with column names
print(data.head())

       id                                          image_url  \
0  3452.0  https://images-na.ssl-images-amazon.com/images...   
1  1205.0  https://images-na.ssl-images-amazon.com/images...   
2  1205.0  https://images-na.ssl-images-amazon.com/images...   
3  1205.0  https://images-na.ssl-images-amazon.com/images...   
4  1708.0  https://images-na.ssl-images-amazon.com/images...   

                                              review  \
0  Loving these vintage springs on my vintage str...   
1  Works great as a guitar bench mat. Not rugged ...   
2  Works great as a guitar bench mat. Not rugged ...   
3  Works great as a guitar bench mat. Not rugged ...   
4  We use these for everything from our acoustic ...   

                                 normalized_features  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1  [0.0, 0.0, 0.0, 0.0, 0.33417895, 0.0, 0.0, 0.4...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.23952477...   
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  

In [24]:
import pandas as pd

data = pd.read_pickle("/content/drive/MyDrive/CSE508_Winter2024_A2_MT23029/new_tfidf.pkl")


# Print last column of the first row of the pickle file
print(data.iloc[0, -1])


0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,