In [1]:
pip install pandas nltk


DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Default location of the reviews dataset; adjust this path to your JSON file.
# NOTE(review): this is an absolute, machine-specific path — pass `path` to
# load_data() (or point this at a project-relative file) on other machines.
PATH_TO_JSON = '/Users/sarveshsahasrabudhe/Documents/HCI/Project_Phase_2_Ssahasrabudhe/public/data/Amazon_Appliances_Reviews.json'

def load_data(path=None):
    """Load the reviews dataset from a JSON file into a DataFrame.

    Args:
        path: Location of the JSON file to read. When omitted (or ``None``)
            the module-level ``PATH_TO_JSON`` is used, so existing
            ``load_data()`` calls behave exactly as before.

    Returns:
        pandas.DataFrame: The result of ``pd.read_json`` on the chosen path.
    """
    # Resolve the default at call time so a later reassignment of
    # PATH_TO_JSON is still honoured.
    source = PATH_TO_JSON if path is None else path
    return pd.read_json(source)

# Load reviews data
df = load_data()

# Derive the review year: prefer the UNIX timestamp column, otherwise fall
# back to the date-string column.
if 'unixReviewTime' in df.columns:
    df['Year'] = pd.to_datetime(df['unixReviewTime'], unit='s').dt.year
elif 'reviewTime' in df.columns:
    # Assumes 'reviewTime' is in a format pd.to_datetime can parse.
    df['Year'] = pd.to_datetime(df['reviewTime']).dt.year
else:
    # Without this branch the failure would only appear below as an opaque
    # KeyError: 'Year'.
    raise KeyError(
        "Cannot derive 'Year': neither 'unixReviewTime' nor 'reviewTime' "
        "is present in the reviews data"
    )

# Filter reviews between 2011 and 2014.
# .copy() detaches the result from the unfiltered frame so later column
# assignments do not trigger SettingWithCopyWarning; reset_index renumbers the
# rows 0..n-1 so index labels coincide with row positions.
df = df[df['Year'].isin([2011, 2012, 2013, 2014])].copy().reset_index(drop=True)

# English stopword list from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with English stopwords removed.

    The text is tokenized with NLTK's ``word_tokenize`` and the surviving
    tokens are re-joined with single spaces.

    Args:
        text: The review text. Non-string values (for example NaN for a
            missing review) are treated as empty text.

    Returns:
        str: Space-joined tokens that are not stopwords; ``''`` when `text`
        is not a string.
    """
    # Missing reviews arrive as non-string values; tokenizing them would fail.
    if not isinstance(text, str):
        return ''
    word_tokens = word_tokenize(text)
    # NLTK's stopword entries are lowercase, so compare on the lowercased
    # token; otherwise capitalised stopwords such as "The" or "I" slip through.
    return ' '.join(word for word in word_tokens if word.lower() not in stop_words)

# Strip stopwords from every review. assign() builds a new frame, so this is
# safe even when `df` is a filtered slice of a larger frame.
df = df.assign(reviewText=df['reviewText'].apply(remove_stopwords))

# Calculate TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['reviewText'])

# Calculate similarity matrix; row/column i corresponds to the i-th ROW
# POSITION of df (not to its index label).
similarity_matrix = cosine_similarity(tfidf_matrix)

# Positional lookup of the cleaned texts, aligned with similarity_matrix.
review_texts = df['reviewText'].tolist()

# Get top 3 similar reviews for each year
results = {}
for year in [2011, 2012, 2013, 2014]:
    year_mask = df['Year'] == year
    if not year_mask.any():
        continue

    # Row positions of this year's reviews. Index labels must not be used
    # here: after filtering they no longer match positions in the matrix.
    positions = year_mask.to_numpy().nonzero()[0]
    year_similarities = similarity_matrix[positions, :][:, positions]

    # A review can have at most len(positions) - 1 neighbours within its year.
    neighbour_count = min(3, len(positions) - 1)

    for idx, position in enumerate(positions):
        row = year_similarities[idx].copy()
        # Exclude the review itself explicitly. Cosine similarity of TF-IDF
        # vectors is never negative, so -1 always sorts last; relying on
        # "self ranks first" breaks for duplicate or empty reviews.
        row[idx] = -1.0
        # Stable descending order: ties keep their original row order.
        nearest = (-row).argsort(kind='stable')[:neighbour_count]
        results.setdefault(year, []).append({
            'Base Review': review_texts[position],
            # Map within-year ranks back to positions in the full frame.
            'Top Similar Reviews': [review_texts[positions[j]] for j in nearest]
        })

# Save results to a JSON file
with open('top_similar_reviews_by_year.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Top similar reviews saved to 'top_similar_reviews_by_year.json'")


In [None]:
import json
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the JSON file
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = '/Users/sarveshsahasrabudhe/Documents/HCI/Project_Phase_2_Ssahasrabudhe/public/data/Amazon_Appliances_Reviews.json'
# Decode explicitly as UTF-8 instead of the platform default encoding, which
# can differ between machines and mis-decode non-ASCII review text.
with open(file_path, 'r', encoding='utf-8') as file:
    reviews = json.load(file)

# Group reviews by year
reviews_by_year = defaultdict(list)
for review in reviews:
    # Assumes 'reviewTime' ends with a four-digit year; the key stays a string.
    year = review['reviewTime'][-4:]
    reviews_by_year[year].append(review)

# Define function to find top 3 most similar reviews
def top_3_similar_reviews(reviews):
    """Return, for each review, the indices of its most similar peers.

    Reviews are compared by cosine similarity of TF-IDF vectors fitted on the
    given list only.

    Args:
        reviews: Sequence of dicts, each holding its text under 'reviewText'.

    Returns:
        list[list[int]]: One entry per review, in input order. Entry ``i``
        holds up to three indices into `reviews`, most similar first, and
        never contains ``i`` itself. With fewer than two reviews every entry
        is an empty list.
    """
    texts = [review['reviewText'] for review in reviews]
    # Nothing to compare against; skip vectorizing altogether.
    if len(texts) < 2:
        return [[] for _ in texts]

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

    top_3_similar_indices = []
    for i, similarity_row in enumerate(similarities):
        # Remove the review's own index explicitly instead of slicing off
        # rank 0: a duplicate or tied review can outrank it, in which case the
        # slice would report the review as its own neighbour.
        candidates = [j for j in range(len(similarity_row)) if j != i]
        candidates.sort(key=lambda j: similarity_row[j], reverse=True)
        top_3_similar_indices.append(candidates[:3])
    return top_3_similar_indices

# Find top 3 most similar reviews for each year
# The per-year list gets its own name so the module-level `reviews` (the full
# dataset) is not overwritten by the loop.
for year, year_reviews in reviews_by_year.items():
    print(f"Year: {year}")
    top_3_indices = top_3_similar_reviews(year_reviews)
    for i, indices in enumerate(top_3_indices):
        # Block i+1 lists the neighbours of the (i+1)-th review of this year.
        print(f"Similar Reviews {i+1}:")
        for index in indices:
            print(year_reviews[index]['reviewText'])
        print()


Year: 2013


In [None]:
import json
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to group reviews by year
def group_reviews_by_year(reviews, years=(2011, 2012, 2013, 2014)):
    """Bucket reviews by the year at the end of their 'reviewTime' string.

    Args:
        reviews: Iterable of dicts, each with a 'reviewTime' string whose last
            four characters are the year.
        years: Years to keep; reviews from any other year are dropped.
            Defaults to 2011-2014.

    Returns:
        defaultdict[int, list]: Maps each kept year to its reviews, in input
        order. Years with no reviews have no key.

    Raises:
        KeyError: If a review has no 'reviewTime' entry.
        ValueError: If the last four characters are not an integer.
    """
    wanted_years = frozenset(years)
    reviews_by_year = defaultdict(list)
    for review in reviews:
        year = int(review['reviewTime'][-4:])
        if year in wanted_years:
            reviews_by_year[year].append(review)
    return reviews_by_year

# Function to find top 3 most similar reviews for each year
def top_3_similar_reviews(reviews):
    """Return, for each review, the indices of its most similar peers.

    Reviews are compared by cosine similarity of TF-IDF vectors fitted on the
    given list only.

    Args:
        reviews: Sequence of dicts, each holding its text under 'reviewText'.

    Returns:
        list[list[int]]: One entry per review, in input order. Entry ``i``
        holds up to three indices into `reviews`, most similar first, and
        never contains ``i`` itself. With fewer than two reviews every entry
        is an empty list.
    """
    texts = [review['reviewText'] for review in reviews]
    # Nothing to compare against; skip vectorizing altogether.
    if len(texts) < 2:
        return [[] for _ in texts]

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

    top_3_similar_indices = []
    for i, similarity_row in enumerate(similarities):
        # Remove the review's own index explicitly instead of slicing off
        # rank 0: a duplicate or tied review can outrank it, in which case the
        # slice would report the review as its own neighbour.
        candidates = [j for j in range(len(similarity_row)) if j != i]
        candidates.sort(key=lambda j: similarity_row[j], reverse=True)
        top_3_similar_indices.append(candidates[:3])
    return top_3_similar_indices

# Read the JSON file (path is relative to the notebook's working directory).
# Decode explicitly as UTF-8 instead of the platform default encoding, which
# can differ between machines and mis-decode non-ASCII review text.
with open('Amazon_Appliances_Reviews.json', 'r', encoding='utf-8') as file:
    reviews = json.load(file)

# Group reviews by year
reviews_by_year = group_reviews_by_year(reviews)

# Dictionary to store results: year -> one list of similar review texts per
# review of that year, in the same order as the year's reviews.
results = {}

# Find and store top 3 most similar reviews for each year.
# The per-year list gets its own name so the module-level `reviews` (the full
# dataset) is not overwritten by the loop.
for year, year_reviews in reviews_by_year.items():
    top_3_indices = top_3_similar_reviews(year_reviews)
    results[year] = [
        [year_reviews[index]['reviewText'] for index in indices]
        for indices in top_3_indices
    ]

# Save results to a JSON file
with open('similar_reviews_results.json', 'w', encoding='utf-8') as file:
    json.dump(results, file, indent=4)
