## Datapreprocessing and loading

In [1]:
import json

def load_json(file_path):
    with open(file_path, 'r') as file:
        data = [json.loads(line) for line in file]
    return data
file_path = 'Cell_Phones_and_Accessories_5.json'
reviews_data = load_json(file_path)
# Storing in a list of dictionaries
reviews_list = []
for x in reviews_data:
    reviews_list.append(x)
for i, item in enumerate(reviews_list, start=1):
    print(f"Item {i}: {x}")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [2]:
num_reviews = len(reviews_list)
print("Number of reviews:", num_reviews) 

sample_item = reviews_data[0]  
columns = list(sample_item.keys())  #to get columns of the data
print("Columns of the dataset:", columns)

# Filter dataset to retain necessary columns 
filtered_reviews = [{'reviewText': review['reviewText'], 'summary': review['summary'], 'overall': review['overall']} for review in reviews_data]
print(filtered_reviews)

Number of reviews: 194439
Columns of the dataset: ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 'unixReviewTime', 'reviewTime']


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [3]:
#Creating a list of neccessary columns only
reviews_subset = [{'reviewText': item['reviewText'], 'summary': item['summary'], 'overall': item['overall']} for item in reviews_data]
for i, review in enumerate(reviews_subset[:10], start=1):
    review_text = review['reviewText']
    summary = review['summary']
    overall = review['overall']
    print(f"Review {i}:\nReview Text: {review_text}\nSummary: {summary}\nOverall: {overall}\n")

Review 1:
Review Text: They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again
Summary: Looks Good
Overall: 4.0

Review 2:
Review Text: These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)
Summary: Really great product.
Overall: 5.0

Review 3:
Review Text: These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!
Summary: LOVE LOVE LOVE
Overall: 5.0

Review 4:
Review Text: Item arrived in great time and was in perfect condition. However, I ordered these buttons because they were a great deal and included a FREE screen protector. I never received one. Though its not a big deal, it would've been nice to get it since they claim it comes with on

## Removing stopwords and punctuation

In [4]:
import string
import re 
#function to remove stopwords
def remove_stopwords(text):
    stop_words = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up",
    "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when",
    "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor",
    "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
}
    
    # Spliting  text into words
    words = text.split()
    # Retain only non-stop words
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Joining the nonstop words back into a single string
    return ' '.join(filtered_words)

def remove_stopwordsfromreviews(reviews_subset):
    for review in reviews_subset:
        if 'reviewText' in review:
            review['reviewText'] = remove_stopwords(review['reviewText'])
        if 'summary' in review:
            review['summary'] = remove_stopwords(review['summary'])
# Removing stop words 
remove_stopwordsfromreviews(reviews_subset)

# function to remove punctuation
def remove_punctuation(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text 

# Removing punctuation 
for review in reviews_subset:
    if 'reviewText' in review:
        review['reviewText'] = remove_punctuation(review['reviewText'])
    if 'summary' in review:
        review['summary'] = remove_punctuation(review['summary'])

for i, review in enumerate(reviews_subset[:10], start=1):
    print(f"Review {i}: {review}")

Review 1: {'reviewText': 'look good stick good dont like rounded shape always bumping siri kept popping irritating wont buy product like', 'summary': 'looks good', 'overall': 4.0}
Review 2: {'reviewText': 'stickers work like review says do stick great stay phone super stylish share sister ', 'summary': 'really great product', 'overall': 5.0}
Review 3: {'reviewText': 'awesome make phone look stylish used one far almost year believe that one year great quality', 'summary': 'love love love', 'overall': 5.0}
Review 4: {'reviewText': 'item arrived great time perfect condition however ordered buttons great deal included free screen protector never received one though big deal wouldve nice get since claim comes one', 'summary': 'cute', 'overall': 4.0}
Review 5: {'reviewText': 'awesome stays on looks great used multiple apple products especially nails helps elevated key', 'summary': 'leopard home button sticker iphone 4s', 'overall': 5.0}
Review 6: {'reviewText': 'make using home button easy d

## Thematic analysis

In [5]:
def divide_reviewsbyrating(reviews):
    positive_reviews = []
    negative_reviews = []         #function to divide positive and negative reviews on the basis of rating

    for review in reviews:
        if review['overall'] >= 4:
            positive_reviews.append(review['reviewText'])
        elif review['overall'] <= 3:
            negative_reviews.append(review['reviewText'])

    return positive_reviews, negative_reviews

positive_reviews, negative_reviews = divide_reviewsbyrating(reviews_subset)

print("Number of Positive Reviews:", len(positive_reviews))
print("Number of Negative Reviews:", len(negative_reviews))


Number of Positive Reviews: 148657
Number of Negative Reviews: 45782


In [6]:
def create_bigrams(reviews):
    all_bigrams = []
    for review in reviews:
        words = review.lower().split()
        # Creating bigrams
        review_bigrams = [(words[i], words[i+1]) for i in range(len(words) - 1)]
        all_bigrams.extend(review_bigrams)
    return all_bigrams

# Creating bigrams for positive and negative reviews separately
positive_bigrams = create_bigrams(positive_reviews)
negative_bigrams = create_bigrams(negative_reviews)

print("Positive Bigrams:")
print(positive_bigrams[:15])  
print("\nNegative Bigrams:")
print(negative_bigrams[:15]) 


Positive Bigrams:
[('look', 'good'), ('good', 'stick'), ('stick', 'good'), ('good', 'dont'), ('dont', 'like'), ('like', 'rounded'), ('rounded', 'shape'), ('shape', 'always'), ('always', 'bumping'), ('bumping', 'siri'), ('siri', 'kept'), ('kept', 'popping'), ('popping', 'irritating'), ('irritating', 'wont'), ('wont', 'buy')]

Negative Bigrams:
[('make', 'using'), ('using', 'home'), ('home', 'button'), ('button', 'easy'), ('easy', 'daughter'), ('daughter', 'like'), ('like', 'them'), ('them', 'would'), ('would', 'purchase'), ('purchase', 'again'), ('again', 'well'), ('well', 'worth'), ('worth', 'price'), ('worked', 'first'), ('first', 'week')]


In [7]:
def count_bigram_frequency(reviews):
    bigram_frequency = {}
    for review in reviews:
        words = review.lower().split()
        # Creating bigrams
        review_bigrams = [(words[i], words[i+1]) for i in range(len(words) - 1)]
        # Counting bigram frequency
        for bigram in review_bigrams:
            bigram_frequency[bigram] = bigram_frequency.get(bigram, 0) + 1
    return bigram_frequency
# Counting frequency of bigrams in positive and negative reviews
positive_bigram_frequency = count_bigram_frequency(positive_reviews)
negative_bigram_frequency = count_bigram_frequency(negative_reviews)
positive_bigram_frequency_list = list(positive_bigram_frequency.items())
negative_bigram_frequency_list = list(negative_bigram_frequency.items())

x = positive_bigram_frequency_list[:10]
print("Positive Bigram Frequency:")
for bigram, frequency in x:
    print(bigram, ":", frequency)
    
y= negative_bigram_frequency_list[:10]
print("Negative Bigram Frequency:")
for bigram, frequency in y:
    print(bigram, ":", frequency)


Positive Bigram Frequency:
('look', 'good') : 351
('good', 'stick') : 7
('stick', 'good') : 8
('good', 'dont') : 111
('dont', 'like') : 1624
('like', 'rounded') : 10
('rounded', 'shape') : 6
('shape', 'always') : 1
('always', 'bumping') : 4
('bumping', 'siri') : 1
Negative Bigram Frequency:
('make', 'using') : 6
('using', 'home') : 11
('home', 'button') : 394
('button', 'easy') : 21
('easy', 'daughter') : 2
('daughter', 'like') : 2
('like', 'them') : 34
('them', 'would') : 32
('would', 'purchase') : 96
('purchase', 'again') : 64


## Sentimental analysis

In [8]:
def determine_sentiment(review, positive_bigram_frequency, negative_bigram_frequency, positive_bigrams, negative_bigrams):
    words = review.lower().split()
    review_bigrams = [(words[i], words[i+1]) for i in range(len(words) - 1)]
    # Calculating total frequency of positive and negative bigrams
    total_positive_frequency = sum(positive_bigram_frequency.get(bigram, 0) for bigram in review_bigrams if bigram in positive_bigrams)
    total_negative_frequency = sum(negative_bigram_frequency.get(bigram, 0) for bigram in review_bigrams if bigram in negative_bigrams)
    # Determining sentiment based on bigram frequencies
    if total_positive_frequency > total_negative_frequency:
        return 'Positive'
    elif total_positive_frequency < total_negative_frequency:
        return 'Negative'
    else:
        return 'Neutral'

# Precompute total frequencies of positive and negative bigrams
total_positive_frequency = sum(positive_bigram_frequency.values())
total_negative_frequency = sum(negative_bigram_frequency.values())

with open("output.txt", "w") as file:
    for review in reviews_subset[:10]:
        sentiment = determine_sentiment(review["reviewText"], positive_bigram_frequency, negative_bigram_frequency, positive_bigrams, negative_bigrams)
        file.write(f"Review Text: {review['reviewText']} | Frequency: {len(review['reviewText'].split())} | Sentiment: {sentiment}\n")
