<a href="https://colab.research.google.com/github/reolingovender/MIT805/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets

In [None]:
pip install pyspark

In [None]:
pip install emoji

In [None]:
pip uninstall -y tensorflow

In [None]:
pip install tensorflow-cpu

In [None]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths
# used for the dataset, the saved model and the results can be resolved.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import datasets
from datasets import load_dataset, load_from_disk
import numpy as np
import pandas as pd
import emoji
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
import numpy as np
import re
import string
import nltk
from gensim.parsing.preprocessing import remove_stopwords
from transformers import pipeline
from pyspark.sql.functions import udf, col

In [None]:
# Initialize Spark session
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Build (or reuse) the Spark session, one builder call per setting.
# NOTE(review): with master "local[*]" the work runs inside the driver JVM, so the
# executor memory/cores values presumably have no effect, and the 50g driver heap
# may exceed the RAM of the runtime — confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    .appName("ResourceOptimizedSession")
    .master("local[*]")
    .config("spark.executor.memory", "100g")
    .config("spark.driver.memory", "50g")
    .config("spark.executor.cores", "8")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

In [None]:
# Only let the `datasets` library log errors.
datasets.logging.set_verbosity_error()

# Year that selects which merged Books dataset is read from Drive
# (also used later to name the results folder).
year = 2022
dataset = load_from_disk(f"/content/drive/MyDrive/MIT805/Merged/Books_{year}")

In [None]:
# Materialise the Hugging Face dataset as a pandas DataFrame.
df = dataset.to_pandas()


def _join_categories(categories):
    """Collapse one row's category sequence into a single ', '-separated string.

    Args:
        categories: Sequence of category strings for one row, or None.

    Returns:
        The categories joined with ', ', or '' when the value is None or empty.
        (The previous lambda called len() directly and so raised TypeError on None.)
    """
    if categories is None or len(categories) == 0:
        return ''
    return ', '.join(categories)


# Flatten the per-row category sequences into plain strings.
df['categories'] = df['categories'].apply(_join_categories).astype(str)

# Keep only the review and book columns listed here, in this order.
df = df[['rating', 'title', 'text', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase', 'book_title', 'price', 'store', 'categories']]
df.head()

In [None]:
# Convert the pandas DataFrame to a PySpark DataFrame
pyspark_df_reviews = spark.createDataFrame(df)

pyspark_df_reviews = (
    pyspark_df_reviews
    # Reformat 'timestamp' as a 'yyyy-MM-dd' string. The division by 1000 presumably
    # converts epoch milliseconds to the seconds from_unixtime expects — TODO confirm.
    .withColumn("timestamp", f.date_format(f.from_unixtime(col("timestamp") / 1000), "yyyy-MM-dd"))
    # Keep rows with a positive price. The comparison is done on a double: casting
    # to int truncates the fractional part, so prices below 1 (e.g. 0.99) became 0
    # and were filtered out.
    .filter("cast(price as double) > 0")
    # Drop every row that has a null in any column.
    .na.drop()
)

# Preview the first rows of the prepared DataFrame
pyspark_df_reviews.show()

In [None]:
# Row count of the Spark DataFrame after the price filter and null drop.
pyspark_df_reviews.count()

In [None]:
n = 100000  # number of reviews to keep in the random sample
SEED = 42   # fixed seed so the shuffle below is repeatable

# Draw the sample by shuffling on a random key and keeping the first n rows.
# Spark DataFrames are lazy: with an unseeded f.rand() and no cache, every later
# action (count, write, ...) would re-run the shuffle and see a different sample.
# Seeding the shuffle and caching the result keeps one sample for the whole run.
pyspark_df_reviews = pyspark_df_reviews.orderBy(f.rand(seed=SEED)).limit(n).cache()
pyspark_df_reviews.count()

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Drive directory that holds the saved model and tokenizer files
save_directory = "/content/drive/MyDrive/MIT805/sentiment_analysis_model/saved_model"

# Restore the tokenizer and the sequence-classification model from that directory
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

# Confirmation message once both artefacts are loaded
print("Model and tokenizer loaded successfully!")

# Wrap model and tokenizer in a "sentiment-analysis" pipeline used for scoring
classifier = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# Precompiled pattern matching any run of one or more characters outside a-z
cleanup_re = re.compile(r'[^a-z]+')

# Convert emoji to words
def demoji(text):
    """Return ``text`` with emoji replaced by the output of ``emoji.demojize``.

    ``None`` is passed through untouched so that a null column value does not
    raise inside the Spark UDF this function is registered as.

    Args:
        text: The string to convert, or None.

    Returns:
        The converted string, or None when ``text`` is None.
    """
    if text is None:
        return None
    return emoji.demojize(text)

# Clean up the text
def cleanup(sentence):
    """Lower-case ``sentence`` and reduce it to a-z words separated by single spaces.

    The value is converted with ``str()``, lower-cased, every run of characters
    matched by the module-level ``cleanup_re`` is replaced by one space, and
    leading/trailing whitespace is stripped.

    ``None`` is returned as ``None``: previously ``str(None).lower()`` turned a
    missing review into the literal word "none", which would then be scored as
    if it were real text.

    Args:
        sentence: The text to clean (any value accepted by ``str()``), or None.

    Returns:
        The cleaned string, or None when ``sentence`` is None.
    """
    if sentence is None:
        return None
    lowered = str(sentence).lower()
    return cleanup_re.sub(' ', lowered).strip()

# Classify the sentiment and return the score
def classifier_func(text):
    """Score ``text`` with the module-level ``classifier`` pipeline.

    Args:
        text: The cleaned review text, or None.

    Returns:
        The ``'score'`` field of the first prediction as a float, or None when
        ``text`` is None or classification raises.

    NOTE(review): only the score is returned and the predicted label is
    discarded — confirm that downstream wants the score alone.
    """
    if text is None:
        # Nothing to score; skip the model call and the error print.
        return None
    try:
        # truncation=True asks the tokenizer to clip over-length inputs. Without
        # it, reviews longer than the model's maximum length would typically
        # raise here, be swallowed below, and be lost to the later na.drop().
        return float(classifier(text, truncation=True)[0]['score'])
    except Exception as e:
        # Deliberate best-effort: one bad row must not abort the whole Spark job.
        print(f"Error processing text: {text}, {e}")
        return None

In [None]:
from pyspark.sql.types import FloatType, IntegerType, StringType

# Wrap the Python helpers as Spark UDFs, declaring each return type
demoji_udf = udf(demoji, StringType())
cleanup_udf = udf(cleanup, StringType())
classifier_udf = udf(classifier_func, FloatType())

# Build the scored DataFrame: emoji -> text, then text clean-up, then model score
pyspark_df_reviews = (
    pyspark_df_reviews
    .withColumn("cleaned_text", demoji_udf(col("text")))
    .withColumn("cleaned_text", cleanup_udf(col("cleaned_text")))
    .withColumn("sentiment_score", classifier_udf(col("cleaned_text")))
)

# Keep only the output columns and drop rows containing nulls (this includes rows
# whose classification returned None). The result is cached because the notebook
# runs two actions on it afterwards (count and CSV write); without the cache each
# action would re-run the classifier UDF over every row.
pyspark_df_reviews = (
    pyspark_df_reviews
    .select("title", "rating", "helpful_vote", 'asin', 'parent_asin', 'timestamp', 'verified_purchase', 'sentiment_score')
    .na.drop()
    .cache()
)

In [None]:
# Number of reviews remaining after scoring and dropping rows with nulls.
pyspark_df_reviews.count()

In [None]:
# Save the scored reviews as CSV with a header row in the per-year results folder
# on Drive, overwriting any output already there.
(
    pyspark_df_reviews.write
    .mode("overwrite")
    .option("header", True)
    .csv(f"/content/drive/MyDrive/MIT805/Results/Sentiment_Analysis/{year}/")
)