In [None]:
# Import the other required packages and modules.
import pandas as pd
import datetime
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search


# From the IPython.display package, import display and Markdown
from IPython.display import display, Markdown

import torchvision

In [None]:
# Read the review data into `runway`, the DataFrame that every later cell reads.
# NOTE(review): this path is machine-specific; point DATA_PATH at your own copy of the CSV.
DATA_PATH = "/Users/prasannasundar/Projects/Using OpenAI and Langchain/NLP and AI /data/netflix_dataset.csv"
runway = pd.read_csv(DATA_PATH)
# NOTE(review): no column is parsed as datetime here; the yearly chart further down
# expects a 'year' column to already be present in the CSV -- confirm.

# DataFrame.info() prints its own report and returns None, so it is called directly
# instead of being wrapped in print(), which would add a stray "None" line.
runway.info()

display(runway.head())

In [None]:
## Pre Processing

# Translation table that deletes every character listed in string.punctuation.
# str.translate needs a table like this; handing it the punctuation string itself
# does not strip anything.
punctuation_table = str.maketrans("", "", string.punctuation)

# Build the cleaned column from the raw review text in a single chain.
runway['review_text_cleaned'] = (
    runway['review_text']
    # remove forward slashes (plain literal match, no regex needed)
    .str.replace("/", "", regex=False)
    # remove punctuation
    .str.translate(punctuation_table)
    # remove digits; regex=True is explicit because recent pandas versions treat
    # the pattern as a literal string by default
    .str.replace(r"\d+", "", regex=True)
    # collapse runs of 2+ whitespace characters into ONE space, so that
    # neighbouring words are not glued together
    .str.replace(r"\s{2,}", " ", regex=True)
    # make text lowercase
    .str.lower()
)

In [None]:
# Spot-check a single cleaned review (the entry at index 4).
cleaned_reviews = runway['review_text_cleaned']
cleaned_reviews[4]

## Sentiment Analysis

In [None]:
# DistilBERT-based sentiment model, identified by its Hugging Face checkpoint name.
senti_model = "distilbert-base-uncased-finetuned-sst-2-english"

# Instantiate the sentiment-analysis pipeline around that checkpoint.
sentimentAnalysis = pipeline("sentiment-analysis", model=senti_model)

# Run on the cleaned review text.
# truncation=True clips any review that is longer than the model's maximum input
# length instead of letting the whole call fail on that review.
sent_analysis_output = sentimentAnalysis(
    list(runway["review_text_cleaned"]),
    truncation=True,
)

## Histogram of Sentiment Score




In [None]:

# Split the pipeline output into parallel lists: one label and one score per review.
labels, scores = [], []
for result in sent_analysis_output:
    labels.append(result['label'])
    scores.append(result['score'])

runway['clean_sentiment_category'] = labels
runway['clean_sentiment_score'] = scores

# Histogram of the scores, labelled through the Axes object that seaborn returns.
ax = sns.histplot(data=runway, x="clean_sentiment_score", bins=20)
ax.figure.suptitle("Distribution of sentiment score")
ax.set_xlabel("Sentiment Score")
ax.set_ylabel("Count of reviews")
ax.set_title("For Clean Review Text")

In [None]:
#insight
# Most reviews are positive

In [None]:
# Reviews per year and sentiment category: count the non-null user_id values in
# each (year, category) group and expose that count as 'cnt'.
chart_data = (
    runway
    .groupby(['year', 'clean_sentiment_category'], as_index=False)['user_id']
    .count()
    .rename(columns={'user_id': 'cnt'})
)

# Grouped bar chart: one bar per sentiment category within each year.
ax = sns.barplot(
    data=chart_data,
    x="year",
    y="cnt",
    hue="clean_sentiment_category",
    errorbar=None,
)
ax.set_title("Sentiment between 2016 - 2023")
ax.set_xlabel("Review Year")
ax.set_ylabel("Count of Reviews")
plt.show()

In [None]:
# Reviews have increased every year until 2022. The % of positive reviews has also been on an increasing trend. 2021 was a game changer!

In [None]:
# Check whether using the unprocessed review text makes any difference to the
# predicted sentiment category.

# truncation=True clips any review that is longer than the model's maximum input
# length instead of letting the whole call fail on that review.
sent_analysis_output2 = sentimentAnalysis(list(runway["review_text"]), truncation=True)

# Parse the output into a sentiment label and a score per review.
# NOTE: despite the "clean_" prefix, the "...2" columns hold the predictions for the
# RAW (unprocessed) text; the names are kept because the following cell reads them.
runway['clean_sentiment_category2'] = [s['label'] for s in sent_analysis_output2]
runway['clean_sentiment_score2'] = [s['score'] for s in sent_analysis_output2]


In [None]:
# Confusion matrix: sentiment predicted from the cleaned text (rows) against
# sentiment predicted from the raw text (columns). Cross-tabulating a column with
# itself would only ever fill the diagonal and say nothing about agreement, so the
# two different prediction columns are compared here. The off-diagonal cells are
# the reviews whose label changed with preprocessing.
display(
    pd.crosstab(
        runway['clean_sentiment_category'],
        runway['clean_sentiment_category2'],
        rownames=['cleaned text'],
        colnames=['raw text'],
        margins=True,
    )
)

In [None]:
''' Insight: we got almost the same sentiment categories from the unprocessed review data.
This is because the transformer model (DistilBERT) uses self-attention to gain context from the token sequences in the text, which allows it to work well with raw, unstructured text as is.
For this particular application we can use the unprocessed data. '''

In [None]:
# Sentence-embedding model, identified by its Hugging Face checkpoint name.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_id)

# Encode every value of the "rented for" column; these embeddings are the corpus
# that the semantic search below is run against.
rented_for_texts = list(runway["rented for"])
embeddings = model.encode(rented_for_texts)

print(embeddings.shape)




## Semantic Search

In [None]:
# Free-text query to look up in the embedded corpus.
query = "a gorgeous and flattering dress"

# Encode the query with the same model that produced the corpus embeddings.
query_emb = model.encode(query, convert_to_tensor=True)

# Retrieve the 3 corpus entries most similar to the query.
hits = semantic_search(
    query_embeddings=query_emb,
    corpus_embeddings=embeddings,
    top_k=3,
)


In [None]:

# Print the top similar items.
# hits[0] is the result list for the single query above; each hit's 'corpus_id' is
# used as the positional row index into `runway`.
for hit in hits[0]:
    row = runway.iloc[hit['corpus_id']]
    # One formatted line per hit; passing the pieces to display() as separate
    # arguments would render each of them as its own output.
    print(
        f"ITEM ID: {row['item_id']}"
        f"; RENTED FOR: {row['rented for']}"
        f"; REVIEW: {row['review_text']}"
        f"; clean_sentiment_category: {row['clean_sentiment_category']}"
    )