In [9]:
pip install annoy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from annoy import AnnoyIndex

In [11]:

# Load the review corpus from a CSV file located in the working directory.
df = pd.read_csv("IMDB Dataset.csv")


# Preview the first rows; the recorded output shows `review` and `sentiment` columns.
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
# TF-IDF vectoriser configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')


# Fit the vectoriser on every review and encode the whole corpus in one call.
tfidf_matrix = tfidf.fit_transform(df['review'])


# Display the matrix dimensions; the recorded run shows (50000, 101583).
tfidf_matrix.shape

(50000, 101583)

In [13]:
# Reduce the TF-IDF matrix to 100 components.
# TruncatedSVD's default solver is randomized, so the seed is pinned; without it the
# embedding (and every neighbour lookup built on top of it) changes from run to run.
svd = TruncatedSVD(n_components=100, random_state=42)
reduced_matrix = svd.fit_transform(tfidf_matrix)

# Display the reduced dimensions; the recorded run shows (50000, 100).
reduced_matrix.shape

(50000, 100)

In [14]:
# Nearest-neighbour index over the reduced review vectors.
# The index is created with the same vector length as the reduced matrix has columns;
# 'angular' is the metric name handed to Annoy.
annoy_index = AnnoyIndex(reduced_matrix.shape[1], 'angular')

# Register every row under its row position, so ids returned by the index can be
# used directly as positional row numbers.
for row_position, embedding in enumerate(reduced_matrix):
    annoy_index.add_item(row_position, embedding)

# Build with 10 trees (more trees give more accuracy but slower).
annoy_index.build(10)




True

In [15]:
def recommend_movies_with_annoy(review_text, df, tfidf, svd, annoy_index, num_recommendations=5):
    """Return the rows of ``df`` whose reviews are nearest to ``review_text``.

    The text is vectorised with ``tfidf``, projected with ``svd``, and the
    resulting vector is looked up in ``annoy_index``.

    Parameters
    ----------
    review_text : str
        Free-text review used as the query.
    df : pandas.DataFrame
        Frame from which the matching rows are taken by position.
    tfidf : object
        Fitted vectoriser exposing ``transform``.
    svd : object
        Fitted reducer exposing ``transform``.
    annoy_index : object
        Index exposing ``get_nns_by_vector``.
    num_recommendations : int, default 5
        Number of neighbours requested from the index.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` selected with ``iloc``, in the order the index returned them.
    """
    # The vectoriser works on batches, so the single query is wrapped in a list
    # and the first (only) projected row is taken back out.
    query_vector = svd.transform(tfidf.transform([review_text]))[0]

    neighbour_positions = annoy_index.get_nns_by_vector(query_vector, num_recommendations)

    return df.iloc[neighbour_positions]

# Try the recommender on one short, hand-written review, using the default
# number of recommendations.
sample_review = "I loved the acting and the storyline was great!"
recommended_reviews = recommend_movies_with_annoy(sample_review, df, tfidf, svd, annoy_index)

# Display the matched rows.
recommended_reviews


Unnamed: 0,review,sentiment
24613,Great job! Was very exciting and had great stu...,positive
42797,"Great movie, great actors, great soundtrack! I...",positive
35480,This one was a surprise and better than most f...,positive
41338,This show is without a doubt one of the greate...,positive
41225,"This movie has an all star cast, John Candy, R...",positive


In [16]:
from sklearn.metrics import precision_score, recall_score

def evaluate_recommendations(input_review, df, tfidf, svd, annoy_index, num_recommendations=5):
    """Score how well the recommended reviews agree with the query's sentiment.

    The recommendations come from ``recommend_movies_with_annoy`` and the
    reference sentiment from ``determine_sentiment``. The reference is therefore
    an inferred label, not a ground truth.

    Parameters
    ----------
    input_review : str
        Query review text.
    df : pandas.DataFrame
        Corpus frame; its ``sentiment`` column supplies the labels.
    tfidf, svd, annoy_index
        Fitted vectoriser, fitted reducer and built index, passed through to the
        helper functions.
    num_recommendations : int, default 5
        Number of recommendations to request.

    Returns
    -------
    tuple
        ``(precision, recall)`` with the query's sentiment as the positive class.
        ``(0.0, 0.0)`` when no recommendation was returned.

    Notes
    -----
    Every reference label equals the query's sentiment, so recall is the share
    of recommendations carrying that sentiment, while precision is 1.0 as soon
    as at least one recommendation matches and 0.0 otherwise.
    """
    recommended_reviews = recommend_movies_with_annoy(
        input_review, df, tfidf, svd, annoy_index, num_recommendations
    )

    # Reference label: the sentiment attributed to the query itself.
    input_sentiment = determine_sentiment(input_review, tfidf, svd, df)

    # Labels of the recommended reviews.
    y_pred = recommended_reviews['sentiment'].tolist()
    if not y_pred:
        # Nothing was recommended, so there is nothing to score.
        return 0.0, 0.0

    # Size the reference from what was actually returned; the index may return
    # fewer rows than requested, and the metric functions need equal lengths.
    y_true = [input_sentiment] * len(y_pred)

    # Use the query's own sentiment as the positive class. A hard-coded
    # 'positive' made every query judged negative score 0 regardless of its neighbours.
    # zero_division=0 keeps the undefined 0/0 case at 0 without emitting a warning.
    precision = precision_score(y_true, y_pred, pos_label=input_sentiment, zero_division=0)
    recall = recall_score(y_true, y_pred, pos_label=input_sentiment, zero_division=0)

    return precision, recall

def determine_sentiment(review_text, tfidf, svd, df, corpus_reduced=None):
    """Return the sentiment label of the corpus review most similar to ``review_text``.

    Similarity is the cosine similarity between the reduced vector of the query
    and the reduced vectors of the corpus reviews.

    Parameters
    ----------
    review_text : str
        Query review text.
    tfidf : object
        Fitted vectoriser exposing ``transform``.
    svd : object
        Fitted reducer exposing ``transform``.
    df : pandas.DataFrame
        Corpus frame with ``review`` and ``sentiment`` columns.
    corpus_reduced : array-like, optional
        Precomputed reduced vectors of ``df['review']``, one row per row of
        ``df`` and in the same order. When omitted, the whole corpus is
        vectorised and projected again on every call, which is the original
        (slow) behaviour.

    Returns
    -------
    The ``sentiment`` value of the row with the highest similarity score
    (the first such row when several tie).
    """
    review_reduced = svd.transform(tfidf.transform([review_text]))

    if corpus_reduced is None:
        # Fallback: rebuild the corpus embedding from scratch.
        corpus_reduced = svd.transform(tfidf.transform(df['review']))

    similarity_scores = cosine_similarity(review_reduced, corpus_reduced).flatten()
    closest_index = similarity_scores.argmax()
    return df.iloc[closest_index]['sentiment']

# Evaluate the recommendations for one hand-written review, using the default
# number of recommendations.
sample_review = "I loved the acting and the storyline was great!"
precision, recall = evaluate_recommendations(sample_review, df, tfidf, svd, annoy_index)

# Report both scores rounded to two decimals.
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")


Precision: 1.00
Recall: 1.00
