In [17]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('../raw_data/movie_with_summary.csv')

In [15]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    words = [w for w in words if not w in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return words

In [7]:
df['gen_summary_pre'] = df['gen_summary'].apply(preprocess)

In [8]:
df['summary_str'] = df['gen_summary_pre'].apply(lambda x: ' '.join(map(str, x)))

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [9]:
tf_idf_vectorizer_new = TfidfVectorizer(stop_words='english')
tf_idf_matrix_new = tf_idf_vectorizer_new.fit_transform(df['summary_str'])

In [10]:
def find_recommendation_short(text):
    # Vectorise user input
    user_tf_idf_vector = tf_idf_vectorizer_new.transform([text])
    # Find similarities
    cos_similarities = cosine_similarity(user_tf_idf_vector, tf_idf_matrix_new).flatten()
    similar_movies = pd.DataFrame({'title': df['title'], 'similarity': cos_similarities})
    similar_movies = similar_movies.sort_values(by='similarity', ascending=False)
    top_20_recommendations = similar_movies.head(20)[['title', 'similarity']]
    recommendations = f"Top 20 recommendations:\n{top_20_recommendations.to_string(index=False)}"
    return top_20_recommendations

In [11]:
df_test = pd.read_csv('../raw_data/testing_sample.csv')

In [12]:
def evaluate_model(df, model):
    total_points = 0
    max_points = 50 * len(df)
    for index, row in df.iterrows():
        plot_summary = row['gen_summary']
        title = row['title']
        recommendations = model(plot_summary)
        recommendations.reset_index(drop=True, inplace=True)
        if not recommendations.empty and title in recommendations['title'].values:
            position = recommendations[recommendations['title'] == title].index[0]
            points = 50 - position
            total_points += points
            print(position)
        else:
            print(f"Title '{title}' not found in recommendations.")
            total_points += 0
    eval_score = total_points / max_points
    return eval_score

In [16]:
evaluate_model(df_test, find_recommendation_short)

0
1
0
0
0
0
0
7
3
0
0
0
0
0
0
0
1
0
0
0
9
3
0
3
0
Title 'Deathsport' not found in recommendations.
0
0
Title 'No Way to Treat a Lady' not found in recommendations.
Title 'Crying Freeman' not found in recommendations.
0
5
0
0
0
0
7
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0
1
Title 'On the Waterfront' not found in recommendations.
1
Title 'Heart and Souls' not found in recommendations.
0
Title 'Beyond the Valley of the Dolls' not found in recommendations.
0
0
6
0
0
0
0
0
0
1
0
0
0
0
1
8
0
0
8
0
3
0
0
0
0
0
2
0
0
2
0
0
0
0
Title 'Barbarosa' not found in recommendations.
0
0
7
0
18
0
0
1
0
0
Title 'K-PAX' not found in recommendations.
0
0
0
0
Title 'Mr. Popper's Penguins' not found in recommendations.
2
0
0
0
0
0
0
Title 'Stroszek' not found in recommendations.
0
0
0
0
0
0
Title 'Listen to Your Heart' not found in recommendations.
0
0
0
0
6
1
0
0
0
0
0
0
0
0
0
0
1
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
3
0
Title 'Ginger Snaps' not found in recommendations.
0
0
0
0
0
15
0
Title 'Filth' not found in recommenda

0.9086