In [34]:
import pandas as pd
import numpy as np

In [35]:
df = pd.read_csv('../raw_data/movie_with_summary.csv')

In [36]:
df

Unnamed: 0.1,Unnamed: 0,title,plot_synopsis,gen_summary
0,0,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","""Mr. Holland's Opus"" follows the life of a ded..."
1,1,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","""Scarface"" follows Cuban immigrant Tony Montan..."
2,5,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"After her husband's sudden death, a grieving w..."
3,6,Little Caesar,Small-time Italian-American criminals Caesar E...,"""Little Caesar"" follows the rise and fall of a..."
4,7,Savages,The movie begins with a video being shot of me...,Two marijuana entrepreneurs in California are ...
...,...,...,...,...
6199,7969,Thunderheart,"During the early 1970s, FBI agent Ray Levoi is...",A young FBI agent is sent to investigate a mur...
6200,7970,One Night of Love,Opera singer Mary Barrett (Grace Moore) leaves...,"A young opera singer, Mary, falls in love with..."
6201,7971,One False Move,"Three criminals, Ray, Pluto and Fantasia (Ray'...","In the film ""One False Move,"" a trio of crimin..."
6202,7972,Lucky Numbers,"In 1988 Russ Richards (John Travolta), the wea...","""Lucky Numbers"" follows the story of a TV weat..."


In [37]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    words = [w for w in words if not w in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return words

In [38]:
df['gen_summary_pre'] = df['gen_summary'].apply(preprocess)

In [39]:
df['summary_str'] = df['gen_summary_pre'].apply(lambda x: ' '.join(map(str, x)))

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [41]:
tf_idf_vectorizer = TfidfVectorizer(stop_words='english')

In [42]:
tf_idf_vectorizer.fit(df['summary_str'])

In [43]:
tf_idf_matrix = tf_idf_vectorizer.transform(df['summary_str'])

In [25]:
tf_idf_matrix_new

<6204x13419 sparse matrix of type '<class 'numpy.float64'>'
	with 158159 stored elements in Compressed Sparse Row format>

In [15]:
def find_recommendation_short(text):
    # Vectorise user input
    user_tf_idf_vector = tf_idf_vectorizer.transform([text])
    # Find similarities
    cos_similarities = cosine_similarity(user_tf_idf_vector, tf_idf_matrix_new).flatten()
    similar_movies = pd.DataFrame({'title': df['title'], 'similarity': cos_similarities})
    similar_movies = similar_movies.sort_values(by='similarity', ascending=False)
    top_40_recommendations = similar_movies.head(40)[['title', 'similarity']]
    #recommendations = f"Top 20 recommendations:\n{top_20_recommendations.to_string(index=False)}"
    return top_40_recommendations

In [50]:
df_test

NameError: name 'df_test' is not defined

In [17]:
find_recommendation_short('action film world war france')

Unnamed: 0,title,similarity
3532,Saints and Soldiers: Airborne Creed,0.275824
3946,Under the Tuscan Sun,0.273319
3481,13 Rue Madeleine,0.272737
3380,Days of Glory,0.267372
5374,Passchendaele,0.238147
871,Timeline,0.235314
1887,Last Action Hero,0.228093
1741,Waterloo Bridge,0.226905
1701,How I Won the War,0.22671
3247,Beach Red,0.224763


In [51]:
df_test = pd.read_csv('../raw_data/testing_sample.csv')

In [52]:
df_test['gen_summary']

Unnamed: 0.1,Unnamed: 0,title,plot_synopsis,gen_summary
0,7101,Only God Forgives,Julian is an American expatriate who runs a Mu...,A drug smuggler seeks revenge for his brother'...
1,3479,Frailty,(some of the following content first appeared ...,A man believes he is on a mission from God to ...
2,3533,Willard,Willard Stiles is a meek social misfit who lat...,A lonely young man discovers he can control ra...
3,7221,Tracers,Cam (Taylor Lautner) is a bike messenger in Ne...,A bike messenger in debt to a crime syndicate ...
4,1935,Alexander,The film is based on the life of Alexander the...,"""Alexander"" is an epic biographical film that ..."
...,...,...,...,...
195,3464,Next,Cris Johnson (Nicolas Cage) can see into his f...,A man with the ability to see into the future ...
196,2438,Tightrope,"The film opens with a young woman, Melanie Sil...",Detective Wes Block investigates a series of m...
197,5757,Serious Moonlight,"When Louise, a high-powered attorney (Meg Ryan...",A woman takes her husband hostage in an attemp...
198,6481,Men of War,"Nick Gunar, an ex-soldier, is down and out in ...",A group of mercenaries led by Nick Gunar is hi...


In [12]:
def evaluate_model(df, model):
    total_points = 0
    max_points = 50 * len(df)
    for index, row in df.iterrows():
        plot_summary = row['gen_summary']
        title = row['title']
        recommendations = model(plot_summary)
        recommendations.reset_index(drop=True, inplace=True)
        if not recommendations.empty and title in recommendations['title'].values:
            position = recommendations[recommendations['title'] == title].index[0]
            points = 50 - position
            total_points += points
            print(position)
        else:
            print(f"Title '{title}' not found in recommendations.")
            total_points += 0
    eval_score = total_points / max_points
    return eval_score

In [16]:
evaluate_model(df_test, find_recommendation_short)

0
1
0
0
0
0
0
7
3
0
0
0
0
0
0
0
1
0
0
0
9
3
0
3
0
Title 'Deathsport' not found in recommendations.
0
0
Title 'No Way to Treat a Lady' not found in recommendations.
Title 'Crying Freeman' not found in recommendations.
0
5
0
0
0
0
7
0
2
0
0
0
0
0
0
0
0
0
0
0
0
0
1
Title 'On the Waterfront' not found in recommendations.
1
Title 'Heart and Souls' not found in recommendations.
0
Title 'Beyond the Valley of the Dolls' not found in recommendations.
0
0
6
0
0
0
0
0
0
1
0
0
0
0
1
8
0
0
8
0
3
0
0
0
0
0
2
0
0
2
0
0
0
0
Title 'Barbarosa' not found in recommendations.
0
0
7
0
18
0
0
1
0
0
Title 'K-PAX' not found in recommendations.
0
0
0
0
Title 'Mr. Popper's Penguins' not found in recommendations.
2
0
0
0
0
0
0
Title 'Stroszek' not found in recommendations.
0
0
0
0
0
0
Title 'Listen to Your Heart' not found in recommendations.
0
0
0
0
6
1
0
0
0
0
0
0
0
0
0
0
1
2
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
3
0
Title 'Ginger Snaps' not found in recommendations.
0
0
0
0
0
15
0
Title 'Filth' not found in recommenda

0.9086

In [32]:
import pickle
with open('name of file.pkl', 'wb') as file:
    pickle.dump(name of variable, file)

In [44]:
with open('vectorized_summaries.pkl', 'wb') as file:
    pickle.dump(tf_idf_matrix, file)

In [46]:
with open('vectorized_summaries.pkl', 'rb') as file:
    saved_matrix = pickle.load(file)

In [47]:
with open('tf_idf_vectorizer.pkl', 'rb') as file:
    saved_vectorizer = pickle.load(file)

In [48]:
def find_recommendation_test(text):
    # Vectorise user input
    user_tf_idf_vector = saved_vectorizer.transform([text])
    # Find similarities
    cos_similarities = cosine_similarity(user_tf_idf_vector, saved_matrix).flatten()
    similar_movies = pd.DataFrame({'title': df['title'], 'similarity': cos_similarities})
    similar_movies = similar_movies.sort_values(by='similarity', ascending=False)
    top_40_recommendations = similar_movies.head(40)[['title', 'similarity']]
    #recommendations = f"Top 20 recommendations:\n{top_20_recommendations.to_string(index=False)}"
    return top_40_recommendations

In [49]:
find_recommendation_short('action film world war france')

Unnamed: 0,title,similarity
3532,Saints and Soldiers: Airborne Creed,0.275824
3946,Under the Tuscan Sun,0.273319
3481,13 Rue Madeleine,0.272737
3380,Days of Glory,0.267372
5374,Passchendaele,0.238147
871,Timeline,0.235314
1887,Last Action Hero,0.228093
1741,Waterloo Bridge,0.226905
1701,How I Won the War,0.22671
3247,Beach Red,0.224763
