In [24]:
import pandas as pd


# Load the movie dataset from a CSV in the working directory. The preview
# below shows the columns title, plot_synopsis and gen_summary, plus two
# 'Unnamed: 0*' columns.
# NOTE(review): the 'Unnamed' columns look like row indexes written out by
# earlier to_csv calls — confirm before relying on them.
df = pd.read_csv('movie_with_summary.csv')


# Preview the first five rows as a sanity check on the load
df.head()

Unnamed: 0.1,Unnamed: 0,title,plot_synopsis,gen_summary
0,0,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","""Mr. Holland's Opus"" follows the life of a ded..."
1,1,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","""Scarface"" follows Cuban immigrant Tony Montan..."
2,5,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"After her husband's sudden death, a grieving w..."
3,6,Little Caesar,Small-time Italian-American criminals Caesar E...,"""Little Caesar"" follows the rise and fall of a..."
4,7,Savages,The movie begins with a video being shot of me...,Two marijuana entrepreneurs in California are ...


In [17]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

# Shared NLTK helpers used by preprocess; built once here rather than on
# every call.
# NOTE(review): no nltk.download(...) call is visible in this notebook, so the
# required NLTK data is presumably already installed — confirm on a fresh
# environment.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the stop-word membership test in preprocess is a hash
# lookup instead of a list scan.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise raw text into a list of lemmatised, stop-word-free tokens.

    Steps: lowercase the text, delete every character that is not an ASCII
    letter or whitespace, tokenise with ``word_tokenize``, drop tokens found
    in the module-level ``stop_words`` set, and lemmatise what is left with
    the module-level ``lemmatizer``.

    Args:
        text: The string to clean. ``None`` and float ``NaN`` (the value
            pandas uses for an empty CSV cell) are treated as empty text.

    Returns:
        list: The cleaned tokens in their original order. Empty when
        ``text`` is missing or contains no usable words.
    """
    # Missing values would otherwise crash on ``.lower()``. NaN is the only
    # float that is not equal to itself, hence the ``text != text`` check.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = text.lower()
    # Unwanted characters are deleted rather than replaced by a space, so
    # "don't" becomes "dont" and "anti-war" becomes "antiwar".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    return [lemmatizer.lemmatize(w) for w in words if w not in stop_words]

In [25]:
# Apply preprocess to the generated summaries, turning each one into a list of
# tokens. Rows that already hold a list (i.e. this cell has been run before)
# are left untouched, so re-running the cell no longer fails by feeding token
# lists back into preprocess.
df['gen_summary'] = df['gen_summary'].apply(
    lambda summary: summary if isinstance(summary, list) else preprocess(summary)
)

In [26]:
# Collapse each row's token list back into a single space-separated string
df['summary_str'] = df['gen_summary'].apply(
    lambda tokens: ' '.join(str(token) for token in tokens)
)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Fit a TF-IDF model on the cleaned summary strings. Both the fitted
# vectorizer and the resulting document-term matrix are reused by
# find_recommendation to score user queries against every movie.
# stop_words='english' asks the vectorizer to drop English stop words as well;
# this comes on top of the NLTK stop words preprocess has already removed.
tf_idf_vectorizer = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tf_idf_vectorizer.fit_transform(df['summary_str'])

In [35]:
def find_recommendation(text, top_n=10):
    """Return the movies whose generated summary best matches a text query.

    The query is first passed through ``preprocess`` — the same cleaning that
    was applied to the summaries before the vectorizer was fitted — and is
    then projected into the fitted TF-IDF space and scored against every row
    of ``tf_idf_matrix``.

    Args:
        text: Free-text description of the kind of movie wanted.
        top_n: How many movies to return. Defaults to 10.

    Returns:
        str: A header line ``"Top <top_n> recommendations:"`` followed by a
        plain-text table of ``title`` and ``similarity``, best match first.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be a positive integer, got {top_n!r}")

    # The corpus vocabulary consists of lemmatised tokens, so the query must
    # be cleaned the same way; otherwise an inflected query word such as
    # "wars" would never match the corpus term "war".
    query = ' '.join(preprocess(text))
    user_tf_idf_vector = tf_idf_vectorizer.transform([query])

    # linear_kernel is a plain dot product; it equals cosine similarity as
    # long as the vectorizer keeps its default L2 row normalisation.
    cos_similarities = linear_kernel(user_tf_idf_vector, tf_idf_matrix).flatten()

    similar_movies = pd.DataFrame({'title': df['title'], 'similarity': cos_similarities})
    top_recommendations = (
        similar_movies
        .sort_values(by='similarity', ascending=False)
        .head(top_n)
    )
    return f"Top {top_n} recommendations:\n{top_recommendations.to_string(index=False)}"

In [38]:
# Smoke-test the recommender with a sample free-text query; print() is used
# because find_recommendation returns a pre-formatted multi-line string.
print(find_recommendation('movie about war in france, sad'))

Top 10 recommendations:
                              title  similarity
               Under the Tuscan Sun    0.272934
                   To Hell and Back    0.266080
                           Movie 43    0.253945
Saints and Soldiers: Airborne Creed    0.243280
                     Hamburger Hill    0.243260
                           Timeline    0.234983
                   13 Rue Madeleine    0.228554
                      Days of Glory    0.224059
                  Casualties of War    0.210394
                  How I Won the War    0.199116
