# Plot based Similarity for movies

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

#### Dataset used is from https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots/data

In [2]:
df = pd.read_csv('wiki_movie_plots_deduped.csv')

In [3]:
print(df.shape)

(34886, 8)


In [4]:
print(df.columns)

Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')


In [5]:
print(df.head())

   Release Year                             Title Origin/Ethnicity  \
0          1901            Kansas Saloon Smashers         American   
1          1901     Love by the Light of the Moon         American   
2          1901           The Martyred Presidents         American   
3          1901  Terrible Teddy, the Grizzly King         American   
4          1902            Jack and the Beanstalk         American   

                             Director Cast    Genre  \
0                             Unknown  NaN  unknown   
1                             Unknown  NaN  unknown   
2                             Unknown  NaN  unknown   
3                             Unknown  NaN  unknown   
4  George S. Fleming, Edwin S. Porter  NaN  unknown   

                                           Wiki Page  \
0  https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...   
1  https://en.wikipedia.org/wiki/Love_by_the_Ligh...   
2  https://en.wikipedia.org/wiki/The_Martyred_Pre...   
3  https://en.wikipedia.

## Data Cleaning and Subset Selection

In [6]:
df_cleaned = df.dropna(subset=['Title','Plot'])
df_cleaned['Plot'] = df_cleaned['Plot'].str.lower()
df_cleaned['Genre'] = df_cleaned['Genre'].str.lower()
df_cleaned.count()

Release Year        34886
Title               34886
Origin/Ethnicity    34886
Director            34886
Cast                33464
Genre               34886
Wiki Page           34886
Plot                34886
dtype: int64

In [7]:
import re

def clean_plot(plot):
   if not isinstance(plot, str):
        return ""
   plot = re.sub(r'\s+', ' ', plot) #extra spaces
   plot = re.sub(r'[^a-zA-Z0-9.,?! ]', '', plot) #special characters
   return plot

In [8]:
df_cleaned['Plot'] = df_cleaned['Plot'].apply(clean_plot)

#### We only need a subset of this data. Originally this dataset has ~34000 rows, but we will carefully slice ~500 rows from this. Additionally, we will only take 3 columns: Release Year, Title, and Plot

#### In this case we will randomly select rows based on the genre in order to get a uniformly distributed data across all genres

In [9]:
np.random.seed(42)

In [10]:
genre_proportions = df_cleaned['Genre'].value_counts(normalize=True)
genre_proportions

Genre
unknown                          0.174368
drama                            0.170957
comedy                           0.125523
horror                           0.033452
action                           0.031474
                                   ...   
cbc-tv miniseries                0.000029
bio-drama                        0.000029
national film board docudrama    0.000029
cult drama                       0.000029
horror romantic comedy           0.000029
Name: proportion, Length: 2265, dtype: float64

In [11]:
top_20_genres = df_cleaned['Genre'].value_counts().nlargest(20).index

In [12]:
top_20_genres

Index(['unknown', 'drama', 'comedy', 'horror', 'action', 'thriller', 'romance',
       'western', 'crime', 'adventure', 'musical', 'crime drama',
       'romantic comedy', 'science fiction', 'film noir', 'mystery', 'war',
       'animation', 'comedy, drama', 'sci-fi'],
      dtype='object', name='Genre')

#### We are also going to take the unknown genre because it is the highest in proportion

In [13]:
dataset_size = 1000

In [14]:
df_top_20_genres = df_cleaned[df_cleaned['Genre'].isin(top_20_genres)]
selected_genre_proportions = df_top_20_genres['Genre'].value_counts(normalize=True)
selected_genre_proportions

Genre
unknown            0.233980
drama              0.229402
comedy             0.168436
horror             0.044888
action             0.042234
thriller           0.037157
romance            0.035503
western            0.033272
crime              0.021848
adventure          0.020232
musical            0.017963
crime drama        0.017848
romantic comedy    0.017732
science fiction    0.016078
film noir          0.013270
mystery            0.011924
war                0.010501
animation          0.010155
comedy, drama      0.009078
sci-fi             0.008501
Name: proportion, dtype: float64

In [15]:
samples_per_genre = max(1, dataset_size // len(top_20_genres))

In [16]:
df_subset = (
        df_top_20_genres.groupby('Genre', group_keys=False)
        .apply(lambda x: x.sample(min(len(x), samples_per_genre), random_state=42))
    )

# shuffled
df_subset = df_subset.sample(frac=1, random_state=42)

  .apply(lambda x: x.sample(min(len(x), samples_per_genre), random_state=42))


In [17]:
after_selection_proportions = df_subset['Genre'].value_counts(normalize=True)
after_selection_proportions

Genre
musical            0.05
sci-fi             0.05
mystery            0.05
horror             0.05
comedy, drama      0.05
drama              0.05
comedy             0.05
crime drama        0.05
science fiction    0.05
crime              0.05
war                0.05
western            0.05
adventure          0.05
thriller           0.05
animation          0.05
unknown            0.05
romance            0.05
film noir          0.05
romantic comedy    0.05
action             0.05
Name: proportion, dtype: float64

#### Now that every genre has uniform number of movies, we will move towards the analysis

## Preprocessing

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_text(df, column):
   max_features = 5000
   text_list = df[column].tolist()
   vectorizer = TfidfVectorizer(stop_words="english", max_features=max_features, max_df=0.85)
   tf_matrix = vectorizer.fit_transform(text_list)
   return tf_matrix, vectorizer

In [19]:
tf_matrix, vectorizer = vectorize_text(df_subset, 'Plot')
tf_matrix.shape

(1000, 5000)

## Similar Movies using Cosine Similarity

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

def similar_movies(input_descr, n,tf_matrix, vectorizer, df_subset):
   input_descr_vector = vectorizer.transform([input_descr])
   similarity_scores = cosine_similarity(input_descr_vector, tf_matrix).flatten()

   most_similar_indices = similarity_scores.argsort()[::-1][:n]
   top_n = [(similarity_scores[i], df_subset.iloc[i]) for i in most_similar_indices]

   return top_n

In [21]:
input_descr = "A thrilling story about people traveling across the space to save humanity from destruction"
top_movies = similar_movies(input_descr, 3, tf_matrix, vectorizer, df_subset)
print("Top Similar Movies:")
for score, movie in top_movies:
    print(f"Similarity: {score:.4f}")
    print(f"Title: {movie['Title']}")
    print(f"Release Year: {movie['Release Year']}")
    print(f"Genre: {movie['Genre']}")
    print(f"Plot: {movie['Plot']}")
    print("-" * 50)

Top Similar Movies:
Similarity: 0.2854
Title: Bodacious Space Pirates: Abyss of Hyperspace
Release Year: 2014
Genre: adventure
Plot: in the far future where space travel and colonialization have become the norm, humanity has expanded its living space to the far reaches of the known galaxy. one hundred years before the beginning of the series, several colonies, eager to gain their independence, rebelled against their masters in the stellar alliance colony federation. the government of one of the newly colonized planets, sea of the morningstar  in the tau ceti system, recruited space pirates to bolster its fighting forces, legalizing their actions by issuing them letters of marque. in the midst of this conflict, a galactic empire arose and absorbed both the stellar alliance and the border worlds, but allowed the colonies to run on an independent government. despite peace having been achieved, the space pirates remain respected figures in imperial society, even though their activities are