In [126]:
import pandas as pd

In [127]:
!pip install scikit-learn



## 0. Loading dataset

In [128]:
names = ['character.metadata.tsv', 'movie.metadata.tsv', 'name.clusters.txt', 'plot_summaries.txt', 'tvtropes.clusters.txt']

path = '/content/'

## 1. About the dataset

[Dataset Link](https://www.cs.cmu.edu/~ark/personas/)

Each movie is identified by a Wikipedia movie ID and Freebase movie ID.

**Character** contains different movie characters referenced by the Freebase Character ID.

**Movie** contains information on revenue, langauges, release data, and more for movies, referenced by Wikipedia ID.

**Name Clusters** are different names of characters in movies.

**Plot Summaries** include plot summaries paired with Wikipedia IDs.

**TV Trope Clusters** contain different tropes matched with movie characters according to tv trope website.

#### Character

In [129]:
character = pd.read_csv(path + 'character.metadata.tsv', sep='\t', header=None)

character.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [130]:
character.shape

(450669, 13)

#### Movie

In [131]:
movie = pd.read_csv(path + 'movie.metadata.tsv', sep='\t', header=None)

movie.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [132]:
movie.rename(columns={
    0: 'Wikipedia_ID',
    1: 'Freebase_ID',
    2: 'Name',
    3: 'Release Date',
    4: 'Revenue',
    5:'Runtime',
    6: 'Languages',
    7: 'Countries',
    8: 'Genres'
  }, inplace=True)

movie.head()

Unnamed: 0,Wikipedia_ID,Freebase_ID,Name,Release Date,Revenue,Runtime,Languages,Countries,Genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [133]:
movie.shape

(81741, 9)

In [134]:
movie.isna().sum()

Unnamed: 0,0
Wikipedia_ID,0
Freebase_ID,0
Name,0
Release Date,6902
Revenue,73340
Runtime,20450
Languages,0
Countries,0
Genres,0


#### Name Clusters

In [135]:
name_clusters = pd.read_csv(path + 'name.clusters.txt', sep='\t', header=None)

name_clusters.head()

Unnamed: 0,0,1
0,Stuart Little,/m/0k3w9c
1,Stuart Little,/m/0k3wcx
2,Stuart Little,/m/0k3wbn
3,John Doe,/m/0jyg35
4,John Doe,/m/0k2_zn


In [136]:
name_clusters.rename(columns={
    0: 'Character',
    1: 'Freebase_Character_ID'
})

Unnamed: 0,Character,Freebase_Character_ID
0,Stuart Little,/m/0k3w9c
1,Stuart Little,/m/0k3wcx
2,Stuart Little,/m/0k3wbn
3,John Doe,/m/0jyg35
4,John Doe,/m/0k2_zn
...,...,...
2661,John Rolfe,/m/0k5_ql
2662,John Rolfe,/m/02vd6vs
2663,Elizabeth Swann,/m/0k1xvz
2664,Elizabeth Swann,/m/0k1x_d


In [137]:
name_clusters.shape

(2666, 2)

#### Plot Summaries

In [138]:
plot_summaries = pd.read_csv(path + 'plot_summaries.txt', sep='\t', header=None)

plot_summaries.head()

Unnamed: 0,0,1
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [139]:
plot_summaries.rename(columns={
    0: 'Wikipedia_ID',
    1: 'Plot'},
    inplace= True
)

In [140]:
plot_summaries.shape

(42303, 2)

In [141]:
plot_summaries.columns

Index(['Wikipedia_ID', 'Plot'], dtype='object')

In [142]:
movie.columns

Index(['Wikipedia_ID', 'Freebase_ID', 'Name', 'Release Date', 'Revenue',
       'Runtime', 'Languages', 'Countries', 'Genres'],
      dtype='object')

#### TV Trope Clusters

In [143]:
tv_trope = pd.read_csv(path + 'tvtropes.clusters.txt', sep='\t', header=None)

tv_trope.head()

Unnamed: 0,0,1
0,absent_minded_professor,"{""char"": ""Professor Philip Brainard"", ""movie"":..."
1,absent_minded_professor,"{""char"": ""Professor Keenbean"", ""movie"": ""Richi..."
2,absent_minded_professor,"{""char"": ""Dr. Reinhardt Lane"", ""movie"": ""The S..."
3,absent_minded_professor,"{""char"": ""Dr. Harold Medford"", ""movie"": ""Them!..."
4,absent_minded_professor,"{""char"": ""Daniel Jackson"", ""movie"": ""Stargate""..."


In [144]:
tv_trope.rename(columns={
    0: 'Character_Type',
    1: 'Freebase_Character_ID'
})

Unnamed: 0,Character_Type,Freebase_Character_ID
0,absent_minded_professor,"{""char"": ""Professor Philip Brainard"", ""movie"":..."
1,absent_minded_professor,"{""char"": ""Professor Keenbean"", ""movie"": ""Richi..."
2,absent_minded_professor,"{""char"": ""Dr. Reinhardt Lane"", ""movie"": ""The S..."
3,absent_minded_professor,"{""char"": ""Dr. Harold Medford"", ""movie"": ""Them!..."
4,absent_minded_professor,"{""char"": ""Daniel Jackson"", ""movie"": ""Stargate""..."
...,...,...
496,young_gun,"{""char"": ""Morgan Earp"", ""movie"": ""Tombstone"", ..."
497,young_gun,"{""char"": ""Colorado Ryan"", ""movie"": ""Rio Bravo""..."
498,young_gun,"{""char"": ""Tom Sawyer"", ""movie"": ""The League of..."
499,young_gun,"{""char"": ""William H. 'Billy the Kid' Bonney"", ..."


In [145]:
tv_trope.shape

(501, 2)

## 2. Combining datasets using Wikipedia Movie ID

Merging plot summaries and movies into working df, movies_main, because not every movie in this dataset has a plot summary.



In [146]:
# Left Join On Wikipedia ID, working df called movies main

movies_main = pd.merge(plot_summaries, movie, how='left', on='Wikipedia_ID')

In [147]:
movies_main.head()

Unnamed: 0,Wikipedia_ID,Plot,Freebase_ID,Name,Release Date,Revenue,Runtime,Languages,Countries,Genres
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",/m/076w2lb,Taxi Blues,1990-09-07,,110.0,"{""/m/06b_j"": ""Russian Language""}","{""/m/0f8l9c"": ""France"", ""/m/05vz3zq"": ""Soviet ...","{""/m/07s9rl0"": ""Drama"", ""/m/03q4nz"": ""World ci..."
1,31186339,The nation of Panem consists of a wealthy Capi...,/m/0gkz15s,The Hunger Games,2012-03-12,686533290.0,142.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"":..."
2,20663735,Poovalli Induchoodan is sentenced for six yea...,/m/051zjwb,Narasimham,2000,,175.0,"{""/m/0999q"": ""Malayalam Language""}","{""/m/03rk0"": ""India""}","{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action""..."
3,2231378,"The Lemon Drop Kid , a New York City swindler,...",/m/06xtz3,The Lemon Drop Kid,1951-03-08,2300000.0,91.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...,/m/02tqm5,A Cry in the Dark,1988-11-03,6908797.0,121.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."


In [148]:
movies_main.isna().sum()

Unnamed: 0,0
Wikipedia_ID,0
Plot,0
Freebase_ID,99
Name,99
Release Date,2717
Revenue,34716
Runtime,6723
Languages,99
Countries,99
Genres,99


## 3. Recommendation

In [149]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies_main['Plot'])

In [150]:
def recommend_movies(user_query, movie_df, tfidf_matrix, vectorizer, top_n=5):
    # Transform the user's query into the TF-IDF vector space
    query_vec = vectorizer.transform([user_query])

    # Compute cosine similarity between query and movie plots
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Get indices of top matches
    top_indices = similarity.argsort()[-top_n:][::-1]

    # Return movie names and similarity scores
    recommendations = movie_df.iloc[top_indices][['Name', 'Plot']]
    recommendations['Similarity'] = similarity[top_indices]

    return recommendations

## 4. User

In [151]:
user_input = input("Enter what you'd like to watch: ")
recommendations = recommend_movies(user_input, movies_main, tfidf_matrix, vectorizer)

print("🎬 Movie Recommendations based on your query:")
display(recommendations)

Enter what you'd like to watch: Romance
🎬 Movie Recommendations based on your query:


Unnamed: 0,Name,Plot,Similarity
21044,Lovers and Liars,Anita is an American actress who decides to v...,0.311021
20082,L'Amour,"Donna and Jane , two American hippies in Pari...",0.28329
13354,That Dangerous Age,"A lady on the Isle of Capri, neglected by a hu...",0.279967
38677,Barefooted Youth,A good-hearted young gangster falls in love wi...,0.258296
23081,Ranmuthu Duwa,The film is about the discovery of an underwat...,0.251441
