In [28]:
!pip install pandas nltk scikit-learn

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------ --------------------------------

**Import csv file using pandas**


In [21]:
import pandas as pd

# location of the raw dataset, relative to the notebook
CSV_PATH = "imdb.csv"
# read the whole CSV into a DataFrame
df = pd.read_csv(CSV_PATH)

Extract columns - Series_Title, Overview, Genre

In [None]:
# keep only the three text columns used below: title (Series_Title),
# description (Overview) and Genre -- the original column names are kept
selected_columns = ['Series_Title', 'Overview', 'Genre']
df = df[selected_columns]
df

Unnamed: 0,Series_Title,Overview,Genre
0,The Shawshank Redemption,Two imprisoned men bond over a number of years...,Drama
1,The Godfather,An organized crime dynasty's aging patriarch t...,"Crime, Drama"
2,The Dark Knight,When the menace known as the Joker wreaks havo...,"Action, Crime, Drama"
3,The Godfather: Part II,The early life and career of Vito Corleone in ...,"Crime, Drama"
4,12 Angry Men,A jury holdout attempts to prevent a miscarria...,"Crime, Drama"
...,...,...,...
995,Breakfast at Tiffany's,A young New York socialite becomes interested ...,"Comedy, Drama, Romance"
996,Giant,Sprawling epic covering the life of a Texas ca...,"Drama, Western"
997,From Here to Eternity,"In Hawaii in 1941, a private is cruelly punish...","Drama, Romance, War"
998,Lifeboat,Several survivors of a torpedoed merchant ship...,"Drama, War"


**Data cleaning - Remove null value rows**

In [23]:
# rows that contain at least one missing value (kept aside for inspection)
null_rows = df[df.isna().any(axis=1)]
# drop those rows and renumber the remaining ones from 0
df = df.dropna().reset_index(drop=True)
print(df.shape)

(1000, 3)


In [25]:
# show the column labels currently present in df
df.columns

Index(['Series_Title', 'Overview', 'Genre'], dtype='object')

In [26]:
# build one text field per movie: title, overview and genre joined by single spaces
df['Combined'] = df['Series_Title'].str.cat([df['Overview'], df['Genre']], sep=' ')
# preview the first 5 combined strings
df['Combined'].head()

0    The Shawshank Redemption Two imprisoned men bo...
1    The Godfather An organized crime dynasty's agi...
2    The Dark Knight When the menace known as the J...
3    The Godfather: Part II The early life and care...
4    12 Angry Men A jury holdout attempts to preven...
Name: Combined, dtype: object

In [27]:
# keep just two columns - Series_Title and Combined.
# .copy() makes the subset an independent frame, so the later assignment to
# df['Combined'] (stop-word removal) cannot trigger pandas' SettingWithCopyWarning
df = df[['Series_Title', 'Combined']].copy()
df.head()


Unnamed: 0,Series_Title,Combined
0,The Shawshank Redemption,The Shawshank Redemption Two imprisoned men bo...
1,The Godfather,The Godfather An organized crime dynasty's agi...
2,The Dark Knight,The Dark Knight When the menace known as the J...
3,The Godfather: Part II,The Godfather: Part II The early life and care...
4,12 Angry Men,12 Angry Men A jury holdout attempts to preven...


In [30]:
# remove stop words from the Combined column
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# fetch the tokenizer models and the stop-word corpus (no-op when already up to date)
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
# a set gives constant-time membership checks inside remove_stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` with every stop word removed.

    The text is split with ``word_tokenize``; a token is dropped when its
    lower-cased form is in ``stop_words``. The surviving tokens keep their
    original casing and order and are joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
# strip stop words from every combined string, then preview the result
df['Combined'] = df['Combined'].map(remove_stopwords)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\YY423RK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\YY423RK\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\YY423RK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Series_Title,Combined
0,The Shawshank Redemption,Shawshank Redemption Two imprisoned men bond n...
1,The Godfather,Godfather organized crime dynasty 's aging pat...
2,The Dark Knight,Dark Knight menace known Joker wreaks havoc ch...
3,The Godfather: Part II,Godfather : Part II early life career Vito Cor...
4,12 Angry Men,12 Angry Men jury holdout attempts prevent mis...


In [36]:
# imports used by this cell and by get_recommendations
from difflib import get_close_matches

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# turn each movie's combined text into a TF-IDF vector (one matrix row per movie)
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Combined'])
print(tfidf_matrix.shape)
# pairwise cosine similarity of every movie with every other movie;
# with a single argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)
# create a function to get the top 5 similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    ``title`` is looked up in ``df['Series_Title']``. When there is no exact
    match, the closest title according to ``difflib.get_close_matches``
    (cutoff 0.6) is used instead and a message names the substitute.

    Parameters
    ----------
    title : str
        Movie title to look up.
    cosine_sim : array-like
        Pairwise similarity matrix; row/column ``i`` must correspond to the
        ``i``-th row of ``df``. Defaults to the matrix that existed when this
        cell was run.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series or list
        ``Series_Title`` values of the recommended movies, most similar
        first. An empty list is returned when the title cannot be resolved
        or when there is nothing to recommend.
    """
    titles = df['Series_Title'].tolist()
    if title not in titles:
        # no exact match: fall back to the most similar title by string similarity
        close_matches = get_close_matches(title, titles, n=1, cutoff=0.6)
        if not close_matches:
            print(f"Title '{title}' not found and no similar titles found.")
            return []
        print(f"Title '{title}' not found. Using closest match: '{close_matches[0]}'")
        title = close_matches[0]
    # Row position (not index label) of the first movie with this title, so the
    # lookup into cosine_sim stays correct even if df has a non-default index.
    idx = titles.index(title)
    # Score every other movie. The queried movie is excluded explicitly rather
    # than by skipping the first sorted entry, which would drop the wrong row
    # whenever another movie ties with it at the top.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    # most similar first; the sort is stable, so ties keep their dataset order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [int(i) for i, _ in sim_scores[:max(top_n, 0)]]
    return df.iloc[movie_indices]['Series_Title'] if movie_indices else []

(1000, 6456)


In [None]:
# example lookup; the title matches the one shown in this cell's recorded output
get_recommendations('Love Actually')

Title 'Love Actually' not found and no similar titles found.


[]

In [None]:
# if the series title is not found, then use the most similar title
# (this fallback is implemented inside get_recommendations)
