**Import Libraries**

In [None]:
import numpy as np
import pandas as pd
import nltk

**import dataset**

In [None]:
movies = pd.read_csv('/content/movies.csv')

In [None]:
movies.head()

Unnamed: 0.1,Unnamed: 0,id,title,overview,tagline,genre_names
0,0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,Fear can hold you prisoner. Hope can set you f...,"Drama, Crime"
1,1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",An offer you can't refuse.,"Drama, Crime"
2,2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,The rise and fall of the Corleone empire.,"Drama, Crime"
3,3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Whoever saves one life, saves the world entire.","Drama, History, War"
4,4,389,12 Angry Men,The defense and the prosecution have rested an...,Life is in their hands. Death is on their minds.,Drama


In [None]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9058 entries, 0 to 9057
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   9058 non-null   int64 
 1   id           9058 non-null   int64 
 2   title        9058 non-null   object
 3   overview     9057 non-null   object
 4   tagline      7631 non-null   object
 5   genre_names  9056 non-null   object
dtypes: int64(2), object(4)
memory usage: 424.7+ KB


In [None]:
movies.isnull().sum()

Unnamed: 0        0
id                0
title             0
overview          1
tagline        1427
genre_names       2
dtype: int64

In [None]:
#movies = movies.dropna()

In [None]:
movies.shape

(9058, 6)

# **feature Selection Part**

In [None]:
movies = movies[['id','title','overview','genre_names','tagline']]

In [None]:
movies.head(2)

Unnamed: 0,id,title,overview,genre_names,tagline
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama, Crime",Fear can hold you prisoner. Hope can set you f...
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime",An offer you can't refuse.


In [None]:
movies['keys'] = movies['overview']+movies['genre_names']+movies['tagline']

In [None]:
movies.head(2)

Unnamed: 0,id,title,overview,genre_names,tagline,keys
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama, Crime",Fear can hold you prisoner. Hope can set you f...,Framed in the 1940s for the double murder of h...
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama, Crime",An offer you can't refuse.,"Spanning the years 1945 to 1955, a chronicle o..."


In [None]:
movies_data = movies.drop(columns=['overview','genre_names','tagline'])

In [None]:
movies_data.head()

Unnamed: 0,id,title,keys
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,389,12 Angry Men,The defense and the prosecution have rested an...


# **Preprocess Data**

In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

def preprocess_text(text):
    if isinstance(text, str):  # Check if 'text' is a string
        # Tokenization using WordTokenizer
        tokens = word_tokenize(text)

        # Remove special characters
        text = text.replace('[^a-zA-Z0-9]+', ' ')

        # Lemmatization and lowercase
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
        text = ' '.join(tokens)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
        text = ' '.join(tokens)

        return text
    else:
        return ''  # Return an empty string if 'text' is not a string


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
movies_data['keywords'] = movies_data['keys'].apply(preprocess_text)

In [None]:
movies_data

Unnamed: 0,id,title,keys,keywords
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"framed 1940s double murder wife lover , upstan..."
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","spanning year 1945 1955 , chronicle fictional ..."
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,"continuing saga corleone crime family , young ..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...,true story businessman oskar schindler saved t...
4,389,12 Angry Men,The defense and the prosecution have rested an...,defense prosecution rested jury filing jury ro...
...,...,...,...,...
9053,94365,The Human Centipede 3 (Final Sequence),Taking inspiration from The Human Centipede fi...,"taking inspiration human centipede film , ward..."
9054,485774,The Open House,A teenager and his mother find themselves besi...,teenager mother find besieged threatening forc...
9055,9544,FearDotCom,When four bodies are discovered among the indu...,four body discovered among industrial decay ur...
9056,10870,Lucky Luke and the Daltons,,


# **Vectorize preprocessed data**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=9058)

In [None]:
vector = cv.fit_transform(movies_data['keywords'].values.astype('U')).toarray()

In [None]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
vector.shape

(9058, 9058)

## **Finding Cosine similarity**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similar = cosine_similarity(vector)

In [None]:
similar

array([[1.        , 0.04652421, 0.04761905, ..., 0.02353104, 0.        ,
        0.04941662],
       [0.04652421, 1.        , 0.39545579, ..., 0.        , 0.        ,
        0.14484136],
       [0.04761905, 0.39545579, 1.        , ..., 0.09412416, 0.        ,
        0.09883324],
       ...,
       [0.02353104, 0.        , 0.09412416, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.04941662, 0.14484136, 0.09883324, ..., 0.        , 0.        ,
        1.        ]])

**Code to find index from title name**

In [None]:
movies_data[movies_data['title']=="Iron Man"].index[0]

877

In [None]:
similar[877]

array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
       0.10741723])

**Sort in descending order by similarity value to find most similar indexes**

In [None]:
sorted(list(enumerate(similar[877])),reverse=True,key = lambda similarity_tuple:similarity_tuple[1])

[(877, 1.0000000000000002),
 (3371, 0.2711630722733202),
 (6791, 0.23312620206007845),
 (3735, 0.23078316568852544),
 (4421, 0.21693045781865616),
 (5270, 0.21128856368212914),
 (5809, 0.2051956704170308),
 (2021, 0.1978141420187361),
 (1772, 0.19518001458970663),
 (9001, 0.19462473604038075),
 (6555, 0.18650096164806276),
 (6834, 0.18650096164806276),
 (6069, 0.18257418583505539),
 (8164, 0.18257418583505539),
 (8287, 0.18257418583505539),
 (8886, 0.18257418583505539),
 (5441, 0.17928429140015906),
 (7455, 0.17928429140015906),
 (1965, 0.17902871850985821),
 (8507, 0.17888543819998318),
 (895, 0.17541160386140586),
 (5273, 0.17541160386140586),
 (919, 0.1732050807568877),
 (3232, 0.17213259316477408),
 (3525, 0.17213259316477408),
 (7485, 0.17213259316477408),
 (1796, 0.16854996561581054),
 (5467, 0.16770509831248423),
 (6434, 0.16770509831248423),
 (7709, 0.16770509831248423),
 (409, 0.16609095970747992),
 (2377, 0.16609095970747992),
 (2466, 0.16609095970747992),
 (4732, 0.166090959

In [None]:
movies_data[movies_data['title']=="Iron Man"].index[0]

877

In [None]:
sorted_similarty = sorted(list(enumerate(similar[877])),reverse=True,key = lambda similarity_tuple:similarity_tuple[1])

In [None]:
for i in sorted_similarty[1:6]:
  print(movies_data.iloc[i[0]].title)

Iron Man 3
The New Mutants
Iron Man 2
Batman: Gotham Knight
Heavy Metal


# **Function to Recommend Movie**

In [None]:
def recommend_movie(movie):
  index = movies_data[movies_data['title']==movie].index[0]
  sorted_similarty = sorted(list(enumerate(similar[index])),reverse=True,key = lambda similarity_tuple:similarity_tuple[1])
  for i in sorted_similarty[1:6]:
    print(movies_data.iloc[i[0]].title)

In [None]:
recommend_movie('Batman: Gotham Knight')

Batman: Bad Blood
Batman: The Dark Knight Returns, Part 1
Batman: Year One
Batman: Gotham by Gaslight
Batman: Mask of the Phantasm


In [None]:
import pickle
pickle.dump(movies_data, open('movies_data.pkl', 'wb'))
pickle.dump(similar, open('similarity.pkl', 'wb'))