In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the movie table (columns: movie_id, title, tags) from a path relative
# to the notebook's working directory.
movies = pd.read_csv('Data/Final_df.csv')

In [3]:
movies

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."
...,...,...,...
4801,9367,El Mariachi,el mariachi just wants to play his guitar and ...
4802,72766,Newlyweds,a newlywed couple's honeymoon is upended by th...
4803,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduces a dedic..."
4804,126186,Shanghai Calling,when ambitious new york attorney sam is sent t...


In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4806 entries, 0 to 4805
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4806 non-null   int64 
 1   title     4806 non-null   object
 2   tags      4806 non-null   object
dtypes: int64(1), object(2)
memory usage: 112.8+ KB


### Stemming 

In [5]:
# Sneak peek
movies['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez jamescameron'

In [9]:
# %pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import nltk
from nltk.stem.porter import PorterStemmer
# One shared Porter stemmer instance; stem() reads it as a module-level name.
ps = PorterStemmer()

In [11]:
# Func for stemming
def stem(text):
    """Return `text` with every whitespace-separated token replaced by its stem.

    The string is split on whitespace only, so punctuation stays attached to
    its word and is passed to the stemmer as part of the token. The stemmed
    tokens are re-joined with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [12]:
stem('in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez jamescameron')

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav stephenlang michellerodriguez jamescameron'

The stemming function works as expected: each word is reduced to its stem (e.g. `dispatched` → `dispatch`).

In [13]:
# Replace the tags column with its stemmed version.
# NOTE: this overwrites movies['tags'] in place, so re-running this cell
# applies stem() to already-stemmed text; reload `movies` before re-running.
movies['tags'] = movies['tags'].apply(stem)

# Vectorization

In [16]:
# %pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ----- ---------------------------------- 1.6/11.0 MB 11.9 MB/s eta 0:00:01
   --------------- ------------------------ 4.2/11.0 MB 11.4 MB/s eta 0:00:01
   ----------------------- ---------------- 6.6/11.0 MB 11.5 MB/s eta 0:00:01
   -------------------------------- ------- 8.9/11.0 MB 11.5 MB/s eta 0:00:01
   ---------------------------------------- 11.0/11.0 MB 11.4 MB/s eta 0:00:00
Using cached scipy-1.14.1-cp312-cp312-win_amd64.whl (44.5 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing colle


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
# NOTE(review): the tags were stemmed before this point, so stop words whose
# stem differs from the listed form may not be filtered out; confirm this is acceptable.
cv = CountVectorizer(max_features=5000, stop_words='english')
# max_features -> the vocabulary is capped at the 5000 most frequent terms across the corpus

In [18]:
# Learn the vocabulary from the tags and build the document-term count matrix
# (one row per movie, one column per vocabulary term), converted to a dense array.
# NOTE(review): cosine_similarity is documented to accept sparse input as well,
# so the dense copy made by .toarray() may be avoidable; confirm before changing.
vectors = cv.fit_transform(movies['tags']).toarray()

In [19]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [20]:
vectors.shape

(4806, 5000)

# Vector Cosine Similarity

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Pairwise cosine similarity between all movie vectors:
# similarity[i][j] is the score between the movies at row positions i and j.
similarity = cosine_similarity(vectors)

In [23]:
# Inspect the already-computed matrix instead of recomputing every pairwise
# similarity just to read its shape.
similarity.shape  # square NumPy array of pairwise cosine similarities (not distances)

(4806, 4806)

In [24]:
similarity[1]

array([0.08111071, 1.        , 0.05802589, ..., 0.023338  , 0.        ,
       0.02541643])

# Recommendation function

In [25]:
# fetching movie index
movies[movies['title'] == 'The Lego Movie'].index[0]

np.int64(744)

In [26]:
movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [27]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies['title']``. If several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``movies`` has this title.
    """
    matches = movies[movies['title'] == movie].index
    if len(matches) == 0:
        # Same exception type the bare ``.index[0]`` lookup would raise,
        # but with a message that names the missing title.
        raise IndexError(f"movie title not found: {movie!r}")

    # The index label doubles as the row position in `similarity`, which
    # assumes `movies` still carries its default 0..n-1 index.
    index = matches[0]

    ranked = sorted(enumerate(similarity[index]), key=lambda pair: pair[1], reverse=True)

    printed = 0
    for position, _score in ranked:
        if printed >= top_n:
            break
        if position == index:
            # Skip the queried movie explicitly instead of assuming it sorts
            # first: ties (e.g. duplicate tag text) or an all-zero vector can
            # put a different movie in the first slot.
            continue
        print(movies.iloc[position].title)
        printed += 1

In [28]:
recommend('Pirates of the Caribbean: At World\'s End')

Pirates of the Caribbean: Dead Man's Chest
Pirates of the Caribbean: The Curse of the Black Pearl
Pirates of the Caribbean: On Stranger Tides
20,000 Leagues Under the Sea
Life of Pi


In [29]:
# import pickle

In [30]:
# pickle.dump(movies,open('movie_list.pkl','wb'))
# pickle.dump(similarity,open('similarity.pkl','wb'))

In [32]:
# Write the kernel environment's installed package versions to requirements.txt.
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
