In [19]:
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the movie table from CSV into a DataFrame.
# NOTE(review): "processd" in the path looks like a typo for "processed" —
# confirm against the real directory name on disk before changing it.
movies = pd.read_csv("../../data/processd/final_data.csv")

In [3]:
# Check the table dimensions as (rows, columns).
movies.shape

(4806, 3)

In [4]:
# Single shared Porter stemmer instance; stemmer() reads it as a module-level name.
ps = PorterStemmer()

In [5]:
def stemmer(text):
    """Return `text` with every whitespace-separated token Porter-stemmed.

    The input is tokenised with `str.split()` (any run of whitespace), each
    token is passed through the module-level `ps` stemmer, and the stems are
    re-joined with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [6]:
# Replace each row's tags with their stemmed form.
# This overwrites the column in place, so re-running the cell would stem the
# already-stemmed text; reload `movies` before running it again.
movies['tags'] = movies['tags'].apply(stemmer)

In [7]:
# Spot-check the stemmed tags of the first row.
movies.iloc[0]['tags']

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav stephenlang michellerodriguez giovanniribisi joeldavidmoor cchpounder wesstudi lazalonso dileeprao mattgerald seananthonymoran jasonwhyt scottlawr kellykilgour jamespatrickpitt seanpatrickmurphi peterdillon kevindorman kelsonhenderson davidvanhorn jacobtomuri michaelblain-rozgay joncurri lukehawk woodyschultz petermensah soniaye jahnelcurfman ilramchoi kylawarren lisaroumain debrawilson chrismala taylorkibbi jodielandau julielamm cullenb.madden josephbradymadden frankietorr austinwilson sarawilson tamicawashington-mil lucybri nathanmeist gerryblair matthewchamberlain paulyat wraywil

In [8]:
# Bag-of-words vectorizer: vocabulary capped at the 5000 most frequent terms,
# with scikit-learn's built-in English stop-word list removed.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [9]:
# Learn the vocabulary from the stemmed tags and build the term-count matrix,
# then convert the sparse result into a dense NumPy array.
vector = cv.fit_transform(movies['tags']).toarray() # type: ignore
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4806, 5000))

In [10]:
# One row per movie, one column per vocabulary term.
vector.shape

(4806, 5000)

In [11]:
# Pairwise cosine similarity between all rows of `vector`;
# entry [i, j] compares row i with row j.
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.06741999, 0.07644708, ..., 0.04264014, 0.        ,
        0.        ],
       [0.06741999, 1.        , 0.07559289, ..., 0.02108185, 0.        ,
        0.02357023],
       [0.07644708, 0.07559289, 1.        , ..., 0.02390457, 0.        ,
        0.        ],
       ...,
       [0.04264014, 0.02108185, 0.02390457, ..., 1.        , 0.04264014,
        0.04472136],
       [0.        , 0.        , 0.        , ..., 0.04264014, 1.        ,
        0.09534626],
       [0.        , 0.02357023, 0.        , ..., 0.04472136, 0.09534626,
        1.        ]], shape=(4806, 4806))

In [12]:
# Sanity check: the matrix should be square, with one row and column per movie.
similarity.shape

(4806, 4806)

In [14]:
# Index label of the first row whose title is exactly 'Spider-Man'.
movies[movies['title'] == 'Spider-Man'].index[0]

np.int64(159)

In [15]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to `movie`.

    Reads the module-level `movies` DataFrame and `similarity` matrix.

    Args:
        movie: Exact title to look up in ``movies['title']``. If several rows
            share the title, the first one is used.
        top_n: How many recommendations to print (default 5).

    Raises:
        IndexError: If no row has the given title.
    """
    # Work with the row *position* rather than the index label: `similarity`
    # is indexed positionally, so a label is only correct when the DataFrame
    # happens to carry a default 0..n-1 index.
    positions = (movies['title'] == movie).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    movie_pos = int(positions[0])

    distances = similarity[movie_pos]

    # Drop the queried movie explicitly instead of discarding whatever sorts
    # first: if another row ties for the top score (or the movie's own row is
    # all-zero), "skip the first result" would remove a genuine match and
    # could recommend the movie to itself.
    candidates = [
        (pos, score) for pos, score in enumerate(distances) if pos != movie_pos
    ]
    # sorted() is stable, so equal scores keep their original row order.
    movies_list = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    for pos, _score in movies_list:
        print(movies['title'].iloc[pos])

In [17]:
# Smoke-test the recommender on a title that exists in the table.
recommend('Spider-Man')

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
Arachnophobia
The Amazing Spider-Man


In [20]:
# Persist the stemmed movie table and the similarity matrix to disk.
# The `with` blocks close each file as soon as its dump finishes, so the data
# is flushed and no file handle is left open (a bare open(...) passed straight
# to pickle.dump is never closed explicitly).
with open('../../models/movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
with open('../../models/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)