In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#For Data Viz. 
import matplotlib.pyplot as plt
import seaborn as sns 

#For Warnings
import warnings
warnings.filterwarnings('ignore')

#For Options 
pd.set_option('display.max_columns' , None)
pd.set_option('display.max_rows', None)


In [2]:
movies_df = pd.read_csv('../artifacts/cleaned_dfs/movies_cleaned_df.csv')
movies_df.drop('Unnamed: 0' , axis = 1 , inplace = True)
rating_df = pd.read_csv('../artifacts/cleaned_dfs/ratings_cleaned_df.csv')
rating_df.drop('Unnamed: 0' , axis = 1 , inplace = True)

In [3]:
movies_df.dropna(inplace = True)

In [4]:
movies_df['Genres'] = movies_df['Genres'].str.replace("'", "")
movies_df['Genres'] = movies_df['Genres'].str.replace('"', '')

In [5]:
df = movies_df[['Title' , 'Genres' , 'overview']]
df.head()

Unnamed: 0,Title,Genres,overview
0,Toy Story,"[Animation, Childrens, Comedy]","Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,"[Adventure, Childrens, Fantasy]",When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,"[Comedy, Romance]",A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"[Comedy, Drama]","Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...


In [6]:
df['tag'] = df['Genres'] + df['overview']


In [7]:
df['tag'] = df['tag'].apply(lambda x : ''.join(x))

In [8]:
df.head()
df.drop(['Genres' , 'overview'] ,axis=1 , inplace = True)

In [9]:
df.head(1)

Unnamed: 0,Title,tag
0,Toy Story,"[Animation, Childrens, Comedy]Led by Woody, An..."


In [10]:
df['tag'] = df['tag'].apply(lambda x : x.lower())

In [11]:
df['tag'][0]

"[animation, childrens, comedy]led by woody, andy's toys live happily in his room until andy's birthday brings buzz lightyear onto the scene. afraid of losing his place in andy's heart, woody plots against buzz. but when circumstances separate buzz and woody from their owner, the duo eventually learns to put aside their differences."

In [12]:
from sklearn.feature_extraction.text import CountVectorizer 

cv = CountVectorizer(max_features=3000 , stop_words='english')

In [13]:
vectors = cv.fit_transform(df['tag']).toarray()

In [14]:
vectors[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
vectors.shape

(3813, 3000)

In [16]:
list(cv.get_feature_names_out())

['000',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '1920s',
 '1930s',
 '1940s',
 '1950s',
 '1957',
 '1960s',
 '1963',
 '1970s',
 '1980s',
 '1996',
 '19th',
 '20',
 '20th',
 '21',
 '24',
 '25',
 '30',
 '40',
 '50',
 '70',
 '90s',
 'abandoned',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abuse',
 'abusive',
 'academy',
 'accepts',
 'accident',
 'accidental',
 'accidentally',
 'accompanied',
 'accountant',
 'accused',
 'achieve',
 'act',
 'acting',
 'action',
 'actions',
 'activist',
 'actor',
 'actors',
 'actress',
 'acts',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'addict',
 'addicted',
 'addiction',
 'addition',
 'admiral',
 'adolescent',
 'adopt',
 'adopted',
 'adopts',
 'adult',
 'adulthood',
 'adults',
 'advanced',
 'advantage',
 'adventure',
 'adventures',
 'advertising',
 'advice',
 'affair',
 'affairs',
 'affected',
 'affections',
 'affluent',
 'africa',
 'african',
 'aftermath',
 'age',
 'aged',
 'agency',
 'agent',
 'agents',
 'aging',
 'ago',
 'agre

In [17]:
from nltk.stem import PorterStemmer 

ps = PorterStemmer()

In [18]:

def stem(text)  : 
    y = []
    
    for i in text.split() : 
        y.append(ps.stem(i))
        
    string = " ".join(y)
    
    return string


stem("[animation, childrens, comedy]l by woody, andy' toy live happili in hi room until andy' birthday bring buzz lightyear onto the scene. afraid of lose hi place in andy' heart, woodi plot against buzz. but when circumst separ buzz and woodi from their owner, the duo eventu learn to put asid their differences.")

"[animation, childrens, comedy]l by woody, andy' toy live happili in hi room until andy' birthday bring buzz lightyear onto the scene. afraid of lose hi place in andy' heart, woodi plot against buzz. but when circumst separ buzz and woodi from their owner, the duo eventu learn to put asid their differences."

In [19]:
df['tag'] = df['tag'].apply(stem)

In [20]:
list(cv.get_feature_names_out())

['000',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '1920s',
 '1930s',
 '1940s',
 '1950s',
 '1957',
 '1960s',
 '1963',
 '1970s',
 '1980s',
 '1996',
 '19th',
 '20',
 '20th',
 '21',
 '24',
 '25',
 '30',
 '40',
 '50',
 '70',
 '90s',
 'abandoned',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abuse',
 'abusive',
 'academy',
 'accepts',
 'accident',
 'accidental',
 'accidentally',
 'accompanied',
 'accountant',
 'accused',
 'achieve',
 'act',
 'acting',
 'action',
 'actions',
 'activist',
 'actor',
 'actors',
 'actress',
 'acts',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'addict',
 'addicted',
 'addiction',
 'addition',
 'admiral',
 'adolescent',
 'adopt',
 'adopted',
 'adopts',
 'adult',
 'adulthood',
 'adults',
 'advanced',
 'advantage',
 'adventure',
 'adventures',
 'advertising',
 'advice',
 'affair',
 'affairs',
 'affected',
 'affections',
 'affluent',
 'africa',
 'african',
 'aftermath',
 'age',
 'aged',
 'agency',
 'agent',
 'agents',
 'aging',
 'ago',
 'agre

In [21]:
from sklearn.metrics.pairwise import cosine_similarity 


similarity = cosine_similarity(vectors)


In [22]:
similarity[:5]

array([[1.        , 0.04454354, 0.02777778, ..., 0.        , 0.        ,
        0.        ],
       [0.04454354, 1.        , 0.05939139, ..., 0.        , 0.        ,
        0.        ],
       [0.02777778, 0.05939139, 1.        , ..., 0.        , 0.05143445,
        0.        ],
       [0.035007  , 0.        , 0.046676  , ..., 0.07312724, 0.03241019,
        0.05564149],
       [0.02777778, 0.        , 0.07407407, ..., 0.        , 0.05143445,
        0.        ]])

In [23]:
sorted(list(enumerate(similarity[0])) , reverse = True , key = lambda x : x[1])[1:6]

[(2996, 0.46225016352102427),
 (3055, 0.2041241452319315),
 (1944, 0.19518001458970663),
 (2255, 0.17407765595569788),
 (591, 0.16169041669088866)]

In [24]:
def recommend(movie) : 
    movie_index = df[df['Title'] == movie].index[0]
    distances = similarity[movie_index]
    
    movie_list = sorted(list(enumerate(distances)) , reverse = True , key = lambda x : x[1])[1:6]
    for i in movie_list : 
        print(df.iloc[i[0]].Title)
        
recommend('Toy Story')

Toy Story 2
Man on the Moon
Condorman
Bug's Life, A
Window to Paris


In [25]:
import pickle

movies = pickle.dump(df , open('../artifacts/pickle/movies.pkl' , 'wb'))
similarity_score = pickle.dump(similarity , open('../artifacts/pickle/similarity.pkl' , 'wb'))