In [1]:
#importing req libraries
import pandas as pd
import numpy as np

In [2]:
movies=pd.read_csv('/content/dataset.csv')
print(movies.shape)
movies.head(10)

(10000, 9)


Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811
5,667257,Impossible Things,"Family,Drama",es,"Matilde is a woman who, after the death of her...",14.358,2021-06-17,8.6,255
6,129,Spirited Away,"Animation,Family,Fantasy",ja,"A young girl, Chihiro, becomes trapped in a st...",92.056,2001-07-20,8.5,13093
7,730154,Your Eyes Tell,"Romance,Drama",ja,"A tragic accident lead to Kaori's blindness, b...",51.345,2020-10-23,8.5,339
8,372754,Dou kyu sei – Classmates,"Romance,Animation",ja,"Rihito Sajo, an honor student with a perfect s...",14.285,2016-02-20,8.5,239
9,372058,Your Name.,"Romance,Animation,Drama",ja,High schoolers Mitsuha and Taki are complete s...,158.27,2016-08-26,8.5,8895


In [3]:
# we are trying to develop a model that will recommend based on the content
movies.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [4]:
#we need only few columns
#we are creating a new column that is tags : which has a combination of genre + overview
movies['tags']=movies['genre']+movies['overview']
movies.head(1)

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,tags
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,"Drama,CrimeFramed in the 1940s for the double ..."


In [5]:
#now we will drop the unnecessary columns
newdata=movies[['id','title','tags']]
newdata.head()

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."


In [6]:
#CountVectorizer class from scikit-learn, a Python library for machine learning.

from sklearn.feature_extraction.text import CountVectorizer

#CountVectorizer is a tool for converting a collection of text documents into a matrix of token (word) counts.
# It essentially counts the frequency of each word in the text documents.
cv=CountVectorizer(max_features=10000,stop_words='english')
cv


#Parameters:
#max_features=10000:
#Limits the vocabulary size to the 10,000 most frequent words.
#stop_words='english':
#Removes common English words (like 'the', 'is', 'and') that are not informative.

In [7]:
#declaring a new vector
vector=cv.fit_transform(newdata['tags'].values.astype('U')).toarray()
vector


#fit_transform:
#fit: Analyzes the text data in the column, builds a vocabulary of unique words (after applying stop word removal and limiting features to max_features as specified earlier).
#transform: Converts the text data into a matrix of token counts (a sparse matrix by default).



#value.as type('U):
#is used becuase: The column contains an integer (123), a string ('machine learning'), a None (missing value), and another string ('AI').
#If we attempt to use CountVectorizer directly on this column, it will throw an error because it expects all values to be strings.



array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
vector .shape

(2988, 10000)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
#It measures how "similar" two documents are based on the overlap in their word usage.
similiar=cosine_similarity(vector)

In [10]:
similiar

array([[1.        , 0.06253054, 0.05802589, ..., 0.07963978, 0.07597372,
        0.03798686],
       [0.06253054, 1.        , 0.08980265, ..., 0.        , 0.        ,
        0.        ],
       [0.05802589, 0.08980265, 1.        , ..., 0.02541643, 0.03636965,
        0.        ],
       ...,
       [0.07963978, 0.        , 0.02541643, ..., 1.        , 0.03327792,
        0.03327792],
       [0.07597372, 0.        , 0.03636965, ..., 0.03327792, 1.        ,
        0.04761905],
       [0.03798686, 0.        , 0.        , ..., 0.03327792, 0.04761905,
        1.        ]])

In [9]:
#taking an example
newdata[  newdata['title']=='Dilwale Dulhania Le Jayenge'  ]

Unnamed: 0,id,title,tags
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."


In [11]:
# a variable distance
enumerated_list = list(enumerate(similiar[1]))
#enumerated functions pairs the value in vecotr with its respective index for 2nd (ie index 1) row
distance = sorted(enumerated_list, reverse=True, key=lambda vector: vector[1])
# sorted= sorts the list, reverse=true for descding order ,sorting is to be done based on the cosine similiarity value so , vector[1]


In [16]:
distance     #so this prints how dilwale movie is close to others in the list where the fiirst column represent index of that and the 2nd one indicates its cosine similiarity value

[(1, 1.0000000000000002),
 (1155, 0.17131207221604822),
 (886, 0.16903085094570333),
 (2069, 0.15504341823651058),
 (651, 0.15000000000000002),
 (760, 0.15000000000000002),
 (1018, 0.15000000000000002),
 (467, 0.149071198499986),
 (892, 0.14659886790209342),
 (1120, 0.14509525002200233),
 (1960, 0.14509525002200233),
 (734, 0.14301938838683886),
 (2261, 0.1414213562373095),
 (1451, 0.13801311186847084),
 (2169, 0.13693063937629157),
 (2483, 0.13693063937629157),
 (671, 0.13552618543578768),
 (1659, 0.13552618543578768),
 (697, 0.1348399724926484),
 (1108, 0.1318760946791574),
 (2254, 0.1318760946791574),
 (2833, 0.1315587028960544),
 (220, 0.12909944487358058),
 (1310, 0.12909944487358058),
 (2695, 0.12824729401064427),
 (57, 0.1267731382092775),
 (887, 0.1267731382092775),
 (1858, 0.1267731382092775),
 (2948, 0.1267731382092775),
 (2901, 0.12403473458920847),
 (269, 0.1224744871391589),
 (335, 0.1224744871391589),
 (553, 0.1224744871391589),
 (876, 0.1224744871391589),
 (2049, 0.12247

In [12]:
#but if i need top 10 movies that are close to dilwale
for i in distance[0:11 ]:
  print(newdata.iloc[i[0]].title)
#i[0] : will access the row in data frame , means [1,0.8] [1]
#newdata.[1] will give us the particualr details of 1st row ie dilwae ,
#.title will give us onlly value

Dilwale Dulhania Le Jayenge
The Cheetah Girls: One World
A Passage to India
Fatal Attraction
Noelle
Just Married
The Graduate
Smart People
Sabrina
Maggie's Plan
Jenny's Wedding


In [13]:
#now we are writing a function that would recommend a movie
def recommend(movies):
  index=newdata[  newdata['title'] == movies  ].index[0]
  enumerated_list = list(enumerate(similiar[index]))
  distance = sorted(enumerated_list, reverse=True, key=lambda vector: vector[1])
  for i in distance[0:11]:
    print(newdata.iloc[i[0]].title)

In [14]:
recommend('Sabrina')

Sabrina
Kind Hearts and Coronets
Cheaper by the Dozen 2
Wizards of Waverly Place: The Movie
Cow Belles
Ugly, Dirty and Bad
The Damned
In the Name of the Land
The Wizards Return: Alex vs. Alex
Dirty Dancing
Over the Hedge


In [15]:
import pickle
pickle.dump(newdata, open('movie.pkl','wb'))
