In [2]:
import pandas as pd

In [3]:
data = pd.read_csv("C:/Users/mirya/OneDrive/Desktop/recommendation_system/TeluguMovies_dataset.csv")
data.head(5)

Unnamed: 0.1,Unnamed: 0,Movie,Year,Certificate,Genre,Overview,Runtime,Rating,No.of.Ratings
0,0,Bahubali: The Beginning,2015.0,UA,"Action, Drama","In ancient India, an adventurous and darin...",159,8.1,99114
1,1,Baahubali 2: The Conclusion,2017.0,UA,"Action, Drama","When Shiva, the son of Bahubali, learns ab...",167,8.2,71458
2,2,1 - Nenokkadine,2014.0,UA,"Action, Thriller",A rock star must overcome his psychologica...,170,8.1,42372
3,3,Dhoom:3,2013.0,UA,"Action, Thriller","When Sahir, a circus entertainer trained i...",172,5.4,42112
4,4,Ra.One,2011.0,U,"Action, Adventure, Sci-Fi",When the titular antagonist of an action g...,156,4.6,37211


In [4]:
data['Overview'].head()

0        In ancient India, an adventurous and darin...
1        When Shiva, the son of Bahubali, learns ab...
2        A rock star must overcome his psychologica...
3        When Sahir, a circus entertainer trained i...
4        When the titular antagonist of an action g...
Name: Overview, dtype: object

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words = 'english')
data['Overview'] = data['Overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(data['Overview'])
tfidf_matrix.shape

(1400, 5206)

In [8]:
tfidf_matrix

<1400x5206 sparse matrix of type '<class 'numpy.float64'>'
	with 16368 stored elements in Compressed Sparse Row format>

In [9]:
list(tfidf_matrix)

[<1x5206 sparse matrix of type '<class 'numpy.float64'>'
 	with 11 stored elements in Compressed Sparse Row format>,
 <1x5206 sparse matrix of type '<class 'numpy.float64'>'
 	with 15 stored elements in Compressed Sparse Row format>,
 <1x5206 sparse matrix of type '<class 'numpy.float64'>'
 	with 9 stored elements in Compressed Sparse Row format>,
 <1x5206 sparse matrix of type '<class 'numpy.float64'>'
 	with 19 stored elements in Compressed Sparse Row format>,
 <1x5206 sparse matrix of type '<class 'numpy.float64'>'
 	with 12 stored elements in Compressed Sparse Row format>,
 <1x5206 sparse matrix of type '<class 'numpy.float64'>'
 	with 16 stored elements in Compressed Sparse Row format>,
 <1x5206 sparse matrix of type '<class 'numpy.float64'>'
 	with 7 stored elements in Compressed Sparse Row format>,
 <1x5206 sparse matrix of type '<class 'numpy.float64'>'
 	with 15 stored elements in Compressed Sparse Row format>,
 <1x5206 sparse matrix of type '<class 'numpy.float64'>'
 	with 14

In [17]:
#Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[50:100]

array(['34', '365', '40', '40s', '45', '48', '50', '50s', '60', '600',
       '60s', '80s', '8th', '900', 'aadhi', 'aadi', 'aamani', 'abandoned',
       'abandons', 'abbas', 'abbasi', 'abduct', 'abducted', 'abhay',
       'abhi', 'abhimanyu', 'abhiram', 'abhishek', 'abides', 'abiding',
       'ability', 'able', 'abortion', 'abraham', 'abroad', 'absorbed',
       'abuse', 'academics', 'accept', 'accepting', 'accepts',
       'accessible', 'accident', 'accidental', 'accidentally',
       'accompanied', 'accompany', 'according', 'account', 'accountent'],
      dtype=object)

In [18]:
# using cosine similarity to calculate numeric quantity that denotes similarity 
# between two movies
# you have tfidf vectors the dot product between each vector gives directly\
# cosine similarity score 
# using linear_kernel() function instead of cosine_similarites() since it is faster

In [19]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [20]:
cosine_sim.shape

(1400, 1400)

In [21]:
cosine_sim[1]

array([0.        , 1.        , 0.        , ..., 0.        , 0.03445427,
       0.04135381])

In [22]:
# takes movie title as input and outputs a list of 10 similar movies\
# #you need a mechanism to identify index of movie in dataframe by giving title

In [23]:
indices = pd.Series(data.index, index = data['Movie']).drop_duplicates()
indices

Movie
Bahubali: The Beginning           0
Baahubali 2: The Conclusion       1
1 - Nenokkadine                   2
Dhoom:3                           3
Ra.One                            4
                               ... 
Maro Monagadu                  1395
Jakkanna                       1396
Muvva Gopaludu                 1397
Ninney Ishta Paddaanu          1398
9 Nelalu                       1399
Length: 1400, dtype: int64

In [53]:
def get_recommendations(Movie, cosine_sim = cosine_sim2):
    #Get the index of movie that matches the title
    index = indices[Movie]
    
    #get the pairwise simlarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[index]))
    
    #sort movies based on similarity score
    sim_scores = sorted(sim_scores, key=lambda x:x[1], reverse=True)
    
    #get scores of top 10 similar movies
    sim_scores = sim_scores[1:11]
    
    #get movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    #return top 10 similar movies
    return data['Movie'].iloc[movie_indices]

    

In [35]:
get_recommendations('1 - Nenokkadine')

912                 Mugguru Monagallu
319     Uu Kodathara Ulikki Padathara
558                           Mesthri
801                    Srimannarayana
570                           Keshava
561                  Yuddham Sharanam
238                             Rebel
1045                   Muthyala Muggu
815               Preminchukundam Raa
1151            Chettaniki Kallu Levu
Name: Movie, dtype: object

In [38]:
data[['Movie','Overview','Genre']].head()

Unnamed: 0,Movie,Overview,Genre
0,Bahubali: The Beginning,"In ancient India, an adventurous and darin...","Action, Drama"
1,Baahubali 2: The Conclusion,"When Shiva, the son of Bahubali, learns ab...","Action, Drama"
2,1 - Nenokkadine,A rock star must overcome his psychologica...,"Action, Thriller"
3,Dhoom:3,"When Sahir, a circus entertainer trained i...","Action, Thriller"
4,Ra.One,When the titular antagonist of an action g...,"Action, Adventure, Sci-Fi"


In [41]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ","")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ",""))
        else:
            return ''

In [42]:
features = ['Movie','Overview','Genre']
for feature in features:
    data[feature] = data[feature].apply(clean_data)

In [43]:
def create_soup(x):
    return x['Movie'] + ' ' + x['Overview'] + ' ' + x['Genre']

In [44]:
data['soup'] = data.apply(create_soup,axis=1)

In [61]:
data[['soup']].head()

Unnamed: 0,soup
0,"bahubali:thebeginning inancientindia,anadventu..."
1,"baahubali2:theconclusion whenshiva,thesonofbah..."
2,1-nenokkadine arockstarmustovercomehispsycholo...
3,"dhoom:3 whensahir,acircusentertainertrainedinm..."
4,ra.one whenthetitularantagonistofanactiongamet...


In [62]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['soup'])

In [63]:
count_matrix.shape

(1400, 5434)

In [69]:
data.head()

Unnamed: 0.1,level_0,index,Unnamed: 0,Movie,Year,Certificate,Genre,Overview,Runtime,Rating,No.of.Ratings,soup
0,0,0,0,bahubali:thebeginning,2015.0,UA,"action,drama","inancientindia,anadventurousanddaringmanbecome...",159,8.1,99114,"bahubali:thebeginning inancientindia,anadventu..."
1,1,1,1,baahubali2:theconclusion,2017.0,UA,"action,drama","whenshiva,thesonofbahubali,learnsabouthisherit...",167,8.2,71458,"baahubali2:theconclusion whenshiva,thesonofbah..."
2,2,2,2,1-nenokkadine,2014.0,UA,"action,thriller",arockstarmustovercomehispsychologicalinhibitio...,170,8.1,42372,1-nenokkadine arockstarmustovercomehispsycholo...
3,3,3,3,dhoom:3,2013.0,UA,"action,thriller","whensahir,acircusentertainertrainedinmagicanda...",172,5.4,42112,"dhoom:3 whensahir,acircusentertainertrainedinm..."
4,4,4,4,ra.one,2011.0,U,"action,adventure,sci-fi",whenthetitularantagonistofanactiongametakesonp...,156,4.6,37211,ra.one whenthetitularantagonistofanactiongamet...


In [78]:
lists = [data['Movie'],data['Genre'],data['Overview'],data['soup']]
new_data = pd.DataFrame(lists).transpose()

In [79]:
new_data

Unnamed: 0,Movie,Genre,Overview,soup
0,bahubali:thebeginning,"action,drama","inancientindia,anadventurousanddaringmanbecome...","bahubali:thebeginning inancientindia,anadventu..."
1,baahubali2:theconclusion,"action,drama","whenshiva,thesonofbahubali,learnsabouthisherit...","baahubali2:theconclusion whenshiva,thesonofbah..."
2,1-nenokkadine,"action,thriller",arockstarmustovercomehispsychologicalinhibitio...,1-nenokkadine arockstarmustovercomehispsycholo...
3,dhoom:3,"action,thriller","whensahir,acircusentertainertrainedinmagicanda...","dhoom:3 whensahir,acircusentertainertrainedinm..."
4,ra.one,"action,adventure,sci-fi",whenthetitularantagonistofanactiongametakesonp...,ra.one whenthetitularantagonistofanactiongamet...
...,...,...,...,...
1395,maromonagadu,,,maromonagadu
1396,jakkanna,"comedy,drama",themovieisaboutanattemptbysuniltheprotagonistt...,jakkanna themovieisaboutanattemptbysunilthepro...
1397,muvvagopaludu,"drama,romance","muvvagopaluduisa1987indiantelugufilm,directedb...",muvvagopaludu muvvagopaluduisa1987indiantelugu...
1398,ninneyishtapaddaanu,,"herocharan(tarun)amiddleclassfamilyguy,ischeat...",ninneyishtapaddaanu herocharan(tarun)amiddlecl...


In [86]:
new_data[['soup']].head()

Unnamed: 0,soup
0,"bahubali:thebeginning inancientindia,anadventu..."
1,"baahubali2:theconclusion whenshiva,thesonofbah..."
2,1-nenokkadine arockstarmustovercomehispsycholo...
3,"dhoom:3 whensahir,acircusentertainertrainedinm..."
4,ra.one whenthetitularantagonistofanactiongamet...


In [87]:
from sklearn.feature_extraction.text import CountVectorizer

In [88]:
count1=CountVectorizer(stop_words='english')
count_matrix1=count.fit_transform(new_data['soup'])

In [89]:
count_matrix1.shape

(1400, 5434)

In [90]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix1, count_matrix1)


In [91]:
new_data = new_data.reset_index()
indices = pd.Series(new_data.index, index = new_data['Movie'])

In [98]:
new_data.head(30)

Unnamed: 0,index,Movie,Genre,Overview,soup
0,0,bahubali:thebeginning,"action,drama","inancientindia,anadventurousanddaringmanbecome...","bahubali:thebeginning inancientindia,anadventu..."
1,1,baahubali2:theconclusion,"action,drama","whenshiva,thesonofbahubali,learnsabouthisherit...","baahubali2:theconclusion whenshiva,thesonofbah..."
2,2,1-nenokkadine,"action,thriller",arockstarmustovercomehispsychologicalinhibitio...,1-nenokkadine arockstarmustovercomehispsycholo...
3,3,dhoom:3,"action,thriller","whensahir,acircusentertainertrainedinmagicanda...","dhoom:3 whensahir,acircusentertainertrainedinm..."
4,4,ra.one,"action,adventure,sci-fi",whenthetitularantagonistofanactiongametakesonp...,ra.one whenthetitularantagonistofanactiongamet...
5,5,dhoom:2,"action,thriller","mra,afearlessthief,stealsvaluableartefactsandt...","dhoom:2 mra,afearlessthief,stealsvaluableartef..."
6,6,eega,"action,fantasy",amurderedmanisreincarnatedasahouseflyandseekst...,eega amurderedmanisreincarnatedasahouseflyands...
7,7,krrish3,"action,sci-fi",krrishandhisscientistfatherhavetosavetheworlda...,krrish3 krrishandhisscientistfatherhavetosavet...
8,8,arjunreddy,"action,drama,romance","arjunreddy,ashorttemperedhousesurgeongetsusedt...","arjunreddy arjunreddy,ashorttemperedhousesurge..."
9,9,rangasthalam,"action,drama",thefearofhiselderbrother'sdeathstartstohauntan...,rangasthalam thefearofhiselderbrother'sdeathst...


In [99]:
get_recommendations('war',cosine_sim2)

1243           superpolice
1388         gudacharino.1
19          theghaziattack
32                  spyder
61                 singam2
71                 singam3
108             maanagaram
274     nenerajunenemantri
442                    118
448              aswathama
Name: Movie, dtype: object