In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import openpyxl
import requests
import joblib
import pickle
import pyarrow
import warnings
warnings.filterwarnings("ignore")


In [3]:
movies_data = pd.read_excel("movies_data.xlsx")

In [4]:
movies_data = movies_data.drop_duplicates()

In [5]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit([[str(year)] for year in set(movies_data['Release Year']).union({'Unknown'})])
encoded_year = encoder.transform([[str(year)] for year in movies_data['Release Year']])
encoded_df = pd.DataFrame(encoded_year, columns=encoder.get_feature_names_out(['Release Year']))
movies_data.reset_index(drop=True, inplace=True)
encoded_df.reset_index(drop=True, inplace=True)
movies_encoded = pd.concat([movies_data, encoded_df], axis=1)

In [6]:
movies_encoded['Genres'] = movies_encoded['Genres'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])

In [7]:
existing_genres = set([genre for sublist in movies_encoded['Genres'] for genre in sublist])
mlb = MultiLabelBinarizer(classes=list(existing_genres) + ['Unknown_Genre'])
genres = mlb.fit_transform(movies_encoded["Genres"])
encoded_df2 = pd.DataFrame(genres,columns=mlb.classes_)
movies_encoded.reset_index(drop=True, inplace=True)
encoded_df2.reset_index(drop=True, inplace=True)
movies_encoded2 = pd.concat([movies_encoded,encoded_df2],axis=1)

In [8]:
movies_encoded2['Cast'] = movies_encoded2['Cast'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])

In [9]:
existing_cast = set([cast for sublist in movies_encoded2['Cast'] for cast in sublist])
mlb2 = MultiLabelBinarizer(classes=list(existing_cast) + ['Unknown_Cast'])
cast = mlb2.fit_transform(movies_encoded2["Cast"])
encoded_df3 = pd.DataFrame(cast,columns=mlb2.classes_)
movies_encoded2.reset_index(drop=True, inplace=True)
encoded_df3.reset_index(drop=True, inplace=True)
movies_encoded3 = pd.concat([movies_encoded2,encoded_df3],axis=1)


In [10]:
movies_encoded3["Crew"] = movies_encoded3["Crew"].apply(lambda x: x.split(', ') if isinstance(x, str) else []).apply(lambda x: x if isinstance(x, list) else [x])

In [11]:
existing_crew = set([crew for sublist in movies_encoded3['Crew'] for crew in sublist])
mlb3 = MultiLabelBinarizer(classes=list(existing_crew) + ['Unknown_Crew'])
crew = mlb3.fit_transform(movies_encoded3["Crew"])
encoded_df4 = pd.DataFrame(crew,columns=mlb3.classes_)
movies_encoded3.reset_index(drop=True, inplace=True)
encoded_df4.reset_index(drop=True, inplace=True)
movies_encoded4 = pd.concat([movies_encoded3,encoded_df4],axis=1)

In [12]:
movies_encoded4["Keywords"]=movies_encoded4["Keywords"].apply(lambda x: x.split(", ") if isinstance(x,str) else [])

In [13]:
existing_keyw = set([keyw for sublist in movies_encoded4['Keywords'] for keyw in sublist])
mlb4 = MultiLabelBinarizer(classes=list(existing_keyw) + ['Unknown_Keyw'])
keyw = mlb4.fit_transform(movies_encoded4["Keywords"])
encoded_df5 = pd.DataFrame(keyw,columns=mlb4.classes_)
movies_encoded4.reset_index(drop=True, inplace=True)
encoded_df5.reset_index(drop=True, inplace=True)
movies_encoded5 = pd.concat([movies_encoded4,encoded_df5],axis=1)

In [14]:
movies_encoded5["Overview"] = movies_encoded5["Overview"].fillna("")

In [15]:
tf = TfidfVectorizer()
X = tf.fit_transform(movies_encoded5["Overview"])
overview = pd.DataFrame(X.toarray(),columns=tf.get_feature_names_out())
movies_encoded5.reset_index(drop=True, inplace=True)
overview.reset_index(drop=True, inplace=True)
movies_encoded_final = pd.concat([movies_encoded5,overview],axis=1)

In [16]:

features = movies_encoded_final.iloc[:, 9:] 
features_matrix = features.values

In [17]:
features_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(6703, 47266))

In [18]:
features_matrix.shape

(6703, 47266)

In [19]:
similarity_matrix = cosine_similarity(features_matrix)

In [20]:
def recommend_movie(movie_index, top_n=5):
    similarity_scores = similarity_matrix[movie_index]
    similar_movie_indices = similarity_scores.argsort()[-top_n-1:-1][::-1]
    return movies_encoded_final['Name'].iloc[similar_movie_indices]

recommended_movies = recommend_movie(3, top_n=5)
print("Recommended Movies:", recommended_movies)


Recommended Movies: 96              Mister
4754    Radhaamadhavam
4614             Meter
436                LIE
191         Mr. Kalyan
Name: Name, dtype: object


In [21]:
def SearchMovie(Name):
     BEARER_TOKEN = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI5MTcyYWVkNzU5YTA4Yzg3NzkzM2FlMzMxYmZjZTNkMiIsIm5iZiI6MTczMTEyMzMwNS43MDUsInN1YiI6IjY3MmVkODY5OWZkZGU4YzRiODhiY2E4NyIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.gWrxgrIbl2zG3D4Xd3Ea_VTPrmW4BW5RRe4tZtu2AKg"
     url = f"https://api.themoviedb.org/3/search/movie?query={Name}&include_adult=false&language=en-US&page=1"
     headers = {"accept": "application/json", "Authorization": f"Bearer {BEARER_TOKEN}"}
     response = requests.get(url, headers=headers)
     if response.json()['results']:
          movie_id=response.json()['results'][0]['id']
          url = f"https://api.themoviedb.org/3/movie/{movie_id}?append_to_response=credits%2Ckeywords&language=en-US"
          headers = {"accept": "application/json", "Authorization": f"Bearer {BEARER_TOKEN}"}
          response = requests.get(url, headers=headers)
          return response.json()
     else:
          print("Sorry Movie not Found")


In [22]:
def embed(resp):
     d1=encoder.transform([[resp["release_date"][0:4]]])
     genres = [item["name"] for item in resp["genres"]]
     d2=mlb.transform([genres])
     cast = [item["name"] for item in resp["credits"]["cast"]]
     d3=mlb2.transform([cast])
     crew = [item["name"] for item in resp["credits"]["crew"] if item['job']=='Director']
     d4=mlb3.transform([crew])
     keyword = [item["name"] for item in resp["keywords"]["keywords"]]
     d5=mlb4.transform([keyword])
     d6=tf.transform([resp["overview"]])

     embedding = np.concatenate((d1,d2,d3,d4,d5,d6.todense()),axis=1)
     return embedding
     
     

In [23]:
def recommend_combined_movies(input_movies, top_n=10):
    input_movie_embeddings = np.asarray(np.vstack([embed(movie) for movie in input_movies]))
    similarity_scores = cosine_similarity(input_movie_embeddings, features_matrix)
    aggregated_similarity = similarity_scores.sum(axis=0)
    similar_movie_indices = aggregated_similarity.argsort()[-top_n-1:-1][::-1]
    recommended_movies = movies_encoded['Name'].iloc[similar_movie_indices].tolist()
    return recommended_movies

In [30]:
def recommend_combined_movies(input_movies, priorities=None, top_n=10):
    if priorities is None:
        priorities = [1] * len(input_movies)  
    
    input_movie_embeddings = np.asarray(np.vstack([embed(movie) for movie in input_movies]))
    similarity_scores = cosine_similarity(input_movie_embeddings, features_matrix)

    weighted_similarity = (similarity_scores.T * np.array(priorities)).T  
    aggregated_similarity = weighted_similarity.sum(axis=0)

    similar_movie_indices = aggregated_similarity.argsort()[-top_n-1:-1][::-1]
    recommended_movies = movies_encoded['Name'].iloc[similar_movie_indices].tolist()

    return recommended_movies

In [25]:
def get_from_Single(Resp):    
     movie_embedding = np.asarray(embed(Resp))
     similarity_scores = cosine_similarity(movie_embedding, features_matrix)

     similar_movie_indices = similarity_scores.argsort()[0, -5:][::-1]  

     recommended_movies = movies_encoded['Name'].iloc[similar_movie_indices]
     print("Recommended Movies:", recommended_movies)

In [31]:
Resp = SearchMovie("Ala Vai")
Resp1 = SearchMovie("Son of Sat")
Resp2 = SearchMovie("Dookudu")

a=recommend_combined_movies([Resp,Resp1,Resp2],[7,5,1])

In [32]:
a

['Son of Satyamurthy',
 'DJ: Duvvada Jagannadham',
 'Wanted PanduGod',
 'Dookudu',
 'Julayi',
 'Aagadu',
 'Badrinath',
 'Akhil',
 'Jalsa',
 'Guntur Kaaram']

In [28]:
joblib.dump(encoder,"encoder.joblib")
joblib.dump(mlb,"mlb.joblib")
joblib.dump(mlb2,"mlb2.joblib")
joblib.dump(mlb3,"mlb3.joblib")
joblib.dump(mlb4,"mlb4.joblib")
joblib.dump(tf,"tf.joblib")

['tf.joblib']

In [33]:
np.save("features_matrix.npy", features_matrix)
movies_encoded.to_csv("movies_encoded.csv", index=False)
