# Recommendation System Assignment | IMDB Top 250 Movies Dataset

## Import Libraries

In [34]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Load the Dataset

In [35]:
df = pd.read_csv(r"C:\Ozy\Data Science\DS datasets\movies.csv")
df.head()

Unnamed: 0,rank,movie_id,title,year,link,imbd_votes,imbd_rating,certificate,duration,genre,...,director_id,director_name,writer_id,writer_name,storyline,user_id,user_name,review_id,review_title,review_content
0,1,tt0111161,The Shawshank Redemption,1994,https://www.imdb.com/title/tt0111161,2711075,9.3,R,2h 22m,Drama,...,nm0001104,Frank Darabont,"nm0000175,nm0001104","Stephen King,Frank Darabont","Over the course of several years, two convicts...","ur16161013,ur15311310,ur0265899,ur16117882,ur1...","hitchcockthelegend,Sleepin_Dragon,EyeDunno,ale...","rw2284594,rw6606154,rw1221355,rw1822343,rw1288...","Some birds aren't meant to be caged.,An incred...",The Shawshank Redemption is written and direct...
1,2,tt0068646,The Godfather,1972,https://www.imdb.com/title/tt0068646,1882829,9.2,R,2h 55m,"Crime,Drama",...,nm0000338,Francis Ford Coppola,"nm0701374,nm0000338","Mario Puzo,Francis Ford Coppola",The aging patriarch of an organized crime dyna...,"ur24740649,ur86182727,ur15794099,ur15311310,ur...","CalRhys,andrewburgereviews,gogoschka-1,Sleepin...","rw3038370,rw4756923,rw4059579,rw6568526,rw1897...","The Pinnacle Of Flawless Films!,An offer so go...",'The Godfather' is the pinnacle of flawless fi...
2,3,tt0468569,The Dark Knight,2008,https://www.imdb.com/title/tt0468569,2684051,9.0,PG-13,2h 32m,"Action,Crime,Drama",...,nm0634240,Christopher Nolan,"tt0468569,nm0634300,nm0634240,nm0275286,tt0468569","Writers,Jonathan Nolan,Christopher Nolan,David...",When the menace known as the Joker wreaks havo...,"ur87850731,ur1293485,ur129557514,ur12449122,ur...","MrHeraclius,Smells_Like_Cheese,dseferaj,little...","rw5478826,rw1914442,rw6606026,rw1917099,rw5170...","The Dark Knight,The Batman of our dreams! So m...","Confidently directed, dark, brooding, and pack..."
3,4,tt0071562,The Godfather Part II,1974,https://www.imdb.com/title/tt0071562,1285350,9.0,R,3h 22m,"Crime,Drama",...,nm0000338,Francis Ford Coppola,"nm0000338,nm0701374","Francis Ford Coppola,Mario Puzo",The early life and career of Vito Corleone in ...,"ur0176092,ur0688559,ur92260614,ur0200644,ur117...","Nazi_Fighter_David,tfrizzell,umunir-36959,DanB...","rw0135607,rw0135487,rw5049900,rw0135526,rw0135...",Breathtaking in its scope and tragic grandeur....,"Coppola's masterpiece is rivaled only by ""The ..."
4,5,tt0050083,12 Angry Men,1957,https://www.imdb.com/title/tt0050083,800954,9.0,Approved,1h 36m,"Crime,Drama",...,nm0001486,Sidney Lumet,nm0741627,Reginald Rose,The jury in a New York City murder trial is fr...,"ur1318549,ur0643062,ur0688559,ur20552756,ur945...","uds3,tedg,tfrizzell,TheLittleSongbird,henrique...","rw0060044,rw0060025,rw0060034,rw2262425,rw5448...","The over-used term ""classic movie"" really come...",This once-in-a-generation masterpiece simply h...


## Basic Information of Dataset

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rank            250 non-null    int64  
 1   movie_id        250 non-null    object 
 2   title           250 non-null    object 
 3   year            250 non-null    int64  
 4   link            250 non-null    object 
 5   imbd_votes      250 non-null    object 
 6   imbd_rating     250 non-null    float64
 7   certificate     249 non-null    object 
 8   duration        250 non-null    object 
 9   genre           250 non-null    object 
 10  cast_id         250 non-null    object 
 11  cast_name       250 non-null    object 
 12  director_id     250 non-null    object 
 13  director_name   250 non-null    object 
 14  writer_id       250 non-null    object 
 15  writer_name     250 non-null    object 
 16  storyline       250 non-null    object 
 17  user_id         250 non-null    obj

## Create the Recommendation Model

### We will use a content-based filtering approach, focusing on the genre and IMDB ratings.

In [37]:
# Combine relevant features into a single string
df['features'] = df['genre'] + ' ' + df['imbd_rating'].astype(str)

## Initialize the TF-IDF Vectorizer

In [38]:
tfidf_vectorizer = TfidfVectorizer()

## Transform the combined features into a TF-IDF matrix

In [39]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['features'])

## Calculate cosine similarity between the movies

In [40]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.46272942, 0.34511079, ..., 0.        , 0.25123808,
        0.2566824 ],
       [0.46272942, 1.        , 0.74581553, ..., 0.        , 0.11625525,
        0.1187745 ],
       [0.34511079, 0.74581553, 1.        , ..., 0.        , 0.08670497,
        0.08858386],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.22589745],
       [0.25123808, 0.11625525, 0.08670497, ..., 0.        , 1.        ,
        0.06448839],
       [0.2566824 , 0.1187745 , 0.08858386, ..., 0.22589745, 0.06448839,
        1.        ]])

## Function to get movie recommendations based on title

In [41]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = df.index[df['title'] == title][0]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

- Combine relevant features into a single string.
- Use the TF-IDF vectorizer to transform these features into a matrix of TF-IDF features.
- Calculate the cosine similarity between these TF-IDF matrices.
- Define a function get_recommendations that takes a movie title as input and returns the top 10 most similar movies based on cosine similarity scores.

 ## Test the recommendation function

In [42]:
get_recommendations('The Shawshank Redemption')

11                          Fight Club
17     One Flew Over the Cuckoo's Nest
66                     American Beauty
84                 Requiem for a Dream
87                           Capernaum
93                            The Hunt
98                               Ikiru
113                       A Separation
120                    Bicycle Thieves
130                      All About Eve
Name: title, dtype: object

- This basic recommendation system uses content-based filtering to suggest movies similar to a given movie title. 