### Importing Dependencies

In [4]:
import ast
import calendar

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load the pre-processed movie table (one row per movie) from the dataset folder.
# NOTE(review): the 'Unnamed: 0' column in the output below looks like a row
# index that was written into the CSV by an earlier save — confirm it is unused.
df = pd.read_csv('./dataset/preProcessed.csv')
df.head(3)

Unnamed: 0,id,popularity,release_date,runtime,title,vote_average,vote_count,director,content,posters
0,19995,150.437577,2009-12-10,162.0,Avatar,7.2,11800,['James Cameron'],action adventure fantasy sciencefiction cultur...,https://image.tmdb.org/t/p/w500/kyeqWdyUXW608q...
1,285,139.082615,2007-05-19,169.0,Pirates of the Caribbean: At World's End,6.9,4500,['Gore Verbinski'],adventure fantasy action ocean drugabuse exoti...,https://image.tmdb.org/t/p/w500/jGWpG4YhpQwVmj...
2,206647,107.376788,2015-10-26,148.0,Spectre,6.3,4466,['Sam Mendes'],action adventure crime spy basedonnovel secret...,https://image.tmdb.org/t/p/w500/672kUEMtTHcaVY...


In [6]:
# Turn the raw runtime / release_date / director values into display strings.
# NOTE: the columns are overwritten in place, so this cell is not re-runnable
# on its own — re-run the loading cell first if it has to be executed again.

# calendar.month_abbr has '' at index 0, so month numbers 1-12 index it directly.
# Built once here instead of once per row.
MONTH_ABBR = list(calendar.month_abbr)


def format_runtime(minutes):
    """Render a runtime given in minutes as '<h>hr <m>min' (162.0 -> '2hr 42min').

    Missing values (NaN/None) become 'Unknown' instead of crashing int().
    """
    if pd.isna(minutes):
        return 'Unknown'
    total_minutes = int(minutes)
    return f'{total_minutes // 60}hr {total_minutes % 60}min'


def format_release_date(iso_date):
    """Render a 'YYYY-MM-DD' string as 'Mon, YYYY' ('2009-12-10' -> 'Dec, 2009').

    Missing values (NaN/None) become 'Unknown'.
    """
    if pd.isna(iso_date):
        return 'Unknown'
    parts = iso_date.split('-')
    return f'{MONTH_ABBR[int(parts[1])]}, {parts[0]}'


def first_director(raw):
    """Return the first name from a stringified list such as "['James Cameron']".

    The cell text is parsed with ast.literal_eval, which only accepts Python
    literals, rather than eval(), which would execute arbitrary code found in
    the CSV. It is also parsed once per row instead of twice.
    Empty lists and missing values yield 'Unknown'.
    """
    if pd.isna(raw):
        return 'Unknown'
    names = ast.literal_eval(raw)
    return names[0] if len(names) > 0 else 'Unknown'


# converting runtime
df['runtime'] = df['runtime'].apply(format_runtime)

# converting date
df['release_date'] = df['release_date'].apply(format_release_date)

# converting director
df['director'] = df['director'].apply(first_director)

df.head()

Unnamed: 0,id,popularity,release_date,runtime,title,vote_average,vote_count,director,content,posters
0,19995,150.437577,"Dec, 2009",2hr 42min,Avatar,7.2,11800,James Cameron,action adventure fantasy sciencefiction cultur...,https://image.tmdb.org/t/p/w500/kyeqWdyUXW608q...
1,285,139.082615,"May, 2007",2hr 49min,Pirates of the Caribbean: At World's End,6.9,4500,Gore Verbinski,adventure fantasy action ocean drugabuse exoti...,https://image.tmdb.org/t/p/w500/jGWpG4YhpQwVmj...
2,206647,107.376788,"Oct, 2015",2hr 28min,Spectre,6.3,4466,Sam Mendes,action adventure crime spy basedonnovel secret...,https://image.tmdb.org/t/p/w500/672kUEMtTHcaVY...
3,49026,112.31295,"Jul, 2012",2hr 45min,The Dark Knight Rises,7.6,9106,Christopher Nolan,action crime drama thriller dccomic crimefight...,https://image.tmdb.org/t/p/w500/85cWkCVftiVs0B...
4,49529,43.926995,"Mar, 2012",2hr 12min,John Carter,6.1,2124,Andrew Stanton,action adventure sciencefiction basedonnovel m...,https://image.tmdb.org/t/p/w500/lCxz1Yus07QCQQ...


### Content Based Recommendation

In [7]:
# transforming content to a vector
# Each movie's `content` string becomes one TF-IDF row; the vocabulary is capped
# at the 5000 most frequent terms via max_features.
transformer = TfidfVectorizer(max_features=5000)
# .toarray() turns the sparse TF-IDF result into a dense NumPy array.
# NOTE(review): cosine_similarity also accepts sparse input, so the dense copy
# may be avoidable if nothing else needs X as an ndarray — confirm before changing.
X = transformer.fit_transform(df['content']).toarray()

In [8]:
# similarity matrix
# Pairwise cosine similarity between all rows of X: entry [i, j] compares the
# TF-IDF vector of movie i with that of movie j, giving a square matrix.
similarity_matrix = cosine_similarity(X)
similarity_matrix

array([[1.        , 0.03527661, 0.02779555, ..., 0.02570502, 0.00494201,
        0.        ],
       [0.03527661, 1.        , 0.01993759, ..., 0.02803943, 0.01183318,
        0.        ],
       [0.02779555, 0.01993759, 1.        , ..., 0.02220417, 0.01585927,
        0.        ],
       ...,
       [0.02570502, 0.02803943, 0.02220417, ..., 1.        , 0.01359489,
        0.01943471],
       [0.00494201, 0.01183318, 0.01585927, ..., 0.01359489, 1.        ,
        0.01214987],
       [0.        , 0.        , 0.        , ..., 0.01943471, 0.01214987,
        1.        ]])

In [9]:
# Sanity check: (number of movies, number of TF-IDF features).
X.shape

(4799, 5000)

In [10]:
# converting the matrix in the dataframe
# Rows and columns get the default 0..n-1 integer labels, so label k lines up
# with the k-th row of X (and therefore the k-th row of df).
similarity_score = pd.DataFrame(similarity_matrix)
similarity_score

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4789,4790,4791,4792,4793,4794,4795,4796,4797,4798
0,1.000000,0.035277,0.027796,0.036023,0.107565,0.065098,0.004491,0.072805,0.017007,0.028110,...,0.010578,0.000000,0.020735,0.039442,0.000000,0.003766,0.005886,0.025705,0.004942,0.000000
1,0.035277,1.000000,0.019938,0.012100,0.068585,0.051036,0.024355,0.092917,0.051114,0.035738,...,0.011318,0.000000,0.016214,0.000000,0.000000,0.011843,0.000000,0.028039,0.011833,0.000000
2,0.027796,0.019938,1.000000,0.013003,0.039114,0.059314,0.005065,0.087673,0.068992,0.015218,...,0.058997,0.000000,0.000000,0.000000,0.012357,0.023389,0.000000,0.022204,0.015859,0.000000
3,0.036023,0.012100,0.013003,1.000000,0.011385,0.015647,0.015222,0.046982,0.050088,0.237385,...,0.002355,0.004863,0.019673,0.004455,0.001941,0.011743,0.000000,0.020186,0.051998,0.023149
4,0.107565,0.068585,0.039114,0.011385,1.000000,0.023577,0.042714,0.073302,0.005882,0.023218,...,0.018703,0.000000,0.000000,0.007330,0.000000,0.007593,0.000000,0.011341,0.004553,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4794,0.003766,0.011843,0.023389,0.011743,0.007593,0.014197,0.021498,0.029921,0.029332,0.006694,...,0.000000,0.006072,0.028438,0.022874,0.017806,1.000000,0.000000,0.004744,0.004204,0.017946
4795,0.005886,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.005878,0.000000,0.000000,0.024530,0.000000,1.000000,0.008338,0.005131,0.000000
4796,0.025705,0.028039,0.022204,0.020186,0.011341,0.025546,0.013443,0.000000,0.028137,0.013618,...,0.012658,0.005054,0.020802,0.032306,0.012039,0.004744,0.008338,1.000000,0.013595,0.019435
4797,0.004942,0.011833,0.015859,0.051998,0.004553,0.000000,0.020532,0.011051,0.013779,0.010715,...,0.000000,0.000000,0.009270,0.000000,0.009550,0.004204,0.005131,0.013595,1.000000,0.012150


In [11]:
# function for the recommendation
def recommender(movie_name, top_n=9, movies=None, similarity=None):
    """Return the titles of the movies most similar to ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Exact value to look up in the ``title`` column. When several rows
        share the title, the first one is used.
    top_n : int, default 9
        How many recommendations to return (9 was previously hard-coded).
    movies : pandas.DataFrame, optional
        Table with a ``title`` column. Defaults to the notebook-level ``df``.
    similarity : pandas.DataFrame, optional
        Square similarity table whose i-th row/column corresponds to the i-th
        row of ``movies``. Defaults to the notebook-level ``similarity_score``.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar, indexed by the row labels
        of ``movies``. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If no row has the requested title (same exception type as before,
        now with an explanatory message).
    ValueError
        If ``top_n`` is negative.
    """
    movies = df if movies is None else movies
    similarity = similarity_score if similarity is None else similarity

    if top_n < 0:
        raise ValueError(f'top_n must be non-negative, got {top_n}')

    # Work with row *positions* throughout: the similarity table is aligned to
    # the row order of `movies`, not to its index labels.
    hits = (movies['title'] == movie_name).to_numpy().nonzero()[0]
    if len(hits) == 0:
        raise IndexError(f'No movie titled {movie_name!r} was found')
    pos = int(hits[0])

    # Relabel the row 0..n-1 so the labels are positions regardless of how the
    # similarity table's columns are labelled (ints, strings from a CSV, ...).
    scores = similarity.iloc[pos].reset_index(drop=True)

    # Remove the query movie explicitly instead of assuming it sorts first;
    # ties at the top score or an all-zero row would otherwise let it leak
    # into the results. A stable sort keeps tie order deterministic.
    ranked = scores.drop(pos).sort_values(ascending=False, kind='mergesort')

    top_positions = ranked.index[:top_n].to_numpy()
    return movies.iloc[top_positions]['title']

In [14]:
# Example lookup: the title must match the `title` column exactly.
recommender("Harry Potter and the Chamber of Secrets")

191      Harry Potter and the Prisoner of Azkaban
113     Harry Potter and the Order of the Phoenix
197      Harry Potter and the Philosopher's Stone
8          Harry Potter and the Half-Blood Prince
114           Harry Potter and the Goblet of Fire
2568                                    The Craft
743                               Practical Magic
195       Night at the Museum: Secret of the Tomb
438                        Something's Gotta Give
Name: title, dtype: object

### Most Popular Movies

In [15]:
# Order all movies from most to least popular and keep the leading 50 rows.
top_50 = (
    df
    .sort_values('popularity', ascending=False)
    .iloc[:50]
)
top_50.head()

Unnamed: 0,id,popularity,release_date,runtime,title,vote_average,vote_count,director,content,posters
546,211672,875.581305,"Jun, 2015",1hr 31min,Minions,6.4,4571,Kyle Balda,family animation adventure comedy assistant af...,https://image.tmdb.org/t/p/w500/vlOgaxUiMOA8sP...
95,157336,724.247784,"Nov, 2014",2hr 49min,Interstellar,8.1,10867,Christopher Nolan,adventure drama sciencefiction savingtheworld ...,https://image.tmdb.org/t/p/w500/gEU2QniE6E77NI...
788,293660,514.569956,"Feb, 2016",1hr 48min,Deadpool,7.4,10995,Tim Miller,action adventure comedy antihero mercenary mar...,https://image.tmdb.org/t/p/w500/fSRb7vyIP8rQpL...
94,118340,481.098624,"Jul, 2014",2hr 1min,Guardians of the Galaxy,7.9,9742,James Gunn,action sciencefiction adventure marvelcomic sp...,https://image.tmdb.org/t/p/w500/r7vmZjiyZw9rpJ...
127,76341,434.278564,"May, 2015",2hr 0min,Mad Max: Fury Road,7.2,9427,George Miller,action adventure sciencefiction thriller futur...,https://image.tmdb.org/t/p/w500/hA2ple9q4qnwxp...


### Saving the resources 

In [16]:
# saving the movies dataframe
# index=False: the frame already carries an 'Unnamed: 0' column left over from
# an earlier save that included the row index; writing the index again would
# add one more unnamed column to the file on every run. This also matches the
# two saves below.
# NOTE(review): this overwrites the same file the notebook loads at the top,
# now holding the *formatted* runtime / release_date / director strings, so a
# second Restart & Run All would feed already-formatted values into the
# conversion cell. Consider writing to a separate output path.
df.to_csv('./dataset/preProcessed.csv', index=False)

# saving the similarity scores 
similarity_score.to_csv('./dataset/similarity_score.csv', index=False)

# saving the top 50 popular movies
top_50.to_csv('./dataset/top_50_movies.csv', index=False)