### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

### Reading Dataset

In [2]:
# Load the movie catalogue; the path is relative to the notebook's working directory
movie_df = pd.read_csv(filepath_or_buffer='movies.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded
movie_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Feature Engineering

In [4]:
# Remove the release year from the 'title' column, e.g. "Toy Story (1995)" -> "Toy Story ".
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would silently leave every title unchanged.
# The raw string keeps '\(' and '\d' from being read as (invalid) string escapes.
movie_df['title'] = movie_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)

# Remove the trailing space left behind once the year is gone
movie_df['title'] = movie_df['title'].str.rstrip()

# Split the '|'-separated genre string into a list of genre names.
# NOTE: this overwrites 'genres' in place, so run this cell only once per kernel
# session (re-running it would try to split lists instead of strings).
movie_df['genres'] = movie_df['genres'].str.split('|')

In [5]:
# Work on a copy so that movie_df keeps only its original columns
movie_mod = movie_df.copy()

# Row labels of every movie, in iteration order
x = list(movie_df.index)

# One-hot encode the genres: for each movie, write a 1 into the column named
# after every genre it lists (columns are created the first time a genre is seen)
for row_label, genre_list in movie_df['genres'].items():
    for genre_name in genre_list:
        movie_mod.at[row_label, genre_name] = 1

In [45]:
# Replace missing values (NaN) with 0, then keep only the genre indicator
# columns by dropping the descriptive ones
movie_mod.fillna(value=0, inplace=True)
non_genre_columns = ['movieId', 'title', 'genres']
all_genres = movie_mod.drop(columns=non_genre_columns)
all_genres.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cosine Similarity

Cosine similarity is widely used to compute the similarity of documents once they are represented as numerical vectors (for example word counts or TF-IDF weights). <br>
As its name suggests, cosine similarity tells how similar two texts are from how their words occur: the more word occurrences they share, the more similar their vectors are. <br>
Basically, cosine similarity measures the similarity between two vectors using the angle between them, <br>determining how close or far apart the directions they point in are, in a high-dimensional space that is hard to visualize.

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the genre vectors of all movies;
# when only one matrix is given it is compared against itself
cs = cosine_similarity(all_genres)
print(cs[0:5])

[[1.         0.77459667 0.31622777 ... 0.         0.31622777 0.4472136 ]
 [0.77459667 1.         0.         ... 0.         0.         0.        ]
 [0.31622777 0.         1.         ... 0.         0.         0.70710678]
 [0.25819889 0.         0.81649658 ... 0.57735027 0.         0.57735027]
 [0.4472136  0.         0.70710678 ... 0.         0.         1.        ]]


In [18]:
# Lookup table: movie title -> row position in movie_df.
# De-duplicate on the titles (the Series index), keeping the first occurrence.
# Series.drop_duplicates() would compare the *values* (row positions), which are
# always unique, so repeated titles (e.g. remakes, once the year is stripped)
# would survive and make indices[title] return several rows instead of one.
indices = pd.Series(movie_df.index, index=movie_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [34]:
def recommend_for(title, cs=cs, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title exactly as stored in ``indices`` (year removed).
    cs : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarity of movie ``i``
        to every movie. Defaults to the matrix computed earlier in the notebook.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first
        (ties keep their original row order).

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Row position of the requested movie
    idx = indices[title]

    # A title that occurs several times yields several positions; use the first
    if not np.isscalar(idx):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity of the requested movie to every movie
    scores = np.asarray(cs[idx])

    # Most similar first; a stable sort keeps row order among equal scores
    order = np.argsort(-scores, kind='stable')

    # Exclude the requested movie by position. It is not guaranteed to sort
    # first: any movie with an identical genre set also scores 1.0, so simply
    # skipping the first result could drop a real match and recommend the
    # requested movie to itself.
    order = order[order != idx][:max(top_n, 0)]

    # Titles of the selected movies
    return movie_mod['title'].iloc[order]

In [37]:
# Example: recommendations for a single title
recommend_for(title='Jurassic Park')

615       Independence Day (a.k.a. ID4)
656                    Escape from L.A.
856                          Abyss, The
858                Escape from New York
1044           Star Trek: First Contact
1057    Star Trek II: The Wrath of Khan
1164     Lost World: Jurassic Park, The
1194                              Spawn
2193                       Total Recall
2712                          Moonraker
Name: title, dtype: object

### Natural Language Processing

If the dataset has more diverse features, such as a movie's 'overview', 'director', 'actor/actress' etc., then the data can also be processed with <br>
Natural Language Processing to focus on and process the important pieces of data rather than analyzing the large amount of data as it is.

TfidfVectorizer is a combination of two different models - CountVectorizer & TfidfTransformer.
It converts text into numerical form<br> and assigns a weight to each word according to its importance relative to the other words.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Parameter notes:
#   analyzer    - whether features are built from words ('word') or characters
#   stop_words  - drops insignificant words such as 'a', 'an', 'is'
#   ngram_range - (not set here) sizes of the word groups used as features,
#                 e.g. 'active passive voice' is a trigram, 'passive voice' a
#                 bigram and 'voice' a unigram
#   min_df      - (not set here) minimum number of documents a term must occur
#                 in to be kept
tfv = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    max_features=None,
)

In [None]:
# Fit the vectorizer and build the TF-IDF matrix from the genres.
# 'movies_cleaned_df' is never defined in this notebook, so the frame loaded
# above (movie_df) is used instead. Its 'genres' column holds lists of genre
# names after the split in the feature-engineering step, while the vectorizer
# expects one string per document, so each list is joined back with spaces.
tfv_matrix = tfv.fit_transform(movie_df['genres'].str.join(' '))

A sigmoid kernel can also be used instead of cosine similarity; it computes the sigmoid kernel between two vectors and
can also be seen in <br>
neural network architectures in the role of a neuron activation function.

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Sigmoid kernel of the TF-IDF rows against themselves (Y defaults to X)
sig = sigmoid_kernel(tfv_matrix)