In [177]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [178]:
# Load the movies table from a CSV path relative to the notebook's working directory
movies = pd.read_csv("dataset/ml-latest/movies.csv")

In [None]:
# Preview the first rows of the freshly loaded frame
movies.head()

In [180]:
import re

def clean_text(text, replace_character=""):
    """
    Replace every character that is not an ASCII letter, a digit or a space.

    :param text: value to clean; ``None`` and float NaN are treated as empty
        text, any other non-string value is converted with ``str()`` first
    :param replace_character: string substituted for each removed character
        (the default empty string simply drops the character)
    :return: the cleaned string
    """
    # Missing cells (None / NaN) would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    return re.sub("[^a-zA-Z0-9 ]", replace_character, text)

In [None]:
# Swap each non-alphanumeric character in genres for a space (instead of dropping it)
# so that neighbouring words are not fused together.
movies["genres"] = movies["genres"].apply(clean_text, replace_character=" ")
movies.head()

In [182]:
def concatenate_columns(df: pd.DataFrame, cols_list: list) -> pd.Series:
    """
    Build one text value per row by joining the chosen columns with a single space.

    :param df: source DataFrame
    :param cols_list: names of the columns to join, in the order they should appear
    :return: Series aligned with ``df``'s index holding the joined strings
    """
    def _join_row(row):
        # Cast every cell of the row to str before joining.
        return ' '.join(row.values.astype(str))

    selected = df[cols_list]
    return selected.apply(_join_row, axis=1)

In [183]:
# Merge title and genres into one space-separated text column
movies["title_genres"] = concatenate_columns(df=movies, cols_list=["title", "genres"])

In [184]:
# Strip non-alphanumeric characters from the combined text and from the bare title
movies["clean_title_genres"] = movies["title_genres"].map(clean_text)
movies["clean_title"] = movies["title"].map(clean_text)

In [None]:
# Preview the frame again, now including the derived text columns
movies.head()

In [186]:

# Initialize TfidfVectorizer; ngram_range=(1, 4) extracts word n-grams of length 1 through 4
vectorizer = TfidfVectorizer(ngram_range=(1, 4))

# Learn the vocabulary from the cleaned title+genres text and encode each movie as one TF-IDF row
tfidf = vectorizer.fit_transform(movies["clean_title_genres"])

In [187]:
def search(title, top_n=5):
    """
    Return the movies whose title+genres text is most similar to the given title.

    Relies on the notebook-level ``vectorizer``, ``tfidf`` and ``movies`` objects.

    :param title: movie title to look up (cleaned with ``clean_text`` before matching)
    :param top_n: maximum number of movies to return (defaults to 5)
    :return: dataframe with up to ``top_n`` rows of ``movies``, most similar first
    """
    title = clean_text(title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # argpartition raises if asked for more elements than exist, so clamp first.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the largest
    # scores, not that they are ordered, so rank that small subset explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [None]:
# Example query: show the movies that best match this title
search("The blind side")