# Content-Based Movie Recommendation

Dataset Source:

https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots/

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Render DataFrames in full: a limit of None disables truncation of both
# columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Dataset

In [None]:
# NOTE: absolute, machine-specific location of the downloaded CSV; point this
# at your own copy of the file when running elsewhere.
csv_path = "C:\\Users\\14385\\Desktop\\GitHub Projects\\wiki_movie_plots_deduped.csv"
df = pd.read_csv(csv_path)
df.shape

In [None]:
# Eyeball five randomly drawn rows of the freshly loaded data.
df.sample(5)

## Preprocessing

### Converting to String

In [None]:
# List the column labels available in the dataset.
df.columns

In [None]:
# Record which cells are missing *before* the cast: astype(str) would turn
# every NaN into the literal text "nan", which would then leak into the
# cleaned columns and the merged document.
missing_cells = df.isna()

# Cast everything to text and blank out the cells that were missing.
df = df.astype(str).mask(missing_cells, "")
df.dtypes

### Data Cleaning

Action Items:

* Lower-Case the whole data frame
* Director: Removing 'Director:' and 'Cast:'
* Director, Cast: Removing '\r\n', '\n' and '\r'

* Genre: Replacing '/' with Space
* Director, Cast, Genre: Removing 'Unknown' and 'NaN'

* Director: Separating Directors and Actors names
* Director, Cast: Checking if the names are separated with ' and ', ' & '

* Director, Cast: Merging the first names and last names together
* Director, Cast: Adding the words 'director' and 'actor' as prefixes

* Plot: Removing English Stopwords
* Doc: Removing special characters

In [None]:
def _lower_if_str(value):
    """Return *value* lower-cased when it is a string, unchanged otherwise."""
    return value.lower() if isinstance(value, str) else value


# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; use the new name when it exists and fall back otherwise so
# the cell runs on both old and new pandas versions.
_elementwise_method = "map" if hasattr(pd.DataFrame, "map") else "applymap"
df = getattr(df, _elementwise_method)(_lower_if_str)

In [None]:
# Columns that hold lists of person names.
name_columns = ["Director", "Cast"]

# Strip the literal labels "director:" and "cast:" from the Director column.
for label in ("director:", "cast:"):
    df["Director"] = df["Director"].str.replace(label, "", regex=False)

# Turn every kind of line break (\r\n, lone \r, lone \n) into a space so the
# separator handling below sees one uniform delimiter.
for col in name_columns:
    df[col] = df[col].str.replace(r"\r\n|\r|\n", " ", regex=True)

# Genres such as "a/b" become two separate words.
df["Genre"] = df["Genre"].str.replace("/", " ", regex=False)

for col in ["Director", "Cast", "Genre"]:
    # Remove the placeholder "unknown" only as a whole word, so it is never
    # cut out of the middle of a longer word.
    df[col] = df[col].str.replace(r"\bunknown\b", "", regex=True)
    # "nan" is what astype(str) makes of a missing value. Blank the cell only
    # when that is its entire content; a plain substring replace would also
    # mangle names such as "fernando" or "nancy".
    df[col] = df[col].mask(df[col].str.strip() == "nan", "")

for col in name_columns:
    names = df[col]
    # " and " / " & " separate names just like a comma does.
    for separator in (" and ", " & "):
        names = names.str.replace(separator, ",", regex=False)
    # Glue first and last names together ("john doe" -> "johndoe"), then let
    # the commas become the spaces that separate the per-person tokens.
    names = names.str.replace(" ", "", regex=False)
    names = names.str.replace(",", " ", regex=False)
    # Empty list entries (e.g. a trailing comma or a removed "unknown") leave
    # leading, trailing or doubled spaces; collapse them so that every space
    # separates exactly two real names.
    df[col] = names.str.split().str.join(" ")


In [None]:
def _prefix_names(names, prefix):
    """Return *names* with *prefix* glued onto every whitespace-separated token.

    Splitting on whitespace (instead of replacing each single space) means
    that leading, trailing or repeated spaces never produce a bare *prefix*
    token, and an empty string stays empty.
    """
    return " ".join(prefix + name for name in names.split())


# Tag every name with its role so that e.g. a director and an actor with the
# same name become different tokens in the merged document.
df["Director"] = df["Director"].map(lambda names: _prefix_names(names, "director"))
df["Cast"] = df["Cast"].map(lambda names: _prefix_names(names, "actor"))


In [None]:
# Spot-check five random rows after the cleaning steps.
df.sample(5)

### Merging the document

In [None]:
# How many times each column's text is repeated in the merged document; a
# larger number gives that column's terms more occurrences.
column_weights = {
    "Release Year": 10,
    "Title": 1,
    "Origin/Ethnicity": 5,
    "Director": 5,
    "Cast": 1,
    "Genre": 10,
    "Plot": 1,
}

# Start from an empty string and append each column's text (plus a trailing
# space as separator), repeated `weight` times, in dictionary order.
merged_doc = ""
for column_name, weight in column_weights.items():
    merged_doc = merged_doc + weight * (df[column_name] + ' ')

df["doc"] = merged_doc

df.head()

### Removing Special Characters

In [None]:
# Convert any run of whitespace (line breaks, tabs, ...) to a single space
# first: the character filter below deletes everything that is not a
# lower-case letter, digit or space, so an unconverted line break would glue
# the word before it to the word after it.
df["doc"] = df["doc"].str.replace(r"\s+", " ", regex=True)

# Drop every remaining character that is not a-z, 0-9 or a space.
df["doc"] = df["doc"].str.replace("[^a-z 0-9]+", "", regex=True)

In [None]:
# Show full cell contents (no column-width truncation) so the merged
# documents can be read in their entirety.
pd.set_option('display.max_colwidth', None)
# Inspect ten randomly chosen merged documents.
df[["doc"]].sample(10)

### Removing Stopwords

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords

In [None]:
# English stop-word list from the NLTK stopwords corpus, printed for a quick
# visual check.
stops = stopwords.words('english')
print(stops)

In [None]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Built once at cell level instead of once per token: a frozenset gives
# constant-time membership tests, and the tokenizer has no per-call state.
_STOP_WORDS = frozenset(stopwords.words('english'))
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def preprocess(sentence):
    """Return *sentence* reduced to a space-joined string of normalised tokens.

    Steps, in order:
      1. split the text into ``\\w+`` tokens;
      2. drop tokens of one or two characters and English stop words;
      3. stem each remaining token with the Porter stemmer;
      4. lemmatize the stemmed token with the WordNet lemmatizer.

    Args:
        sentence: The text to clean.

    Returns:
        The processed tokens joined by single spaces (an empty string when
        no token survives the filtering).
    """
    tokens = _WORD_TOKENIZER.tokenize(sentence)
    kept_words = [w for w in tokens if len(w) > 2 and w not in _STOP_WORDS]
    # Lemmatization is applied to the already stemmed token.
    return " ".join(lemmatizer.lemmatize(stemmer.stem(w)) for w in kept_words)


df['doc_clean'] = df['doc'].map(preprocess)

Lemmatization aims to convert different inflected forms of a word into a single form to facilitate analysis and comparison. For example, the lemmatized form of "running" is "run", and the lemmatized form of "mice" is "mouse".



Stemming is a simpler process compared to lemmatization. It involves removing suffixes from words to achieve the root form. Stemming can be less accurate than lemmatization but is often faster and sufficient for certain applications.

In [None]:
# Compare the merged document with its cleaned version on ten random rows.
df[["doc", 'doc_clean']].sample(10)

## TF-IDF : Term Frequency - Inverse Document Frequency

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vocabulary on the cleaned documents; X is a SciPy sparse matrix
# with one row per document and one column per term.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["doc_clean"])
column_names = vectorizer.get_feature_names_out()

# Wrap the matrix in a DataFrame *without* densifying it: X.toarray() would
# allocate a full documents x vocabulary float array, almost all of it zeros.
df_tf_idf = pd.DataFrame.sparse.from_spmatrix(X, columns=column_names)
df_tf_idf.shape

## Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the pairwise similarities straight from the sparse TF-IDF matrix X
# so the product can use sparse arithmetic instead of a dense copy of the
# features. dense_output=True returns a dense documents x documents array,
# which is wrapped in a DataFrame whose row and column labels are the
# positional document indices.
df_cos_sim = pd.DataFrame(cosine_similarity(X, dense_output=True))
df_cos_sim.shape

### Converting Cosine Similarity Dataframe to Top-K Items

In [None]:
import warnings

# Hide only pandas' PerformanceWarning (raised, for example, when many
# columns are inserted into a DataFrame one at a time). A blanket
# filterwarnings('ignore') would also swallow deprecation and runtime
# warnings from every other library.
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

In [None]:
from tqdm.notebook import tqdm

# Number of neighbours kept per movie.
K = 10

movie_indices = df_cos_sim.columns

# Collected as {movie index: [[neighbour index, similarity], ...]} and turned
# into a DataFrame once at the end; inserting one column per movie into an
# existing frame fragments it and slows every later insertion.
top_k_records = {}

for col in tqdm(movie_indices):
    # Take K + 1 candidates because the movie itself is normally among its
    # own most similar entries and gets filtered out below. nlargest avoids
    # sorting the whole column.
    candidates = df_cos_sim[col].nlargest(K + 1)

    record = [
        [int(index), float(score)]
        for index, score in candidates.items()
        if index != col
    ]
    # If the movie was not among the candidates, K + 1 entries remain; trim.
    top_k_records[col] = record[:K]

# One column per movie, one row per neighbour rank; every cell is an
# [index, similarity] pair.
df_top_k = pd.DataFrame(top_k_records)

df_top_k.shape

In [None]:
# Swap rows and columns of the top-k frame
df_top_k = df_top_k.transpose()

# Inspect ten random rows of the transposed frame
df_top_k.sample(10)

In [None]:
# saving similarity top-k dataframe
from pathlib import Path

output_path = Path("../data/movie_top_k_t.parquet")
# Create the target directory if needed so the write cannot fail on a missing
# folder after the expensive similarity computation.
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): if the column labels are integers at this point, some pandas
# versions may refuse to write non-string column names to parquet - confirm
# on the target environment.
df_top_k.to_parquet(output_path)

## Testing

In [None]:
# Limit displayed cell contents to 50 characters so long text columns do not
# flood the output of the test cells below.
pd.set_option('display.max_colwidth', 50)

In [None]:
# Title fragment to look for.
query = 'titanic'

# Use the query variable rather than repeating the text, and match it as a
# plain substring (regex=False) so characters such as "(" or "." in a future
# query are not interpreted as a regular expression.
df[df["Title"].str.contains(query, regex=False)]

In [None]:
movie_index = 13153 # Titanic Movie

# Similarity of every movie to the chosen one, kept as a one-column frame.
similarity_to_movie = df_cos_sim[[movie_index]]

# The sort is ascending, so the last five rows carry the five highest scores.
df_query = similarity_to_movie.sort_values(by=[movie_index]).tail(5)
df_query.shape

In [None]:
# Display the five highest similarity scores for the selected movie.
df_query

In [None]:
# Look up the row whose index label is 6275.
# NOTE(review): presumably one of the indices listed in df_query - confirm
# against the output above.
df[df.index == 6275]