## Importing the libraries

In [15]:
import json
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from scipy.sparse import csr_matrix, save_npz
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Importing the datasets

In [16]:
# Read the two raw CSV exports (movie metadata first, credits second).
movie_dataset, credits_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        'dataset/tmdb_5000_movies.csv',
        'dataset/tmdb_5000_credits.csv',
    )
)

## Merging the datasets

In [17]:
# Join on the movie id rather than the title: titles are not unique, and a
# title-keyed merge pairs every film with the credits of every other film
# that shares its title, producing duplicated / mismatched rows.
# NOTE(review): assumes the credits CSV exposes the id as `movie_id` and
# repeats the `title` column (standard TMDB 5000 layout) -- confirm against
# the CSV header.
full_dataset = pd.merge(
    movie_dataset,
    credits_dataset.drop(columns="title").rename(columns={"movie_id": "id"}),
    on="id",
)

## Keeping only relevant columns

In [18]:
# Columns that feed the recommender; everything else in the merge is dropped.
relevant_columns = [
    "genres",
    "id",
    "keywords",
    "overview",
    "title",
    "cast",
    "crew",
]
dataset = full_dataset[relevant_columns]

## Removing missing data

In [19]:
# Drop rows with any missing value, then renumber the index from 0.
# The vector matrix built later is positional, while the saved title/id frame
# carries this index; without the reset, the gaps left by dropna() would make
# index labels disagree with matrix row numbers.
dataset = dataset.dropna().reset_index(drop=True)

## Refining the columns

In [20]:
def refine_genres_keywords(genres):
    """Flatten a JSON list of ``{"name": ...}`` objects into one tag string.

    Each name has its internal spaces removed so a multi-word name becomes a
    single token; the tokens are joined with single spaces and lower-cased.

    Args:
        genres: JSON-encoded list of objects that each carry a ``"name"`` key.

    Returns:
        Space-separated, lower-case string of the collapsed names
        (empty string for an empty list).
    """
    names = (entry["name"].replace(" ", "") for entry in json.loads(genres))
    return " ".join(names).lower()

# Genres and keywords are both flattened with the same helper.
for column in ("genres", "keywords"):
    dataset[column] = dataset[column].apply(refine_genres_keywords)

def refine_cast(args, max_members=3):
    """Turn a JSON cast list into a tag string of its leading members.

    Only the first ``max_members`` entries (in the order they appear in the
    JSON) are kept. Each name has its spaces removed so it forms one token;
    tokens are joined with single spaces and lower-cased.

    Args:
        args: JSON-encoded list of objects that each carry a ``"name"`` key.
        max_members: Non-negative number of leading entries to keep, or
            ``None`` to keep every entry. Defaults to 3.

    Returns:
        Space-separated, lower-case string of the collapsed names
        (empty string for an empty list).
    """
    leading_members = json.loads(args)[:max_members]
    return " ".join(
        member["name"].replace(" ", "") for member in leading_members
    ).lower()

# Replace the raw cast JSON with its flattened tag string.
dataset["cast"] = dataset["cast"].map(refine_cast)

def refine_crew(crews):
    """Extract the first director from a JSON crew list as a single token.

    Args:
        crews: JSON-encoded list of objects that each carry ``"job"`` and
            ``"name"`` keys.

    Returns:
        The name of the first entry whose job is exactly ``"Director"``, with
        spaces removed and lower-cased, or ``None`` when no entry matches.
    """
    directors = (
        member["name"].replace(" ", "").lower()
        for member in json.loads(crews)
        if member["job"] == "Director"
    )
    # Lazy generator: scanning stops at the first director found.
    return next(directors, None)

# Keep only the director token; rows with no director get None here.
dataset["crew"] = dataset["crew"].map(refine_crew)

## Cleaning overview column

In [21]:
# The NLTK stopword corpus is installed separately from the package itself;
# on a fresh environment the lookup raises LookupError, so fetch it once and
# retry instead of failing the whole run.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    english_stopwords = stopwords.words('english')

stop_words = set(english_stopwords)
# Negation words are kept in the text (removed from the stopword set).
negations = {"not", "no", "nor", "n't", "never", "none"}
stop_words = stop_words - negations
ps = PorterStemmer()

def refine_overview(overview):
    """Normalise an overview text into a string of stemmed tokens.

    Every character that is not an ASCII letter or digit is replaced by a
    space, the text is lower-cased and split on whitespace, tokens found in
    the module-level ``stop_words`` set are dropped, and the remaining tokens
    are stemmed with the module-level ``ps`` stemmer.

    Args:
        overview: Raw overview text.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    tokens = re.sub("[^a-zA-Z0-9]", " ", overview).lower().split()
    stems = (ps.stem(token) for token in tokens if token not in stop_words)
    return " ".join(stems)

# Replace each raw overview with its cleaned, stemmed form.
dataset["overview"] = dataset["overview"].map(refine_overview)

## Creating new column for NLP

In [22]:
# Rows with no director have None in "crew"; turn missing values into empty
# strings so the string concatenation below does not produce NaN.
dataset = dataset.fillna("")

# Join the text fields with a space between them. Concatenating them directly
# would fuse the last token of one field with the first token of the next
# (e.g. the final overview word glued to the first genre), creating tokens
# that match nothing else in the corpus.
dataset["tags"] = (
    dataset["overview"]
    + " " + dataset["genres"]
    + " " + dataset["keywords"]
    + " " + dataset["crew"]
    + " " + dataset["cast"]
)
dataset = dataset[["title", "id", "tags"]]

## Vectorizing the tags column

In [23]:
# Token-count matrix over the tags, with the vocabulary capped at 5000 terms.
tag_corpus = dataset["tags"]
cv = CountVectorizer(max_features=5000)
vector = cv.fit_transform(raw_documents=tag_corpus)

## Saving only the needed data

In [24]:
# Persist only what is needed downstream: the title/id lookup table and the
# sparse count matrix.
os.makedirs("./app/artifacts", exist_ok=True)

# Use a context manager so the pickle file is flushed and closed
# deterministically instead of leaking the handle returned by open().
with open('./app/artifacts/movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(dataset[["title", "id"]], movie_list_file)

save_npz('./app/artifacts/vector_matrix.npz', csr_matrix(vector))