In [None]:
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the credits and movie metadata tables from the working directory.
credits = pd.read_csv("tmdb_5000_credits.csv")
movies_df = pd.read_csv("tmdb_5000_movies.csv")

# Rename the credits key so both tables share an 'id' column, then join on it.
# The rename is not done in place and no longer passes index=str: that mapper
# only turned the row labels into strings, which the column-based merge ignores.
credits = credits.rename(columns={"movie_id": "id"})
movies_df = movies_df.merge(credits, on='id')

# These columns arrive from the CSV as strings; literal_eval turns each cell
# back into the Python object it represents (the code below iterates over them
# and reads 'name' / 'job' keys, so they are presumably lists of dicts).
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    movies_df[feature] = movies_df[feature].apply(literal_eval)

# Get director's name
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        x: A list of crew dicts, each expected to carry 'job' and 'name' keys.

    Returns:
        The director's name, or NaN (a float) when ``x`` is not a list or
        contains no entry with job 'Director'.
    """
    # Anything that is not a parsed crew list is treated as "no director".
    if not isinstance(x, list):
        return float('nan')
    for member in x:
        # .get() tolerates crew entries that lack a 'job' key.
        if member.get('job') == 'Director':
            return member['name']
    # float('nan') is used instead of np.nan because numpy is not imported here;
    # the original raised NameError on every movie without a director.
    return float('nan')

# Returns the first 3 elements of the list, or the entire list if it has 3 or fewer.
def get_list(x):
    """Collect the 'name' value of each entry in ``x``, keeping at most three.

    Args:
        x: A list of dicts that each have a 'name' key.

    Returns:
        A list with the first three names (or fewer if ``x`` is shorter);
        an empty list when ``x`` is not a list.
    """
    if not isinstance(x, list):
        return []
    # Build the full list of names first, then truncate to the leading three.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

# Derive the 'director' column from 'crew' and reduce cast, keywords and
# genres to their leading names, all in a single assignment.
features = ['cast', 'keywords', 'genres']
movies_df = movies_df.assign(
    director=movies_df['crew'].apply(get_director),
    **{name: movies_df[name].apply(get_list) for name in features},
)

# Clean and create a metadata soup
def clean_data(x):
    """Lower-case ``x`` and strip out its spaces.

    Args:
        x: A list of strings, a single string, or any other value.

    Returns:
        For a list, a new list with every element space-stripped and
        lower-cased; for a string, the space-stripped lower-cased string;
        for anything else, an empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Non-text values (for example a missing director) become an empty string.
    return ''

def create_soup(x):
    """Build the space-separated metadata string for one movie.

    Args:
        x: A mapping (such as a DataFrame row) with list-of-string values
            under 'keywords', 'cast' and 'genres', and a string under
            'director'.

    Returns:
        The keywords, cast, director and genres joined by single spaces,
        in that order.
    """
    sections = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(sections)

# Normalise the text features (lower-case, spaces removed) in one assignment.
movies_df = movies_df.assign(
    **{
        name: movies_df[name].apply(clean_data)
        for name in ['cast', 'keywords', 'director', 'genres']
    }
)

# One metadata string ("soup") per movie, built row by row.
movies_df = movies_df.assign(soup=movies_df.apply(create_soup, axis=1))

# Turn the soups into token-count vectors, ignoring English stop words.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies_df['soup'])

# Pairwise cosine similarity between every pair of movies.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Reset the index so row labels match positions in cosine_sim, then build
# the title -> row label lookup used by the recommender.
movies_df = movies_df.reset_index()
indices = pd.Series(movies_df.index, index=movies_df['original_title'])

# Recommendation function
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to ``title``.

    Args:
        title: A value present in the index of ``indices`` (built from the
            'original_title' column).
        cosine_sim: Square similarity matrix whose row/column ``i`` refers to
            row ``i`` of ``movies_df``. Defaults to the matrix computed above.

    Returns:
        A Series of 'original_title' values for the highest-scoring movies,
        most similar first, never including the queried movie itself.

    Raises:
        KeyError: If ``title`` is not in ``indices``.
    """
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # use the first match so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried movie by index rather than assuming it sorts first:
    # a movie with an identical soup ties with it, and a movie with an empty
    # soup scores 0 against itself.
    scored = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [i for i, _ in scored[:10]]
    return movies_df['original_title'].iloc[movie_indices]

# Smoke-test the recommender on a single title.
title = 'Tangled'
recommendations = get_recommendations(title)
print("Recommended Movies for {}:".format(title))
print(recommendations)


In [None]:
# Testing the improved system on a second title.
# The query and the printed header share one variable; previously the header
# named 'The Dark Knight Rises' while the results were still for 'Tangled'.
title = 'The Dark Knight Rises'
recommendations = get_recommendations(title)
print("Recommended Movies for {}:".format(title))
print(recommendations)

In [None]:
import pickle

# Save the DataFrame with the 'soup' column.
# The with-blocks close (and flush) each file as soon as its dump finishes,
# instead of leaving the handles open.
with open('improved_movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(movies_df, movie_list_file)
# Save the new cosine similarity matrix
with open('improved_similarity.pkl', 'wb') as similarity_file:
    pickle.dump(cosine_sim, similarity_file)