In [1]:
import matplotlib as plt
import numpy as np
import pandas as pd

In [2]:
# Directory that holds the dataset CSV files (note the trailing slash,
# the file names below are appended to it directly).
DATASET_DIR = 'ml-latest-small/'

# Read the four tables in one go; the unpacking order matches the file-name tuple.
movies_df, ratings_df, links_df, tags_df = (
    pd.read_csv(DATASET_DIR + csv_name)
    for csv_name in ('movies.csv', 'ratings.csv', 'links.csv', 'tags.csv')
)

In [3]:
# Print every table's column names (sorted) under each other to spot shared keys

for label, frame in (
    ('links:  ', links_df),
    ('movies: ', movies_df),
    ('ratings:', ratings_df),
    ('tags:\t', tags_df),
):
    print(label, frame.columns.sort_values())

links:   Index(['imdbId', 'movieId', 'tmdbId'], dtype='object')
movies:  Index(['genres', 'movieId', 'title'], dtype='object')
ratings: Index(['movieId', 'rating', 'timestamp', 'userId'], dtype='object')
tags:	 Index(['movieId', 'tag', 'timestamp', 'userId'], dtype='object')


In [4]:
# Inner-join movies with their ratings on movieId, then group the joined rows per movie

grouped_movies = (
    movies_df
    .merge(ratings_df, how='inner', on='movieId')
    .groupby('movieId')
)

In [5]:
def get_movie_id_and_name(query):
    """Find the first movie whose title starts with ``query`` (case-insensitive).

    Parameters
    ----------
    query : str
        Title prefix to look for.

    Returns
    -------
    tuple or None
        ``(movieId, title)`` of the first matching row of ``movies_df`` in
        row order, or ``None`` when no title starts with ``query``.
    """
    titles = movies_df['title']
    # na=False: a missing title simply never matches.
    is_match = (
        titles.str.lower()
        .str.startswith(query.lower(), na=False)
        .to_numpy(dtype=bool)
    )
    if not is_match.any():
        return None
    # argmax on a boolean array is the position of the first True. Both columns
    # are read positionally (.iloc) so the lookup stays correct even when
    # movies_df does not carry the default 0..n-1 index.
    pos = int(is_match.argmax())
    return (movies_df['movieId'].iloc[pos], titles.iloc[pos])

In [6]:
# Look up the first title beginning with 'black swan'; the result is (movieId, title) or None

black_swan = get_movie_id_and_name(query='black swan')
print(black_swan)

(81591, 'Black Swan (2010)')


In [7]:
# Ratings for 'black swan'

# All merged movie/rating rows for the matched movieId. Kept under its own name
# so that black_swan_ratings is only ever the 'rating' Series, never a DataFrame.
black_swan_rows = grouped_movies.get_group(black_swan[0])
print(black_swan_rows.head())

# Just the rating values; describe() is the cell's displayed result.
black_swan_ratings = black_swan_rows['rating']
black_swan_ratings.describe()

       movieId              title          genres  userId  rating   timestamp
95220    81591  Black Swan (2010)  Drama|Thriller      15     1.0  1338698369
95221    81591  Black Swan (2010)  Drama|Thriller      48     4.0  1367475412
95222    81591  Black Swan (2010)  Drama|Thriller      56     5.0  1467003145
95223    81591  Black Swan (2010)  Drama|Thriller      62     4.0  1451713153
95224    81591  Black Swan (2010)  Drama|Thriller      72     3.5  1461784449


count    34.000000
mean      3.779412
std       0.962881
min       1.000000
25%       3.500000
50%       4.000000
75%       4.000000
max       5.000000
Name: rating, dtype: float64

In [8]:
def get_ratings_for(movie_query):
    """Return the merged movie/rating rows for the first title matching ``movie_query``.

    The title lookup is delegated to ``get_movie_id_and_name``; when that
    finds nothing, ``None`` is returned instead of a DataFrame.
    """
    match = get_movie_id_and_name(movie_query)
    if not match:
        return None
    movie_id = match[0]
    return grouped_movies.get_group(movie_id)

# Mean rating over all rating rows of the first 'black swan' title match
get_ratings_for(movie_query='black swan')['rating'].mean()

3.7794117647058822

In [9]:
def search_movies(query):
    """Search ``movies_df`` titles for ``query``, case-insensitively.

    The query is matched as literal text, so characters such as ``*``, ``(``
    or ``.`` have no special meaning.

    Parameters
    ----------
    query : str
        Text to look for anywhere in a title.

    Returns
    -------
    pandas.DataFrame
        Matching rows of ``movies_df``: titles that start with ``query`` come
        first, followed by the other titles that contain it. The result has a
        fresh 0..n-1 index; the rows' previous index labels are kept as a
        column (named ``index`` when the ``movies_df`` index is unnamed).
    """
    needle = query.lower()
    # Lower-case only the title column; the rest of the frame is not needed for matching.
    lowered_titles = movies_df['title'].str.lower()
    # regex=False: treat the query literally instead of as a regular expression.
    # na=False: rows without a title never match.
    contains = movies_df[lowered_titles.str.contains(needle, regex=False, na=False)]
    startswith = movies_df[lowered_titles.str.startswith(needle, na=False)]
    # Prefix matches also appear among the substring matches, so drop the
    # repeats while keeping the prefix-first ordering.
    return pd.concat([startswith, contains]).drop_duplicates().reset_index()

# Example search: titles starting with 'network' are listed before other matches
search_movies(query='network')

Unnamed: 0,index,movieId,title,genres
0,2798,3504,Network (1976),Comedy|Drama
1,7618,80463,"Social Network, The (2010)",Drama


In [10]:
def ratings_sorted_search_results(search_result):
    """Rank search results by mean rating, best first.

    Parameters
    ----------
    search_result : pandas.DataFrame
        Rows to rank; must contain a ``movieId`` column (for example the
        output of ``search_movies``).

    Returns
    -------
    pandas.DataFrame
        The columns of ``search_result`` plus ``mean_rating``, sorted by
        ``mean_rating`` in descending order with a fresh 0..n-1 index. Movies
        that have no rows in ``grouped_movies`` get ``NaN`` and sort last.
    """
    # One vectorised pass: mean rating per group, as a Series indexed by movieId.
    mean_by_movie = grouped_movies['rating'].mean()

    # map() looks each movieId up in that Series; ids without ratings become
    # NaN instead of raising KeyError the way a per-movie get_group() would.
    with_means = search_result.assign(
        mean_rating=search_result['movieId'].map(mean_by_movie)
    )

    # sort_values places NaN last by default, so unrated movies end up at the bottom.
    return with_means.sort_values(by='mean_rating', ascending=False).reset_index(drop=True)

In [11]:
# Rank every 'rambo' title match by its mean rating
rambo_movies = search_movies(query='rambo')
ratings_sorted_search_results(search_result=rambo_movies)

Unnamed: 0,index,movieId,title,genres,mean_rating
0,6880,57528,Rambo (Rambo 4) (2008),Action|Drama|Thriller|War,3.583333
1,6938,59141,Son of Rambow (2007),Children|Comedy|Drama,3.5
2,1913,2403,First Blood (Rambo: First Blood) (1982),Action|Adventure|Drama|Thriller,3.472222
3,1912,2402,Rambo: First Blood Part II (1985),Action|Adventure|Thriller,2.535714
4,1914,2404,Rambo III (1988),Action|Adventure|Thriller|War,2.230769
