# Exploratory Data Analysis

In [None]:
import pandas as pd
# Load the movie metadata table and preview its first five rows
movies = pd.read_csv("../input/the-movies-dataset/movies_metadata.csv")
movies.head()

In [None]:
# Print a structural summary (columns, non-null counts, dtypes) of the movie metadata
movies.info()

In [None]:
# List the distinct column labels of the movie metadata table
movies.columns.unique()

In [None]:
# Count the missing values in every column of the movie metadata
movies.isna().sum(axis=0)

In [None]:
# Load the remaining tables of the dataset.
# Fix: `keywords` used to be read from credits.csv, which does not match its name.
# NOTE(review): assumes keywords.csv sits next to the other input files - confirm.
keywords = pd.read_csv("../input/the-movies-dataset/keywords.csv")
links = pd.read_csv("../input/the-movies-dataset/links.csv")
links_small = pd.read_csv("../input/the-movies-dataset/links_small.csv")
ratings = pd.read_csv("../input/the-movies-dataset/ratings.csv")
ratings_small = pd.read_csv("../input/the-movies-dataset/ratings_small.csv")

In [None]:
# Preview the last 10 rows. A negative n would instead return every row except
# the first 10, i.e. almost the whole table.
keywords.tail(10)

In [None]:
# Preview the first five rows of the full ratings table
ratings.head(5)

In [None]:
# Print a structural summary of the full ratings table
ratings.info()

In [None]:
# Preview the first five rows of the reduced ratings table
ratings_small.head(5)

In [None]:
# Keep only the movie id and rating columns of the reduced ratings table
ratings_by_movie = ratings_small.loc[:, ["movieId", "rating"]]

In [None]:
# Preview the first five rows of the movie id / rating pairs
ratings_by_movie.head(5)

In [None]:
# Order the movie id / rating pairs from the lowest to the highest rating
ratings_by_movie.sort_values(by="rating")

In [None]:
# Keep only the genres and revenue columns of the movie metadata and preview them
revenue = movies.loc[:, ["genres", "revenue"]]
revenue.head(5)

In [None]:
# Plot only the distribution of ratings: movieId looks like an identifier, so a
# histogram of it would not be informative. The trailing ';' suppresses the
# textual repr of the returned axes.
ratings_by_movie.hist(column="rating");

# Simple Recommender

**We now define a new metric to use instead of the raw rating, which is often misleading because it does not take the popularity of a movie into account. For example, a movie rated 9 by only 10 voters would be considered 'better' than a movie rated 4.7+ by 10,000 voters. For this reason, we calculate a metric called 'weighted rating', which takes the number of votes into account.**

In [None]:
# C: mean of the vote_average column over all movies
C = movies['vote_average'].mean()
print(C)

In [None]:
# m: minimum number of votes required to be in the chart,
# taken as the 75th percentile of the vote_count column
m = movies['vote_count'].quantile(q=0.75)
print(m)

In [None]:
# Select the qualified movies (vote_count of at least m) into a new DataFrame.
# Filter first and copy afterwards: only the selected rows are duplicated, and
# the result is an independent frame that can safely receive new columns.
q_movies = movies.loc[movies['vote_count'] >= m].copy()
q_movies.shape

In [None]:
# Shape (rows, columns) of the unfiltered movie metadata table
movies.shape

In [None]:
# Weighted rating of a movie (IMDB formula)
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating computed from the votes stored in ``x``.

    Args:
        x: Object indexable by 'vote_count' and 'vote_average'.
        m: Vote-count weight given to ``C``; the default is the value of the
            global ``m`` at the time this function is defined.
        C: Rating blended in with weight ``m``; the default is the value of
            the global ``C`` at the time this function is defined.

    Returns:
        votes / (votes + m) * average + m / (votes + m) * C
    """
    votes = x['vote_count']
    average = x['vote_average']
    # Both weights share the same denominator and add up to 1.
    total = votes + m
    return (votes / total) * average + (m / total) * C

In [None]:
# Define a new feature 'score' holding the weighted rating of every qualified movie.
# `weighted_rating()` only performs column lookups and arithmetic, so it is called
# once on the whole frame instead of once per row through apply(axis=1).
q_movies['score'] = weighted_rating(q_movies)

In [None]:
# Rank the qualified movies from the highest to the lowest weighted score
q_movies = q_movies.sort_values('score', ascending=False)

# Display the 20 highest-scoring movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

**Content-based approaches that use genres and keywords could also be tried and might produce better recommendations. In that case, we can use a similarity function such as cosine similarity.**