In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# one per line, so the available dataset files show up in the cell output.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 

# Load the two TMDB 5000 CSV files from the Kaggle input directory:
# df1 is read from the credits file, df2 from the movies file.
credits_csv = '../input/tmdb-movie-metadata/tmdb_5000_credits.csv'
movies_csv = '../input/tmdb-movie-metadata/tmdb_5000_movies.csv'

df1 = pd.read_csv(credits_csv)
df2 = pd.read_csv(movies_csv)

In [None]:
# Rename the movies table's 'id' column to 'movie_id', the key name the later
# merge with the credits table joins on. Reassigning the result (instead of
# inplace=True) avoids mutating the loaded frame behind the reader's back and
# keeps this cell safe to re-run.
df2 = df2.rename(columns={'id': 'movie_id'})

# Show the column labels, then preview the first rows as the cell output.
print(df2.columns)
df2.head()

In [None]:
# Inspect the credits table: list its column labels, then display the first
# five rows (the default for head()) as the cell output.
print(df1.columns)
df1.head(n=5)

In [None]:
# Join the movies table (left) with the credits table (right) on the shared
# 'movie_id' key. No `how` is given, so pandas performs its default inner join.
movies = pd.merge(df2, df1, on='movie_id')

# Preview the first five rows of the combined table.
movies.head(n=5)

# Let's try the simplest recommendation algorithm first; **Demographic Filtering**

* Demographic Filtering (DF) technique uses the demographic data of a user to determine which items may be appropriate for recommendation. 
* This requires a metric by which movies can be rated.
* After sorting based on the metric, we recommend the top movies to the user.
* A commonly used metric for this purpose is the weighted rating (WR):

![](https://image.ibb.co/jYWZp9/wr.png)

$$WR = \frac{v}{v+m}\,R + \frac{m}{v+m}\,C$$

Here, 
* v is the number of votes for the movie;
* m is the minimum votes required to be listed in the chart;
* R is the average rating of the movie; And
* C is the mean vote across the whole report

v(**vote_count**) and R (**vote_average**) are listed in our dataset.

C is calculated by finding the mean of the average ratings of all movies.

We also pick an **m** value. This can be set as a quantile of the **vote_count** data. We pick the 0.8 quantile, so a movie needs more votes than at least 80% of the movies in the dataset to qualify.



In [None]:
# Pull out the two columns the weighted-rating formula depends on.
vote_averages = movies['vote_average']
vote_counts = movies['vote_count']

# C: mean of the per-movie average ratings across the whole table.
C = vote_averages.mean()
# m: vote-count threshold, taken as the 0.8 quantile of vote_count.
m = vote_counts.quantile(q=0.8)

print('C = ', C)
print('m = ', m)

In [None]:
# Keep only the movies whose vote count exceeds the threshold m.
# The explicit .copy() makes `data` an independent DataFrame, so adding the
# 'score' column to it later does not trigger pandas' SettingWithCopyWarning.
# NOTE(review): the strict '>' drops movies with exactly m votes, while the text
# describes m as the minimum votes required — confirm whether '>=' is intended.
data = movies[movies['vote_count'] > m].copy()


### We create a **score** to evaluate the movies using the WR, then sort the dataframe based on the score. Then pick the top 10 movies to feature.

In [None]:
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating for a movie record.

    Parameters
    ----------
    x : mapping-like
        Must support lookup of 'vote_count' (v) and 'vote_average' (R).
    m : number
        Vote-count threshold; defaults to the notebook-level ``m`` as it was
        when this function was defined.
    C : number
        Mean rating; defaults to the notebook-level ``C`` as it was when this
        function was defined.

    Returns
    -------
    v/(v+m) * R + m/(v+m) * C
    """
    votes = x['vote_count']
    rating = x['vote_average']
    # Both terms share the same denominator: the vote count plus the threshold.
    total = votes + m
    return (votes / total) * rating + (m / total) * C

# Score every movie in `data` with the weighted rating and rank best-first.
# assign() returns a new DataFrame instead of writing a column into `data`
# (which was produced by filtering `movies`), so pandas does not emit a
# SettingWithCopyWarning, and re-running this cell yields the same result.
data = (
    data
    .assign(score=data.apply(weighted_rating, axis=1))
    .sort_values('score', ascending=False)
)

In [None]:
# Display the first ten rows of the score-sorted table, limited to the title
# and the fields the score is computed from.
top_columns = ['original_title', 'vote_count', 'vote_average', 'score']
data.loc[:, top_columns].head(n=10)