In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from datetime import datetime
from dateutil.parser import parse
import ast

# Movie Recommendation Project - Through Correlations With Liked Films And The Year Watching

In this project we will try to predict which movie you should watch next. What makes our algorithm more interesting is that it takes into consideration the year in which you are watching the movie, since the popularity of movie genres varies from year to year.

_Because of low-quality data on old movies, we will **not** ignore movies that were released after the year of watching parameter._

### Project outline

- Analyzing IMDb's movies dataset to determine which movie genres are popular based on the year
- Analyzing MovieLens user rating dataset
- (TODO: finish this)

At first, as a POC we will try to guess if a movie is recommended or not with supervised ML based on the genre, runtime, revenue, budget and rating.

Later in the project we will use the model we built, together with a user's previous ratings, to predict the next movie that user should watch.

Some interesting analyses we are hoping to come across while working on this:

- Genre popularity based on the year: what was the major event in that year that made the popularity of a genre rise?
- What is the perfect runtime? is there such a thing?
- Is the budget affecting the likelihood of movie success?
- If a movie is plausible, what year is the best year to release it?
- What was the most profitable year to release a movie?


# Let's load our data

#### IMDb's dataset contains the following features:

- tid - IMDb movie id
- IMDb Rating + Rating Count - Average rating the movie received + the count of votes received
- Year - Year of release
- One-hot encoding of the genres of each film

#### MovieLens's dataset has the following features:

- adult - Is it an adult film
- budget - How much money was invested
- genres - Genres the movie classified to
- imdb_id - IMDb's ID
- overview - Description of the movie
- popularity - A numeric quantity specifying the movie popularity
- production_companies - The production house of the movie.
- production_countries - The country in which it was produced.
- release_date - The date of the movie release
- revenue - How much revenue the movie generated worldwide
- runtime - How long is the movie in minutes
- tagline - Movie's tagline
- title - Title of the movie
- vote_average - average rating the movie received.
- vote_count - the count of votes received.

We also have a dataset of each user and his rating on MovieLens

#### Just a peek at our data

In [None]:
# Download the three source tables from their hosted CSV files.
# `on_bad_lines='skip'` drops malformed IMDb rows without raising or warning.
# It replaces the `error_bad_lines=False, warn_bad_lines=False` pair, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
imdb_df = pd.read_csv('https://www.dropbox.com/s/e6qbgjyrlseh2is/imdb.csv?dl=1', on_bad_lines='skip')
meta_mov_df = pd.read_csv('https://www.dropbox.com/s/j9vxjw3g1s7wqsg/movies_metadata.csv?dl=1')
rating_df = pd.read_csv('https://www.dropbox.com/s/tizyp5zreilielv/ratings.csv?dl=1')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Preview the first five IMDb rows to see which columns were loaded.
imdb_df.head(n=5)

## Cleaning IMDb dataset

Taking a look at anomalies: years with a low number of movies.
Finding the longest sequence of years in which every year has at least 10 movies.

In [None]:
# Minimum number of movies a release year needs in order to be kept.
min_movies_per_year = 10

# Number of movies per release year, in chronological order.
movies_per_year = (
    imdb_df.groupby('year').size()
    .reset_index(name='count')
    .sort_values('year')
    .reset_index(drop=True)
)

# A sparse year (fewer than `min_movies_per_year` movies) ends the current run
# of usable years, so the running total of sparse years gives each run an id.
is_sparse = movies_per_year['count'] < min_movies_per_year
run_id = is_sparse.cumsum().rename('run')

# Summarise each run from its well-populated years only. The sparse year that
# opens a run must not count towards the run's length or become its lower bound.
# NOTE: years that are completely absent from the data do not break a run.
groupby_year = (
    movies_per_year.loc[~is_sparse, 'year']
    .groupby(run_id[~is_sparse])
    .agg(['count', 'min', 'max'])
)
if groupby_year.empty:
    raise ValueError('no year has at least {} movies'.format(min_movies_per_year))

# Keep the longest run. `idxmax` returns the first maximum, so a tie between
# several runs resolves to the earliest one instead of failing in `int(...)`.
years_limit = groupby_year.loc[[groupby_year['count'].idxmax()]]
first_year = int(years_limit['min'].iloc[0])
last_year = int(years_limit['max'].iloc[0])

print('our relevant years are bewteen {} to {}'.format(first_year, last_year))

# Both bounds are valid years, so the filter is inclusive and matches the
# range printed above.
imdb_df = imdb_df[imdb_df['year'].between(first_year, last_year)]

f, axes = plt.subplots(figsize=(20, 7))

# Histogram of the number of movies per remaining year
sns.distplot(imdb_df['year'], ax=axes, kde=False)
plt.show()

## Cleaning MovieLens dataset

In [None]:
# Work on a reproducible random subsample holding 1/30 of the user ratings.
sub_sample_size = len(rating_df) // 30
rating_df_sample = rating_df.sample(
    n=sub_sample_size,
    random_state=42,
)

#Data cleaning process: removing duplicate rows and rows with bad revenue values

In [None]:
# Movies metadata: report how many rows share an `id`, then remove them.
deduplicated_df = meta_mov_df.drop_duplicates(['id'])
duplications = len(meta_mov_df) - len(deduplicated_df)
print("<<<=== There are %s unique movies in our Data and (%s) Duplications ===>>>" % (len(deduplicated_df), duplications))
print("\n")

# Keep one row per movie id and drop rows that have no revenue value.
# The result is assigned back: `drop_duplicates` returns a new frame, so
# calling it without using the result leaves the duplicates in place.
meta_mov_df = deduplicated_df.dropna(subset=['revenue'])

## Let's start to inspect our data

In [None]:
# Derive the rating date and year from the rating timestamp.
# The conversion is vectorised instead of formatting and re-parsing a string
# for every row. It assumes `timestamp` holds Unix epoch seconds, as the
# previous `datetime.fromtimestamp` call did. Timestamps are read as UTC, so
# the result no longer depends on the timezone of the machine running this.
rated_at = pd.to_datetime(rating_df_sample['timestamp'], unit='s')
rating_df_sample['date'] = rated_at.dt.strftime('%Y-%m-%d')
rating_df_sample['rate_year'] = rated_at.dt.year

# Distribution of the years in which ratings were given
print('<<<=== lets check out the movies rating years ===>>>')
sns.distplot(rating_df_sample['rate_year'],rug=True, rug_kws={"color": "g"},
                  kde_kws={"color": "k", "lw": 3, "label": "KDE"},
                  hist_kws={"histtype": "step", "linewidth": 3,
                            "alpha": 1, "color": "g"})

In [None]:
# Average rating per movie over the sampled ratings, shown as a density curve.
ratings_by_movie = rating_df_sample.groupby(by=['movieId'])['rating']
ratings_mean = ratings_by_movie.mean()
sns.kdeplot(ratings_mean, shade=True)

In [None]:
# Turn `popularity` into numbers (values that cannot be parsed become NaN),
# store it back on the metadata frame and plot its density.
popularity_numeric = pd.to_numeric(meta_mov_df['popularity'], errors='coerce')
meta_mov_df['popularity'] = popularity_numeric
sns.kdeplot(popularity_numeric, shade=True)

# Now let's set our metric to score the movies' ratings

First of all, we can look at the vote_average feature, which represents the average rating that the movie received.
On its own it is not comparable between movies because vote_count differs: an average of e.g. 6.6 from 10k voters does not mean the same as a 6.6 from just 10 voters. So normalization is needed, as well as getting the ratings themselves.

In [None]:
# Number of votes against average vote for every movie in the metadata.
vote_columns = meta_mov_df[['vote_count', 'vote_average']]
sns.scatterplot(x='vote_average', y='vote_count', data=vote_columns)

Another good check would be to compare the movies' budgets to their revenues,
as we assume that if the revenue is higher than the budget (revenue/budget > 1), the rating of such a movie would be higher.



In [None]:
def calc_rev_rate(row):
    """Return the revenue-to-budget ratio of one movie row.

    `row` must provide 'revenue' and 'budget' entries that `float()` accepts.
    When the budget is not strictly positive the ratio is undefined and the
    neutral value 1 is returned; the revenue is not read in that case.
    """
    budget = float(row['budget'])
    if budget > 0:
        return float(row['revenue']) / budget
    return 1

# Revenue/budget ratio per movie (calc_rev_rate yields 1 when the budget is
# not positive).
meta_mov_df['revenue_rate'] = meta_mov_df.apply(calc_rev_rate, axis=1)

# Three equal-width bins covering ratios from 0 to 2; movies whose ratio is
# above 2 fall outside the bins and are not drawn.
fig, ax = plt.subplots()
ax.hist(meta_mov_df['revenue_rate'], bins=[0, 2/3, 4/3, 2])
ax.set_title('Movies by revenue/budget ratio')
ax.set_xlabel('ratio revenue/budget')
ax.set_ylabel('number of movies')

plt.show()

### Normalizing our ratings

[based on IMDb's formula](https://help.imdb.com/article/imdb/track-movies-tv/ratings-faq/G67Y87TFYYP6TWAV#)

How do you calculate the rank of movies and TV shows on the Top Rated Movies and Top Rated TV Show lists?
The following formula is used to calculate the Top Rated 250 titles. This formula provides a true 'Bayesian estimate', which takes into account the number of votes each title has received, minimum votes required to be on the list, and the mean vote for all titles:

`weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C`

Where:

R = average for the movie (mean) = (rating)

v = number of votes for the movie = (votes)

m = minimum votes required to be listed in the Top Rated list (currently 25,000)

C = the mean vote across the whole report

Please be aware that the Top Rated Movies Chart only includes theatrical features: shorts, TV movies, miniseries and documentaries are not included in the Top Rated Movies Chart. The Top Rated TV Shows Chart includes TV Series, but not TV episodes or Movies.

In [None]:
def calculate_weighted_rate(row, m=None, c=None):
    """Return the IMDb-style weighted rating (Bayesian estimate) of one movie.

    WR = v / (v + m) * R + m / (v + m) * C, where v is the movie's vote count
    and R its average vote.

    Parameters
    ----------
    row : mapping with 'vote_count' and 'vote_average' entries.
    m : minimum-votes threshold. Defaults to the 0.9 quantile of
        `meta_mov_df['vote_count']`.
    c : mean vote across all movies. Defaults to the mean of
        `meta_mov_df['vote_average']`.

    Pass `m` and `c` explicitly when rating many rows (for example through
    `DataFrame.apply(..., args=(m, c))`): the defaults are recomputed from the
    whole dataset on every call.
    """
    if m is None:
        m = meta_mov_df['vote_count'].quantile(0.9)
    if c is None:
        c = meta_mov_df['vote_average'].mean()
    v = row['vote_count']
    r = row['vote_average']
    total_votes = v + m
    # No votes and no threshold: there is nothing to weigh, so fall back to
    # the global mean instead of dividing by zero.
    if total_votes == 0:
        return c
    return (v / total_votes) * r + (m / total_votes) * c

# IMDb weighted rating for every movie: WR = v/(v+m) * R + m/(v+m) * C.
# m (0.9 quantile of the vote counts) and C (mean of the average votes) are
# the same for every movie, so they are computed once here and the formula is
# applied to whole columns instead of being re-derived row by row.
min_votes = meta_mov_df['vote_count'].quantile(0.9)
mean_vote = meta_mov_df['vote_average'].mean()
vote_counts = meta_mov_df['vote_count']
vote_averages = meta_mov_df['vote_average']

meta_mov_df['Weighted_Rate'] = (
    (vote_counts / (vote_counts + min_votes)) * vote_averages
    + (min_votes / (vote_counts + min_votes)) * mean_vote
)
print(meta_mov_df['Weighted_Rate'])

# Let's create our new film dataset to try to predict recommended movies


First start with parsing the genre column to one-hot encoding for the genre categories.

As a POC, let's consider a recommended movie a movie with WR higher than the mean WR in the dataset.

In [None]:
# Columns used for the recommendation dataset. The frame is bound to
# `movie_gen_df`, the name every later cell reads, and copied so that the
# genre columns added afterwards are written to an independent frame rather
# than to a slice of `meta_mov_df`.
movie_gen_df = meta_mov_df[['id','imdb_id','title','budget','revenue','genres','runtime','release_date','vote_average',
                            'vote_count','Weighted_Rate','popularity']].copy()

def get_values_from_genre_json(row, genre):
    """One-hot flag: 1 if `genre` is one of the row's genres, otherwise 0.

    `row['genres']` is expected to be the string form of a list of dicts that
    each carry a 'name' key; it is parsed with `ast.literal_eval`.
    """
    genre_names = [entry['name'] for entry in ast.literal_eval(row['genres'])]
    return 1 if genre in genre_names else 0


# Collect every genre name that appears in the dataset. The column is
# iterated directly because `Series.iteritems()` no longer exists in pandas 2.
genre_names = set()
for raw_genres in movie_gen_df['genres']:
    genre_names.update(item['name'] for item in ast.literal_eval(raw_genres))

# Sorted so the one-hot columns are created in the same order on every run
# (the iteration order of a set of strings can change between sessions).
genres_list = sorted(genre_names)

# One 0/1 column per genre
for genre in genres_list:
    movie_gen_df[genre] = movie_gen_df.apply(lambda x: get_values_from_genre_json(x, genre), axis=1)


### More Data cleaning

In [None]:
# Drop the identifier/text columns that are not used as features.
movie_gen_df = movie_gen_df.drop(columns=['genres', 'id', 'imdb_id', 'title'])

# Remove the row whose `popularity` holds the text 'Beware Of Frost Bites'.
# `.copy()` makes the filtered frame independent, so the column added below
# is not written to a view of the unfiltered frame.
movie_gen_df = movie_gen_df[movie_gen_df['popularity'] != 'Beware Of Frost Bites'].copy()

# A movie is labelled recommended when its weighted rating is at least the
# mean weighted rating of the dataset. Labels stay the strings 'True'/'False'.
wr_mean = movie_gen_df['Weighted_Rate'].mean()

movie_gen_df['recomnded'] = np.where(movie_gen_df['Weighted_Rate'] >= wr_mean, 'True', 'False')
print(movie_gen_df['recomnded'])



# Now let's try to cluster some movies together

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Encode a copy so `movie_gen_df` itself keeps its original values.
encoded_df = movie_gen_df.copy()

# KMeans only accepts numbers, so every non-numeric column is label-encoded.
# Values are cast to str first because LabelEncoder cannot sort a column that
# mixes strings with missing values.
# NOTE(review): a numeric column stored as text (possibly `budget`) would be
# label-encoded here too; convert it with pd.to_numeric beforehand if so.
columns = encoded_df.select_dtypes(exclude='number').columns
encoder = LabelEncoder()
for col in columns:
    encoded_df[col] = encoder.fit_transform(encoded_df[col].astype(str))

# KMeans rejects missing values, so incomplete rows are left out.
encoded_df = encoded_df.dropna()

y_pred = KMeans(n_clusters=2, random_state=0).fit_predict(encoded_df)

# Show the cluster assignment on the vote_average / vote_count plane.
plt.subplot(221)
plt.scatter(encoded_df['vote_average'], encoded_df['vote_count'], c=y_pred)
plt.xlabel('vote_average')
plt.ylabel('vote_count')
plt.title("KMeans clusters (k=2)")
