# Recommendation System
## Knowledge Based
#### (Soumitra Dnyaneshwar Edake)

In [76]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime

%matplotlib inline

### Build the DataFrames

In [77]:
def get_cleaned_df():
    """Return the cleaned ``(movies, reviews)`` DataFrames, building the CSV caches if needed.

    For each of the two datasets:

    * if the cleaned CSV is missing, the raw ``.dat`` file is parsed, cleaned and
      written to the cache;
    * the frame that is returned is then always read back from the cache.

    Reading back even right after building the cache keeps the result identical
    between the first run and every later run. The caches are written with their
    index, so the returned frames carry an ``'Unnamed: 0'`` column, and the column
    dtypes are whatever ``pd.read_csv`` infers. Previously a first run returned the
    in-memory frames instead (string IDs, no ``'Unnamed: 0'``), which behaved
    differently from a cached run.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``movies`` with the columns added by ``dummy_genres`` / ``dummy_dates``, and
        ``reviews`` with ``user_id``, ``movie_id``, ``rating``, ``timestamp``, ``date``.
    """
    path_movies = 'data/movies_clean.csv'
    path_reviews = 'data/reviews_clean.csv'

    if not os.path.exists(path_reviews):

        print('Cleaning REVIEW data')

        reviews = pd.read_csv('data/ratings.dat', delimiter='::', header=None,
                              names=['user_id', 'movie_id', 'rating', 'timestamp'],
                              dtype={'movie_id': object, 'user_id': object, 'timestamp': object},
                              engine='python')

        # NOTE: fromtimestamp() without a tz argument converts to the machine's
        # local time zone, so the cached 'date' strings depend on where this ran.
        change_timestamp = lambda val: datetime.datetime.fromtimestamp(int(val)).strftime('%Y-%m-%d %H:%M:%S')

        reviews['date'] = reviews['timestamp'].apply(change_timestamp)

        print("Saving cleaned REVIEW data")
        reviews.to_csv(path_reviews)

    else:

        print('Loading cleaned REVIEW data')

    # Single source of truth: whatever is stored in the cache.
    reviews = pd.read_csv(path_reviews)

    if not os.path.exists(path_movies):

        print('Cleaning MOVIE data')

        movies = pd.read_csv('data/movies.dat', delimiter='::', header=None,
                             names=['movie_id', 'movie', 'genre'],
                             dtype={'movie_id': object},
                             engine='python')

        movies = dummy_genres(movies)
        movies = dummy_dates(movies)

        print("Saving cleaned MOVIE data")
        movies.to_csv(path_movies)

    else:
        print('Loading cleaned MOVIE data')

    # Same round-trip as for the reviews, for the same reason.
    movies = pd.read_csv(path_movies)

    print('done')

    return movies, reviews

In [78]:
def get_split_genres(genres, movies=None):
    """Add one 0/1 indicator column per genre to ``movies``.

    Parameters
    ----------
    genres : iterable of str
        Genre names; each becomes a column of the same name.
    movies : pandas.DataFrame, optional
        Frame with a ``'genre'`` column of ``'|'``-separated genre names.
        When omitted, the module-level ``movies`` variable is used, which is how
        this function originally found its data.

    Returns
    -------
    pandas.DataFrame
        The same ``movies`` object, modified in place with the new columns.
    """
    if movies is None:
        # Backward compatibility with the original zero-frame call signature.
        movies = globals()['movies']

    def has_genre(val, gene):
        # Compare whole '|'-separated tokens: a substring test would flag
        # 'Music' for every 'Musical' movie. Non-string cells (NaN) count as 0.
        return int(isinstance(val, str) and gene in val.split('|'))

    for gene in genres:
        movies[gene] = movies['genre'].apply(has_genre, args=(gene,))

    return movies


def dummy_genres(movies):
    """Discover every genre in ``movies['genre']`` and add its indicator column.

    Parameters
    ----------
    movies : pandas.DataFrame
        Frame with a ``'genre'`` column of ``'|'``-separated genre names;
        non-string cells (e.g. NaN) are ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``movies`` object with one 0/1 column per distinct genre.
    """
    genres = set()

    for val in movies['genre']:
        if isinstance(val, str):
            genres.update(val.split('|'))

    # An empty token (from '' or a stray '|') is not a genre.
    genres.discard('')

    # Sorted so the column order is deterministic rather than set-iteration order.
    return get_split_genres(sorted(genres), movies)

In [79]:
def get_add_movie_year(movies=None):
    """Add the 0/1 century columns ``1800's``, ``1900's`` and ``2000's`` to ``movies``.

    Parameters
    ----------
    movies : pandas.DataFrame, optional
        Frame with a ``'date'`` column holding the release year as a string.
        When omitted, the module-level ``movies`` variable is used, which is how
        this function originally found its data.

    Returns
    -------
    pandas.DataFrame
        The same ``movies`` object, modified in place with the new columns.
    """
    if movies is None:
        # Backward compatibility with the original zero-argument call.
        movies = globals()['movies']

    def in_century(val, prefix):
        # Missing years are NaN (a float); slicing them would raise, so they
        # simply belong to no century.
        return int(isinstance(val, str) and val[:2] == prefix)

    for yr in ['18', '19', '20']:
        movies[str(yr) + "00's"] = movies['date'].apply(in_century, args=(yr,))

    return movies


def dummy_dates(movies):
    """Extract the release year from each title and add century indicator columns.

    The year is taken as the four characters before a trailing ``')'`` in
    ``movies['movie']`` (e.g. ``'Title (1999)'`` -> ``'1999'``). Titles that are
    empty, not strings, or do not end in ``')'`` get NaN.

    Parameters
    ----------
    movies : pandas.DataFrame
        Frame with a ``'movie'`` title column.

    Returns
    -------
    pandas.DataFrame
        The same ``movies`` object with ``'date'`` and the century columns added.
    """
    def create_date(val):
        if isinstance(val, str) and val.endswith(')'):
            return val[-5:-1]
        return np.nan

    movies['date'] = movies['movie'].apply(create_date)

    return get_add_movie_year(movies)

Load Cleaned DataFrames

In [90]:
# Build the movie and review frames, or load them from the cleaned-CSV cache.
movies, reviews = get_cleaned_df()

Loading cleaned REVIEW data
Loading cleaned MOVIE data
done


In [91]:
# Inspect the column labels of both frames.
movies.columns, reviews.columns

(Index(['Unnamed: 0', 'movie_id', 'movie', 'genre', 'date', '1800's', '1900's',
        '2000's', 'Comedy', 'Crime', 'Drama', 'Thriller', 'Action', 'Game-Show',
        'Adult', 'Sci-Fi', 'Film-Noir', 'Musical', 'History', 'Fantasy',
        'Family', 'Animation', 'Adventure', 'News', 'Mystery', 'Talk-Show',
        'Sport', 'Romance', 'Western', 'Horror', 'War', 'Short', 'Reality-TV',
        'Music', 'Documentary', 'Biography'],
       dtype='object'),
 Index(['Unnamed: 0', 'user_id', 'movie_id', 'rating', 'timestamp', 'date'], dtype='object'))

In [92]:
# The cached CSVs were saved together with their index, which comes back as an
# 'Unnamed: 0' column. Drop it; errors='ignore' keeps this cell safe to re-run.
movies = movies.drop(columns=['Unnamed: 0'], errors='ignore')
reviews = reviews.drop(columns=['Unnamed: 0'], errors='ignore')

In [93]:
# Column labels again, to confirm 'Unnamed: 0' is gone from both frames.
movies.columns, reviews.columns

(Index(['movie_id', 'movie', 'genre', 'date', '1800's', '1900's', '2000's',
        'Comedy', 'Crime', 'Drama', 'Thriller', 'Action', 'Game-Show', 'Adult',
        'Sci-Fi', 'Film-Noir', 'Musical', 'History', 'Fantasy', 'Family',
        'Animation', 'Adventure', 'News', 'Mystery', 'Talk-Show', 'Sport',
        'Romance', 'Western', 'Horror', 'War', 'Short', 'Reality-TV', 'Music',
        'Documentary', 'Biography'],
       dtype='object'),
 Index(['user_id', 'movie_id', 'rating', 'timestamp', 'date'], dtype='object'))

We now have our dataset: one DataFrame of movies and one of reviews.

In [94]:
# Preview the first rows of the movies frame.
movies.head()

Unnamed: 0,movie_id,movie,genre,date,1800's,1900's,2000's,Comedy,Crime,Drama,...,Sport,Romance,Western,Horror,War,Short,Reality-TV,Music,Documentary,Biography
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,1894,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,10,La sortie des usines LumiÃ¨re (1895),Documentary|Short,1895,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,12,The Arrival of a Train (1896),Documentary|Short,1896,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,25,The Oxford and Cambridge University Boat Race ...,,1895,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,91,Le manoir du diable (1896),Short|Horror,1896,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [95]:
# Preview the first rows of the reviews frame.
reviews.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,date
0,1,114508,8,1381006850,2013-10-06 02:30:50
1,2,208092,5,1586466072,2020-04-10 02:31:12
2,2,358273,9,1579057827,2020-01-15 08:40:27
3,2,10039344,5,1578603053,2020-01-10 02:20:53
4,2,6751668,9,1578955697,2020-01-14 04:18:17


### Ranking System

In [97]:
def create_ranked_df(movies, reviews, min_ratings=5):
    """Rank movies by average rating, then rating count, then most recent rating.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movie frame with a ``'movie_id'`` column.
    reviews : pandas.DataFrame
        Review frame with ``'movie_id'``, ``'rating'`` and ``'date'`` columns.
    min_ratings : int, default 5
        Minimum number of ratings a movie needs to be kept. The default matches
        the previously hard-coded ``num_ratings > 4`` cut.

    Returns
    -------
    pandas.DataFrame
        ``movies`` indexed by ``movie_id`` with ``avg_rating``, ``num_ratings`` and
        ``last_rating`` added, sorted best-first. Movies without any rating have
        NaN statistics and are removed by the ``min_ratings`` filter.
    """
    per_movie = reviews.groupby('movie_id')

    # Aggregate only the columns that are needed; all three results share the
    # movie_id index, so they line up in a single frame.
    rating_stats = pd.DataFrame({
        'avg_rating': per_movie['rating'].mean(),
        'num_ratings': per_movie['rating'].count(),
        'last_rating': per_movie['date'].max(),
    })

    movie_recs = movies.set_index('movie_id').join(rating_stats)

    ranked_movies = movie_recs.sort_values(['avg_rating', 'num_ratings', 'last_rating'], ascending=False)
    ranked_movies = ranked_movies[ranked_movies['num_ratings'] >= min_ratings]

    return ranked_movies
    

def popular_recommendations(user_id, n_top, ranked_movies):
    """Return the titles of the first ``n_top`` rows of ``ranked_movies``.

    ``user_id`` is accepted but not used: the result depends only on the order
    of ``ranked_movies``, so every user gets the same list.
    """
    leading_titles = ranked_movies['movie'][:n_top]
    return [title for title in leading_titles]

Using the functions above, we get a DataFrame of movies ordered by popularity: highest average rating first, then number of ratings, then most recent rating.

In [98]:
# Rank the movies once; every recommendation call below reuses this frame.
ranked_movies = create_ranked_df(movies, reviews)

In [None]:
# Preview the top of the ranking (the dangling `ranked_movies.` was a SyntaxError).
ranked_movies.head()

Now let's make some recommendations.

In [29]:
# Top 5 most popular movies. The user id is passed through but does not
# influence the result of popular_recommendations.
user = '1202'
no_of_recommendations = 5

popular_recommendations(user, no_of_recommendations, ranked_movies)

['MSG 2 the Messenger (2015)',
 'Avengers: Age of Ultron Parody (2015)',
 'Five Minutes (2017)',
 'Selam (2013)',
 'Let There Be Light (2017)']

In [30]:
# Same call for a different user and a longer list: the first five titles are
# the same as before, because the ranking is not user-specific.
user = '4302'
no_of_recommendations = 10

popular_recommendations(user, no_of_recommendations, ranked_movies)

['MSG 2 the Messenger (2015)',
 'Avengers: Age of Ultron Parody (2015)',
 'Five Minutes (2017)',
 'Selam (2013)',
 'Let There Be Light (2017)',
 "Quiet Riot: Well Now You're Here, There's No Way Back (2014)",
 'Crawl Bitch Crawl (2012)',
 'Chasing Happiness (2019)',
 'Make Like a Dog (2015)',
 'Pandorica (2016)']

For now, `user_id` is not used by the recommendation system: every user gets the same
list, because the movies are ranked only on the knowledge we already have about them (their ratings; genre and release year are used by the filter added below).

### Adding a Filter system

In [16]:
def filter_rec(user_id, n_top, ranked_movies, years=None, genres=None):
    """Return the top ``n_top`` titles, optionally restricted by year and/or genre.

    Parameters
    ----------
    user_id : any
        Accepted for interface symmetry; not used by the filter.
    n_top : int
        Maximum number of titles to return.
    ranked_movies : pandas.DataFrame
        Ranked frame with a ``'movie'`` column, a ``'date'`` (release year) column
        and one 0/1 column per genre.
    years : list-like or scalar, optional
        Release years to keep, as strings or numbers (``'2015'`` and ``2015`` are
        equivalent).
    genres : list-like of str or str, optional
        Genre column names; a movie is kept if it belongs to at least one of them.

    Returns
    -------
    list of str
        Movie titles in ranking order.

    Raises
    ------
    KeyError
        If a requested genre is not a column of ``ranked_movies``.
    """
    if years is not None:
        if isinstance(years, (str, int)):
            years = [years]
        # Compare years numerically on both sides: the 'date' column holds
        # strings when freshly built but numbers when re-read from the CSV
        # cache, and isin() does not treat '2015' and 2015 as equal.
        wanted_years = pd.to_numeric(pd.Series(list(years), dtype=object), errors='coerce').dropna()
        movie_years = pd.to_numeric(ranked_movies['date'], errors='coerce')
        ranked_movies = ranked_movies[movie_years.isin(wanted_years)]

    if genres is not None:
        if isinstance(genres, str):
            genres = [genres]
        # Row-wise sum of the 0/1 genre flags: > 0 means at least one match.
        num_genre_match = ranked_movies[list(genres)].sum(axis=1)
        ranked_movies = ranked_movies.loc[num_genre_match > 0, :]

    # create top movies list
    top_movies = list(ranked_movies['movie'].iloc[:n_top])

    return top_movies


In [17]:
# Top 20 History movies released in 2015-2018.
filter_rec('1', 20, ranked_movies, years=['2015', '2016', '2017', '2018'], genres=['History'])

["Hillary's America: The Secret History of the Democratic Party (2016)",
 'I Believe in Miracles (2015)',
 'O.J.: Made in America (2016)',
 'Ayla: The Daughter of War (2017)',
 'Hacksaw Ridge (2016)',
 'They Shall Not Grow Old (2018)',
 'Namhansanseong (2017)',
 'The Farthest (2017)',
 'Kono sekai no katasumi ni (2016)',
 'Sado (2015)',
 'Silicon Cowboys (2016)',
 '13th (2016)',
 'Ethel &amp; Ernest (2016)',
 'Paul, Apostle of Christ (2018)',
 'Kincsem (2017)',
 'LA 92 (2017)',
 'Straight Outta Compton (2015)',
 'Nise - O CoraÃ§Ã£o da Loucura (2015)',
 'Under sandet (2015)',
 'Only the Dead (2015)']

In [18]:
# Top 5 movies released in 2015-2018, any genre.
filter_rec('53968', 5, ranked_movies, years=['2015', '2016', '2017', '2018'])

['MSG 2 the Messenger (2015)',
 'Avengers: Age of Ultron Parody (2015)',
 'Five Minutes (2017)',
 'Let There Be Light (2017)',
 'Make Like a Dog (2015)']

In [20]:
# Top 10 movies tagged History or News (either genre is enough), any year.
filter_rec('70000', 10, ranked_movies, genres=['History', 'News'])

['Birlesen Gonuller (2014)',
 'Mad As Hell (2014)',
 "Hillary's America: The Secret History of the Democratic Party (2016)",
 'The Decline of Western Civilization (1981)',
 'Night Will Fall (2014)',
 "La passion de Jeanne d'Arc (1928)",
 'Ningen no jÃ´ken (1959)',
 'The Message (1977)',
 'Amadeus (1984)',
 'I Believe in Miracles (2015)']