Usefull stuff:

script to clean IMDB reviews and do sentiment analysis
https://github.com/jaypatel00174/moviesentiment_analysis/blob/master/sentiment.py

Hands-On Recommendation Systems with Python
https://www.amazon.com/Hands-Recommendation-Systems-Python-recommendation/dp/1788993756/ref=sr_1_fkmrnull_6?ajr=2&crid=19F7GJ0JYV2AD&keywords=hands-on+recommendation+systems+with+python&linkCode=sl2&linkId=96a355cac564ae8ae2ea24bee599beb7&qid=1547756269&sprefix=hands+on+recommenda%2Caps%2C559&sr=8-6-fkmrnull&tag=tutorialedge-20

ncie script to download movie data from cli
https://github.com/prateekkalra/movie-py-cli/blob/master/movie-py.py

ml on movie ratings
https://www.kaggle.com/juanjotwo/deep-learning-with-python-notebooks-3-5-imdb

the Kaggle site with 1000+ kernels
https://www.kaggle.com/tmdb/tmdb-movie-metadata



### Build a recommender system on IMDB 

https://www.kaggle.com/fabiendaniel/film-recommendation-engine

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import math, nltk, warnings
from nltk.corpus import wordnet
from sklearn import linear_model
from sklearn.neighbors import NearestNeighbors

pd.set_option('display.max_columns', 100)

PATH = 'data/'
FILE_CREDITS = 'tmdb_5000_credits.csv'
FILE_MOVIES = 'tmdb_5000_movies.csv'

In [2]:
df = pd.read_csv(PATH + FILE_MOVIES)

In [16]:
def pipe_names(keywords):
    return '|'.join([x['name'] for x in keywords])

def read_movies(file):
    df = pd.read_csv(file)
    
    # date to Dutch format
    df['release_date'] = (df['release_date']
                          .apply(lambda x: pd.to_datetime(x, format = '%Y-%m-%d')))
    df['release_decade'] = df['release_year'].apply(lambda x:((x-1900)//10)*10)
    df['release_year'] = df['release_date'].dt.year.astype(float).map('{:.0f}'.format)
    df['release_date'] = df['release_date'].dt.strftime('%d-%m-%Y')
    
    json_columns = ['genres', 'keywords', 'production_countries',
                        'production_companies', 'spoken_languages']
    # to unquote, remove string
    for col in json_columns:
        df[col] = df[col].apply(json.loads)
        
    # unnest dict_lists and rename columns
    df['genre_name'] = df['genres'].apply(pipe_names)
    df['genre_id'] = df['genres'].apply(lambda x: [d.get('id') for d in x])
    df['keywords'] = df['keywords'].apply(pipe_names)
    df['prod_co_name'] = df['production_companies'].apply(lambda x: [d.get('name') for d in x])
    df['prod_co_id'] = df['production_companies'].apply(lambda x: [d.get('id') for d in x])
    df['language'] = (df['spoken_languages']
                      .apply(lambda x: [d.get('iso_639_1') for d in x]))
    df['prod_countries'] = (df['production_countries']
                            .apply(lambda x: [d.get('iso_3166_1') for d in x]))
    
    drop_cols = ['production_companies', 'spoken_languages', 'production_countries', 
                 'genres', 'homepage']
    for col in drop_cols:
        df = df.drop(col, axis=1)
 
    return df

In [17]:
movies = read_movies(PATH + FILE_MOVIES)

KeyError: 'release_year'

In [5]:
movies.head(2)

Unnamed: 0,budget,id,keywords,original_language,original_title,overview,popularity,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count,release_year,genre_name,genre_id,prod_co_name,prod_co_id,language,prod_countries
0,237000000,19995,culture clash|future|space war|space colony|so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,10-12-2009,2787965087,162.0,Released,Enter the World of Pandora.,Avatar,7.2,11800,2009,Action|Adventure|Fantasy|Science Fiction,"[28, 12, 14, 878]","[Ingenious Film Partners, Twentieth Century Fo...","[289, 306, 444, 574]","[en, es]","[US, GB]"
1,300000000,285,ocean|drug abuse|exotic island|east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,19-05-2007,961000000,169.0,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,2007,Adventure|Fantasy|Action,"[12, 14, 28]","[Walt Disney Pictures, Jerry Bruckheimer Films...","[2, 130, 19936]",[en],[US]


In [6]:
def access(container, index_values):
    # return NaN rather than an error upon indexing/key failure
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    except IndexError or KeyError:
        return np.nan

def get_director(crew_data):
    directors = [x['name'] for x in crew_data if x['job'] == 'Director']
    return access(directors, [0])


def read_credits(file):
    df = pd.read_csv(file)
    json_columns = ['cast', 'crew']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    
    df['actor_1_name'] = df['cast'].apply(lambda x: access(x, [1, 'name']))
    df['actor_2_name'] = df['cast'].apply(lambda x: access(x, [2, 'name']))
    df['actor_3_name'] = df['cast'].apply(lambda x: access(x, [3, 'name']))
    df['director'] = df['crew'].apply(get_director)
    
    return df

In [7]:
df_cred = read_credits(PATH + FILE_CREDITS)

### exploring keywords

To develop a recommendation engine we make extensive use of the keywords that describe the films. Basic assumption is that films described by similar keywords should have similar content. Hence, the way keywords are defined is a 1st step.

In [8]:
# list the keywords which are in the dataset
movies['keywords'][0]

'culture clash|future|space war|space colony|society|space travel|futuristic|romance|space|alien|tribe|alien planet|cgi|marine|soldier|battle|love affair|anti war|power relations|mind and soul|3d'

In [9]:
# create unique set of keywords
set_kw = set()

for kw in movies['keywords'].str.split('|').values:
    if isinstance(kw, float): continue
    set_kw = set_kw.union(kw)
set_kw.remove('')

In [10]:
def count_word(df, ref_col, kw_set):
    """
    count key word occurences in pd series
    Args:
     df : Pandas DF
     ref_col: Series containing piped keywords
     kw_set : reference unique keyword set
    """
    kw_count = {}
    for s in kw_set: kw_count[s] = 0
    for kws in df[ref_col].str.split('|'):        
        if type(kws) == float and pd.isnull(kws): continue        
        for s in [s for s in kws if s in kw_set]: 
            if pd.notnull(s): kw_count[s] += 1
    
    # convert dict --> list and sort kw frequency
    kw_occurences = []
    for k,v in kw_count.items():
        kw_occurences.append([k,v])
    kw_occurences.sort(key = lambda x:x[1], reverse = True)
    
    return kw_occurences, kw_count

In [11]:
kw_occurences, kw_count = count_word(df= movies, ref_col='keywords', kw_set=set_kw)