In [1]:
import json
import requests

import pandas as pd
import numpy as np

### Load the data and process the data.

In [197]:
data = pd.read_csv('ml-latest/movies.csv', index_col='movieId')
links = pd.read_csv('ml-latest/links.csv', index_col='movieId')

# Split genre strings into a list.
data.genres = data.genres.map(lambda x: x.split('|'))

# Split the title into a release year and title.
data['release_year'] = data.title.map(lambda x: x[x.find('(')+1:x.find(')')])
data['title'] = data.title.map(lambda x: x[:x.find('(')])

# Create a binary list of columns.
binary_genres = pd.get_dummies(data.genres.explode()).sum(level=0)

# Create a dataframe for the dataset and load the user genome tags.
data = pd.concat([data, links, binary_genres], axis=1)
genome_scores = pd.read_csv('ml-latest/genome-scores.csv')
genome_tags = pd.read_csv('ml-latest/genome-tags.csv')
ratings = pd.read_csv('ml-latest/ratings.csv')

In [3]:
'''
Use the IMDB alternative API to populate the movie info dataframe.
'''
def scrape_movie(imdb_id):
    url = 'https://movie-database-imdb-alternative.p.rapidapi.com/'
    headers = {
        'x-rapidapi-host': 'movie-database-imdb-alternative.p.rapidapi.com',
        'x-rapidapi-key': 'ee527b80e2mshe9e5a5340654e93p10be67jsn49070371b611'
    }
    querystring = {'i': 'tt'+imdb_id.zfill(7), 'r': 'json'}
    response = requests.request('GET', url, headers=headers, params=querystring)
    return json.loads(response.text)

In [4]:
'''
Get the Rotten Tomato score from the JSON response.
'''
def get_rt_score(movie_data):
    if 'Ratings' in movie_data.keys():
        movie_ratings = movie_data['Ratings']
        raters = [rating['Source'] for rating in movie_ratings]
        if 'Rotten Tomatoes' in raters:
            rt_index = next((ind for (ind, d) in enumerate(movie_ratings) if d['Source'] == 'Rotten Tomatoes'), None)
            return movie_ratings[rt_index]['Value']
        else:
             return np.nan

In [5]:
'''
Get the relevent field from the JSON response.
'''
def get_field(movie_data, field):
    if field in movie_data.keys():
         return movie_data[field]
    else:
         return np.nan

### Scrape a small set of example movie data.

In [36]:
columns = ['Title', 'Year', 'Rated', 'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Awards', 'Poster', 'imdbRating', 'imdbVotes', 'Metascore', 'BoxOffice']
#imdb_ids = data.imdbId.values
#imdb_ids = imdb_ids[scraped_data.Title.values==0]
i = (scraped_data.Title.values==0).argmax()-1
#scraped_data = pd.DataFrame(np.zeros([imdb_ids.shape[0], len(columns)+2]), columns=['imdbId']+columns+['rtRating'])

start = time.time()
for _, imdb in enumerate(imdb_ids):
    movie_data = scrape_movie(str(imdb))
    # Populate the regular fields.
    for field in columns:
        scraped_data.loc[i, field] = get_field(movie_data, field)
    # Add the rotten tomatoe rating.
    scraped_data.loc[i, 'rtRating'] = get_rt_score(movie_data)
    
    # Print progress.
    if i%1000 == 0 and i != 0:
        avg_time = (time.time()-start)/i
        print('Item number: '+str(i))
    i +=1

Item number: 41000
Item number: 42000
Item number: 43000
Item number: 44000
Item number: 45000
Item number: 46000
Item number: 47000
Item number: 48000
Item number: 49000
Item number: 50000
Item number: 51000
Item number: 52000
Item number: 53000
Item number: 54000
Item number: 55000
Item number: 56000
Item number: 57000
Item number: 58000


In [38]:
#scraped_data.to_csv('scraped_data.csv')`

### Identify movies that have both at least 1,000 IMDB votes and 100 ratings.

In [177]:
# Change the data format of the imdbVotes column to int.
scraped_data = pd.read_csv('scraped_data.csv')
scraped_data['imdbVotes'] = scraped_data.loc[:, 'imdbVotes'].str.replace(',', '').astype(int, errors='ignore')
scraped_data['imdbVotes'] = pd.to_numeric(scraped_data['imdbVotes'], errors ='coerce')

# Get a tentative list of movies that have more than 1,000 IMDB votes.
good_movie_inds = scraped_data.imdbVotes>1000
good_movie_info = scraped_data.iloc[good_movie_inds.values, :]

# Get rid of all movies with less than 100 reviews.
movie_ids, counts = np.unique(ratings.movieId.values, return_counts=True)
good_movie_ids = movie_ids[counts>100]
good_movie_ratings = ratings.iloc[ratings.movieId.isin(good_movie_ids).values,:]

# Add the imdbId to the reviews dataframe.
movieId_data = pd.read_csv('ml-latest/links.csv')
movie_map = {movie.movieId: movie.imdbId for _, movie in movieId_data.iterrows()}
good_movie_ratings.insert(5, 'imdbId', good_movie_ratings.movieId.map(movie_map))

# Find only the movies that have both at least 1,000 IMDB votes and 100 ratings.
good_imdb_ids1 = good_movie_ratings.imdbId.unique()
good_imdb_ids2 = good_movie_info.imdbId.values
good_movie_imdb_ids = np.intersect1d(good_imdb_ids1, good_imdb_ids2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [183]:
# NOW get the final movie info and ratings dataframes by filtering using these criterion.
ratings_inds = good_movie_ratings.imdbId.isin(good_movie_imdb_ids)
info_inds = good_movie_info.imdbId.isin(good_movie_imdb_ids)
good_movie_ratings = good_movie_ratings.loc[ratings_inds, :]
good_movie_info = good_movie_info.loc[info_inds, :]

### Now filter the tags based on whether they were used to describe the popular movies identified above.

In [198]:
genome_map = {tag.tagId: tag.tag for _, tag in genome_tags.iterrows()}
genome_scores.insert(2, 'tag', genome_scores.tagId.map(genome_map))
genome_scores.insert(1, 'imdbId', genome_scores.movieId.map(movie_map))
score_inds = genome_scores.imdbId.isin(good_movie_imdb_ids)
good_genome_scores = genome_scores.loc[score_inds, :]

### Example code for converting datetimes.

In [None]:
from datetime import datetime

timestamp = 1256677221
dt_object = datetime.fromtimestamp(timestamp)
dt_object.date()