# Kaggle - TMDB 

### Extract tables

**Here I will extract data from some of the fields in the raw data (e.g. average revenue by actor) and store it in `.csv` files**

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast

In [2]:
# Scale factor used to express revenue figures in millions.
MILLION = 10 ** 6

In [3]:
# Read the training set from 'train.csv' (path relative to the working
# directory) into a DataFrame.
train = pd.read_csv('train.csv')

In [5]:
# Show the column labels of the training frame.
train.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

### belongs_to_collection

In [26]:
# Distinct lengths of the lists stored in the non-null `belongs_to_collection`
# cells. Each cell holds a Python-literal string, hence ast.literal_eval.
(
    train['belongs_to_collection']
    .dropna()
    .map(lambda raw: len(ast.literal_eval(raw)))
    .unique()
)

array([1])

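Every non-null `belongs_to_collection` value is a list with exactly 1 element, so taking element `[0]` of each list is safe.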

In [29]:
# Parse every non-null `belongs_to_collection` string and keep its first
# list element, giving an array with one dict per movie.
dicts = (
    train['belongs_to_collection']
    .dropna()
    .map(lambda raw: ast.literal_eval(raw)[0])
    .values
)

In [31]:
# Inspect the first parsed collection dict to see which keys it carries.
dicts[0]

{'id': 313576,
 'name': 'Hot Tub Time Machine Collection',
 'poster_path': '/iEhb00TGPucF0b4joM1ieyY026U.jpg',
 'backdrop_path': '/noeTVcgpBiD48fDjFVic1Vz7ope.jpg'}

In [34]:
# Collection id of every parsed dict (one entry per movie, so ids can repeat).
collections = list(map(lambda entry: entry['id'], dicts))

In [39]:
# Sorted distinct collection ids, and the number of occurrences of each id.
# np.unique converts the list to an array itself, so no explicit wrapper is needed.
uniq, counts = np.unique(collections, return_counts=True)

In [40]:
# Occurrence count of each collection id, aligned position-by-position with `uniq`.
counts

array([ 2,  3,  2,  3,  2,  3,  2,  3,  1,  1,  2,  2,  1,  2,  1, 16,  2,
        1,  2,  6,  1,  2,  1,  4,  2,  3,  2,  1,  3,  2,  1,  2,  1,  1,
        2,  2,  1,  1,  2,  3,  1,  3,  4,  1,  2,  4,  4,  2,  1,  4,  2,
        2,  1,  2,  1,  1,  5,  1,  4,  2,  1,  3,  7,  1,  1,  2,  1,  1,
        4,  2,  1,  1,  1,  1,  2,  1,  1,  1,  2,  1,  2,  1,  1,  1,  4,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,  2,  2,  5,  1,
        4,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  1,  1,  1,  1,  1,
        2,  2,  2,  1,  2,  1,  2,  2,  1,  1,  2,  1,  3,  1,  1,  1,  2,
        3,  1,  1,  2,  1,  1,  2,  1,  1,  1,  1,  1,  1,  3,  1,  2,  1,
        1,  1,  1,  1,  2,  1,  3,  3,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,  1,
        1,  1,  1,  1,  2,  1,  2,  2,  1,  2,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  2,  1,  2,  1,  1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  1,
        1,  1,  2,  2,  1

In [43]:
# Pick one entry to examine more closely: its count and its collection id.
counts[9], uniq[9]

(1, 328)

In [44]:
# All parsed collection dicts whose id is 328.
list(filter(lambda entry: entry['id'] == 328, dicts))

[{'id': 328,
  'name': 'Jurassic Park Collection',
  'poster_path': '/qIm2nHXLpBBdMxi8dvfrnDkBUDh.jpg',
  'backdrop_path': '/pJjIH9QN0OkHFV9eue6XfRVnPkr.jpg'}]

Some collections seem to have only 1 item in the training set. Perhaps the other items are in the test set?

In [45]:
# Read the test set from 'test.csv' (path relative to the working directory)
# into a DataFrame.
test = pd.read_csv('test.csv')

In [62]:
# Test-set rows whose collection id is 328. Rows without a collection are
# given the placeholder id -1 so that every cell can be parsed.
test.loc[
    test['belongs_to_collection']
    .fillna('[{"id":-1}]')
    .map(lambda raw: ast.literal_eval(raw)[0]['id'])
    .eq(328)
]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew
883,3884,"[{'id': 328, 'name': 'Jurassic Park Collection...",73000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",,tt0119567,en,The Lost World: Jurassic Park,Four years after Jurassic Park's genetically b...,0.788123,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",5/23/97,129.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Something has survived.,The Lost World: Jurassic Park,"[{'id': 911, 'name': 'exotic island'}, {'id': ...","[{'cast_id': 1, 'character': 'Dr. Ian Malcolm'...","[{'credit_id': '52fe4238c3a36847f800d3ad', 'de..."
1934,4935,"[{'id': 328, 'name': 'Jurassic Park Collection...",150000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.jurassicworld.com/,tt0369610,en,Jurassic World,Twenty-two years after the events of Jurassic ...,32.790475,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",6/9/15,124.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The park is open.,Jurassic World,"[{'id': 1299, 'name': 'monster'}, {'id': 1718,...",,
4213,7214,"[{'id': 328, 'name': 'Jurassic Park Collection...",93000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",,tt0163025,en,Jurassic Park III,"In need of funds for research, Dr. Alan Grant ...",0.648867,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",7/18/01,92.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"This time, it's not just a walk in the park!",Jurassic Park III,"[{'id': 911, 'name': 'exotic island'}, {'id': ...","[{'cast_id': 1, 'character': 'Dr. Alan Grant',...","[{'credit_id': '52fe4238c3a36847f800d479', 'de..."


Indeed: the test set contains three more movies from the Jurassic Park collection (id 328).

**I will include collections with only 1 item as well**

In [63]:
# compute the average revenue of each collection

In [65]:
# Mean revenue of the training movies in each collection, ordered like `uniq`.
#
# The collection id of every row is parsed once here, instead of re-running
# ast.literal_eval over the whole column for every collection in `uniq`.
# Rows without a collection get the placeholder id -1.
collection_ids = train['belongs_to_collection'].fillna('[{"id":-1}]').map(
    lambda x: ast.literal_eval(x)[0]['id'])

# Select all movies from each collection and average their revenue in one pass.
mean_revenue_by_collection = train['revenue'].groupby(collection_ids).mean()

# reindex keeps the order of `uniq` and yields NaN for an id with no rows,
# which matches the mean of an empty selection.
revenues = mean_revenue_by_collection.reindex(uniq).tolist()

Check a few collections (the first, one in the middle, and the last) to make sure the averages were computed correctly.

In [67]:
# Mean revenue per collection, in the same order as `uniq`.
revenues

[749699164.0,
 531269279.6666667,
 898827882.0,
 96733333.33333333,
 190916236.5,
 821238738.6666666,
 288263791.5,
 887241675.3333334,
 311312624.0,
 920100000.0,
 91684565.5,
 584229185.0,
 107196498.0,
 256685600.0,
 192452832.0,
 302243560.6875,
 120031051.5,
 128905366.0,
 433535552.0,
 20264829.0,
 120207127.0,
 905323839.5,
 82719885.0,
 176840916.25,
 196344294.0,
 12514771.0,
 17661522.0,
 38610009.0,
 203507356.0,
 67272030.0,
 161834276.0,
 261518299.5,
 180949000.0,
 215394738.0,
 35431358.5,
 159801464.5,
 294456605.0,
 25800.0,
 82794123.0,
 165873064.33333334,
 40400657.0,
 13579416.666666666,
 181968309.25,
 242688965.0,
 26178170.0,
 152505450.25,
 584865942.75,
 290769949.0,
 115103979.0,
 818674036.75,
 104072781.5,
 14604980.5,
 211952420.0,
 201729586.0,
 153698625.0,
 239606210.0,
 31663618.8,
 254134910.0,
 558791460.5,
 64548285.5,
 296000000.0,
 41877315.666666664,
 41211249.85714286,
 351692268.0,
 330579719.0,
 66269831.5,
 373554033.0,
 40996665.0,
 37798098

In [75]:
# Collection ids at positions 0, 77 and -1 of `uniq`, used for the spot checks
# in the following cells.
uniq[0], uniq[77],uniq[-1]

(10, 12087, 479888)

In [76]:
# Training rows whose collection id is 10. Rows without a collection are
# given the placeholder id -1 so that every cell can be parsed.
train.loc[
    train['belongs_to_collection']
    .fillna('[{"id":-1}]')
    .map(lambda raw: ast.literal_eval(raw)[0]['id'])
    .eq(10)
]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
689,690,"[{'id': 10, 'name': 'Star Wars Collection', 'p...",113000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://www.starwars.com/films/star-wars-episod...,tt0121766,en,Star Wars: Episode III - Revenge of the Sith,"Years after the onset of the Clone Wars, the n...",13.165421,...,5/17/05,140.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The saga is complete.,Star Wars: Episode III - Revenge of the Sith,"[{'id': 797, 'name': 'showdown'}, {'id': 10013...","[{'cast_id': 13, 'character': 'Obi-Wan Kenobi'...","[{'credit_id': '52fe431fc3a36847f803bea3', 'de...",850000000
1818,1819,"[{'id': 10, 'name': 'Star Wars Collection', 'p...",120000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,tt0121765,en,Star Wars: Episode II - Attack of the Clones,"Ten years after the invasion of Naboo, the gal...",14.072511,...,5/15/02,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A Jedi Shall Not Know Anger. Nor Hatred. Nor L...,Star Wars: Episode II - Attack of the Clones,"[{'id': 1399, 'name': 'senate'}, {'id': 5340, ...","[{'cast_id': 7, 'character': 'Obi Wan Kenobi',...","[{'credit_id': '52fe431fc3a36847f803bd8b', 'de...",649398328


In [80]:
# Hand-computed mean of the two revenues, expressed in millions.
(850000000 + 649398328) / (2 * MILLION)

749.699164

In [81]:
# Computed mean for the first collection in `uniq`, expressed in millions,
# for comparison with the hand-computed value.
revenues[0] / MILLION

749.699164

In [79]:
# Training rows whose collection id is 12087. Rows without a collection are
# given the placeholder id -1 so that every cell can be parsed.
train.loc[
    train['belongs_to_collection']
    .fillna('[{"id":-1}]')
    .map(lambda raw: ast.literal_eval(raw)[0]['id'])
    .eq(12087)
]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
1862,1863,"[{'id': 12087, 'name': 'Herbie Collection', 'p...",50000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",,tt0400497,en,Herbie Fully Loaded,"Maggie Peyton, the new owner of Number 53 - th...",9.35239,...,6/22/05,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Start your engines...,Herbie Fully Loaded,"[{'id': 830, 'name': 'car race'}, {'id': 6067,...","[{'cast_id': 1, 'character': 'Maggie Peyton', ...","[{'credit_id': '59620742925141790403a4fe', 'de...",66002004


In [82]:
# Computed mean for the collection at position 77 of `uniq`.
revenues[77]

66002004.0

In [83]:
# Training rows whose collection id is 479888. Rows without a collection are
# given the placeholder id -1 so that every cell can be parsed.
train.loc[
    train['belongs_to_collection']
    .fillna('[{"id":-1}]')
    .map(lambda raw: ast.literal_eval(raw)[0]['id'])
    .eq(479888)
]

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
2891,2892,"[{'id': 479888, 'name': 'The Thing Collection'...",35000000,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",http://www.uphe.com/movies/the-thing-2011,tt0905372,en,The Thing,When paleontologist Kate Lloyd travels to an i...,10.169411,...,10/12/11,103.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,It's Not Human. Yet.,The Thing,"[{'id': 2340, 'name': 'paranoia'}, {'id': 4713...","[{'cast_id': 2, 'character': 'Kate Lloyd', 'cr...","[{'credit_id': '537b713fc3a3682d3c00000e', 'de...",28128670


In [84]:
# Computed mean for the last collection in `uniq`.
revenues[-1]

28128670.0

In [120]:
# Write the per-collection average revenue to 'collections.csv', indexed by
# collection id, as whole numbers.
#
# The means are generally not round numbers, so passing dtype=int to the
# DataFrame constructor does not work: older pandas silently kept floats and
# pandas >= 2.0 raises ValueError. Round and cast explicitly instead; 'int64'
# is named so the width does not depend on the platform's default int.
(
    pd.DataFrame(revenues, index=uniq, columns=['average_revenue'])
    .round()
    .astype('int64')
    .to_csv('collections.csv')
)