In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json  # For posterity, pandas hates string literals for json and the feature will be removed in the future

# Set up notebook to show all outputs in a cell, not only last one.
# Cells further down that contain more than one bare expression (for example
# the null-count cell) rely on this setting to display each result.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load the TMDB 5000 movie and credit tables from their CSV exports.
# The paths are relative, so the files must sit next to the notebook.
movie_csv, credits_csv = "tmdb_5000_movies.csv", "tmdb_5000_credits.csv"
movies, credits = (pd.read_csv(csv_path) for csv_path in (movie_csv, credits_csv))

In [None]:
# Reading the Data
def _preview(frame: pd.DataFrame) -> None:
    """Print the first rows, last rows and info summary of ``frame``.

    The three sections are separated by a '-----o-----' divider surrounded
    by blank lines.

    Args:
        frame: The DataFrame to summarise.
    """
    print(frame.head())
    print()
    print('-----o-----')
    print()
    print(frame.tail())
    print()
    print('-----o-----')
    print()
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    frame.info()


_preview(movies)
print()
print()
print('=====o=====')
print()
print()
_preview(credits)

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

A lot of these columns store raw JSON strings. Many of the entries carry a numeric `id` alongside the name, so we should be able to organize many of our string values numerically, which is nice.

In [None]:
# Show the complete first row, followed by a divider.
print(movies.loc[0], '', '-----o-----', '', sep='\n')

# The genres cell holds a JSON string. Decode it with json.loads() and build
# the frame from the resulting Python objects rather than handing the raw
# string to pd.read_json(): the pandas team is removing support for string
# literals there, so this is the work-around kept for posterity.
# Caveat: this WILL create a problem in the future if we do it for every index.
pure_json = json.loads(movies.loc[0, "genres"])
json_table = pd.DataFrame(pure_json)
print(json_table)

budget                                                          237000000
genres                  [{"id": 28, "name": "Action"}, {"id": 12, "nam...
homepage                                      http://www.avatarmovie.com/
id                                                                  19995
keywords                [{"id": 1463, "name": "culture clash"}, {"id":...
original_language                                                      en
original_title                                                     Avatar
overview                In the 22nd century, a paraplegic Marine is di...
popularity                                                     150.437577
production_companies    [{"name": "Ingenious Film Partners", "id": 289...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2009-12-10
revenue                                                        2787965087
runtime                               

## Validation
### Checking for Nulls

In [None]:
# Count the missing values in each column: movies first, then credits.
# Both results are bare expressions, so they are shown only because the
# notebook is configured to display every expression in a cell.
movies.isna().sum()
print('', '=====o=====', '', sep='\n')
credits.isna().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64


=====o=====



movie_id    0
title       0
cast        0
crew        0
dtype: int64

There's no obvious independent variable here: this isn't a time series we can simply plot against time, so we'll mostly focus on relationships between the columns.<br>
If we want to fill in any of the missing values in the movies CSV, we will have to supply them ourselves.

### Data Relationships
From here we can start looking at data relationships, just trying things out.

In [None]:
# Print the column labels of the movies frame.
print(movies.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


We can try to guess which columns relate to which, but it doesn't make much sense to organize anything until we know why we want to organize it. If, for example, we wanted to see what makes some action movies more popular than other action movies, we could isolate all action movies into their own DataFrame and then look at the data from there. Essentially, divide and conquer.

## Misc Checking
This section is for when I just want to explore the data without necessarily saving the results.

In [None]:
# Max Values for numericals
# Print the maximum of each numeric column, one value per line, in column order.
numeric_columns = (
    "budget",
    "id",
    "popularity",
    "revenue",
    "runtime",
    "vote_average",
    "vote_count",
)
for column_name in numeric_columns:
    print(movies[column_name].max())

380000000
459488
875.581305
2787965087
338.0
10.0
13752


In [None]:
# Print the movies frame summary: index range, per-column non-null counts and dtypes.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               