## Import Libraries
Import the necessary Python libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import ast

In [2]:
%matplotlib inline
sns.set_style('whitegrid')

## Load Data
Load movies csv file and perform two tasks while loading:

Convert date field to datetime.date type Convert all column with json data as json type

In [3]:
def load_movies_metadata(file_path):
    df = pd.read_csv(file_path, dtype='unicode')
    # covert each item of release_date to datetime.date type entity
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: x.date())
    # all json columns`
    json_columns = ['belongs_to_collection', 'genres', 'production_companies', 'production_countries', 'spoken_languages']
    for column in json_columns:
        # use ast because json data has single quotes in the csv, which is invalid for a json object; it should be " normally
        df[column] = df[column].apply(lambda x: np.nan if pd.isnull(x) else ast.literal_eval(x))
    return df

Load the movies metadata csv file

In [4]:
movies = load_movies_metadata(r"D:\uChicago\Classes\Q2\Data Mining\project\the-movies-dataset\movies_metadata.csv")

In [7]:
movies['belongs_to_collection'][0]

{'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg',
 'id': 10194,
 'name': 'Toy Story Collection',
 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg'}

In [8]:
movies['genres'][0]

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [9]:
movies['belongs_to_collection'][0]

{'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg',
 'id': 10194,
 'name': 'Toy Story Collection',
 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg'}

In [10]:
movies['production_countries'][0]

[{'iso_3166_1': 'US', 'name': 'United States of America'}]

In [11]:
movies['belongs_to_collection'][0]

{'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg',
 'id': 10194,
 'name': 'Toy Story Collection',
 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg'}

In [12]:
movies['spoken_languages'][0]

[{'iso_639_1': 'en', 'name': 'English'}]

Below code prints unique list of languages in the dataset

In [5]:
langs = []

for i,row in movies.iterrows():
    if row['spoken_languages'] is not np.nan:
        a = row['spoken_languages']
        for item in a:
            if item['name'] not in langs:
                langs.append(item['name'])

print(langs)

['English', 'Français', 'Español', 'Deutsch', 'Pусский', 'Latin', 'Nederlands', '广州话 / 廣州話', '普通话', 'Magyar', 'shqip', 'Italiano', '한국어/조선말', 'فارسی', 'Dansk', '', '日本語', 'العربية', 'Hrvatski', 'Bosanski', 'Română', 'Bahasa indonesia', 'Bahasa melayu', 'svenska', 'עִבְרִית', 'Český', 'Polski', 'Gaeilge', 'Norsk', 'Slovenčina', 'Tiếng Việt', 'Português', 'हिन्दी', 'Català', 'Íslenska', 'Afrikaans', 'Srpski', 'বাংলা', 'Wolof', 'Cymraeg', 'ภาษาไทย', 'Latviešu', 'Kiswahili', 'български език', 'ελληνικά', 'Türkçe', 'suomi', 'Esperanto', 'Український', 'ქართული', 'Bokmål', 'No Language', 'euskera', 'Azərbaycan', 'Malti', 'اردو', 'isiZulu', 'Bamanankan', 'پښتو', 'Somali', 'ਪੰਜਾਬੀ', 'беларуская мова', 'தமிழ்', 'Galego', 'Kinyarwanda', 'қазақ', 'Eesti', 'Lietuvi\x9akai', 'Slovenščina', 'తెలుగు', 'Fulfulde', '??????', '?????', 'ozbek', 'Hausa']


Below rows had to be delted because of the incorrect data placement in these rows

In [None]:
movies.drop(movies.index[19730],inplace=True)
movies.drop(movies.index[29502],inplace=True)
movies.drop(movies.index[35585],inplace=True)

Below code prints unique list of countries in the dataset

In [24]:
countries = []

for i,row in movies.iterrows():
    if row['production_countries'] is not np.nan:
        a = row['production_countries']
        for item in a:
            if item['name'] not in countries:
                countries.append(item['name'])

print(countries)

['United States of America', 'Germany', 'United Kingdom', 'France', 'Italy', 'Spain', 'China', 'Australia', 'South Africa', 'Canada', 'Switzerland', 'Belgium', 'Japan', 'Iran', 'Netherlands', 'Hong Kong', 'Tunisia', 'Ireland', 'Dominican Republic', 'Croatia', 'Russia', 'Macedonia', 'Austria', 'Taiwan', 'New Zealand', 'Mexico', 'Poland', 'Peru', 'Cuba', 'Liechtenstein', 'Denmark', 'Portugal', 'Finland', 'Sweden', 'Argentina', 'Iceland', 'South Korea', 'Serbia', 'Hungary', 'Czech Republic', 'India', 'Brazil', 'Greece', 'Congo', 'Senegal', 'Burkina Faso', 'Romania', 'Philippines', 'Vietnam', 'Trinidad and Tobago', 'Bulgaria', 'Chile', 'Norway', 'Kazakhstan', 'Algeria', 'Luxembourg', 'Georgia', 'Ukraine', 'Botswana', 'Aruba', 'Israel', 'Turkey', 'Ecuador', 'Lebanon', 'Morocco', 'Bosnia and Herzegovina', 'Bahamas', 'Malaysia', 'Bhutan', 'Jamaica', 'Pakistan', 'Nepal', "Cote D'Ivoire", 'Thailand', 'Namibia', 'Cameroon', 'Colombia', 'Czechoslovakia', 'Uruguay', 'Slovenia', 'Libyan Arab Jamahi

Below code prints unique list of genres in the dataset

In [26]:
genres = []

for i,row in movies.iterrows():
    if row['genres'] is not np.nan:
        a = row['genres']
        for item in a:
            if item['name'] not in genres:
                genres.append(item['name'])

print(genres)

['Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music', 'Documentary', 'Western', 'TV Movie']


In [10]:
movies.drop(movies.index[19730],inplace=True)

In [17]:
movies.drop(movies.index[29502],inplace=True)

In [22]:
movies.drop(movies.index[35585],inplace=True)

In [21]:
movies.iloc[35585]

adult                     Avalanche Sharks tells the story of a bikini ...
belongs_to_collection                                              2.18548
budget                                    /zaSf5OG7V8X8gqFvly88zDdRm46.jpg
genres                   [{'name': 'Odyssey Media', 'id': 17161}, {'nam...
homepage                          [{'iso_3166_1': 'CA', 'name': 'Canada'}]
id                                                              2014-01-01
imdb_id                                                                  0
original_language                                                     82.0
original_title                    [{'iso_639_1': 'en', 'name': 'English'}]
overview                                                          Released
popularity                                           Beware Of Frost Bites
poster_path                                               Avalanche Sharks
production_companies                                                 False
production_countries     