### import libraries

In [1]:
import json
import pandas as pd
import numpy as np

### load data

In [2]:
# Load the scraped movie records. The context manager guarantees the file
# handle is closed as soon as parsing finishes (the bare open() call leaked it).
with open('movie_info.json', 'r') as json_file:
    movie_info_list = json.load(json_file)

In [3]:
# Peek at the first record to see which fields exist and how their values are shaped.
movie_info_list[0]

{'title': 'Snow White and the Seven Dwarfs',
 'Directed by': ['David Hand',
  'Perce Pearce',
  'William Cottrell',
  'Larry Morey',
  'Wilfred Jackson',
  'Ben Sharpsteen'],
 'Story by': ['Ted Sears',
  'Richard Creedon',
  'Otto Englander',
  'Dick Rickard',
  'Earl Hurd',
  'Merrill De Maris',
  'Dorothy Ann Blank',
  'Webb Smith'],
 'Based on': ['"', 'Snow White', '"', 'by the', 'Brothers Grimm'],
 'Produced by': 'Walt Disney',
 'Starring': ['Adriana Caselotti',
  'Roy Atwell',
  'Pinto Colvig',
  'Otis Harlan',
  'Scotty Mattraw',
  'Billy Gilbert',
  'Eddie Collins'],
 'Music by': ['Frank Churchill', 'Leigh Harline', 'Paul Smith'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )',
  'February 4, 1938 (United States)'],
 'Running time': '83 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$1.5 million',
 'Box office': '$418 million'}

### convert running time into integer

In [4]:
def minutes_to_integer(running_time):
    """Parse a running-time value such as ``"83 minutes"`` into whole minutes.

    Parameters
    ----------
    running_time : str, list of str, or None
        Raw value of the 'Running time' field. When a list is given only its
        first entry is used. ``"N/A"``, ``None``, an empty list and an
        empty/blank string are all treated as "missing".

    Returns
    -------
    int or None
        The leading integer of the value, or ``None`` when the value is missing.

    Raises
    ------
    ValueError
        If the first whitespace-separated token is not an integer.
    """
    if isinstance(running_time, list):
        # Several running times may be listed; keep the first one.
        running_time = running_time[0] if running_time else None

    if running_time is None or running_time == "N/A":
        return None

    # split() with no argument splits on any whitespace (including
    # non-breaking spaces) and ignores leading/trailing blanks.
    tokens = running_time.split()
    if not tokens:
        return None
    return int(tokens[0])


In [5]:
# Attach the parsed running time to every record; records without a
# 'Running time' field are passed through as "N/A".
for movie in movie_info_list:
    raw_running_time = movie.get('Running time', "N/A")
    movie['Running time (int)'] = minutes_to_integer(raw_running_time)

In [6]:
# Print the parsed running time of every movie; None appears where the
# 'Running time (int)' key is absent or was set to None.
print([movie.get('Running time (int)') for movie in movie_info_list])


[83, 88, 126, 74, 64, 70, 42, 65, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 93, 76, 75, 73, 85, 81, 70, 90, 80, 75, 84, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 73, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 131, 92, 87, 116, 93, 114, 110, 131, 101, 110, 84, 78, 75, 164, 106, 110, 99, 113, 108, 102, 85, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, None, 91, 112, 115, 95, 91, 97, 104, 74, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 112, 84, 97, 97, 114, 96, 97, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 90, 74, 90, 89, 110, 74, 93, 84, 83, 69, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 109, 94, 106, 102, 69, 88, 102, 102, 97, 111, 92, 100, 96, 96, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 81, 101, 104, 103, 86, 106, 74, 93, 92, 98, 76, 95, 72, 93, 87, 70, 93, 87, 12

### convert budget and box office to numbers

In [7]:
# Print each raw 'Budget' value (None when the key is absent) to survey the
# different formats the parser has to handle.
for movie in movie_info_list:
    print(movie.get('Budget'))

$1.5 million
$2.6 million
$2.28 million
$600,000
$950,000
$858,000
None
$788,000
None
$1.35 million
$2.125 million
None
$1.5 million
$1.5 million
None
$2.2 million
$1.8 million
$3 million
None
$4 million
$2 million
$300,000
$1.8 million
None
$5 million
None
$4 million
None
None
None
None
None
None
$700,000
None
None
None
None
None
$6 million
under $1 million or $1,250,000
None
$2 million
None
None
$2.5 million
None
None
$4 million
$3.6–4 million
None
None
None
None
$3 million
None
$3 million
None
None
None
None
None
None
None
None
None
$3 million
None
None
None
None
$4.4–6 million
None
None
None
None
None
None
None
None
None
None
None
$4 million
None
$5 million
None
None
None
None
$5 million
None
None
None
None
None
None
$4 million
None
None
None
$6.3 million
None
None
None
None
None
None
None
None
$5 million
None
None
None
None
$8 million
None
None
None
None
None
None
AU$1 million
None
None
None
None
$5 million
None
None
$7.5 million
None
$10 million
None
None
$3.5 to 4 million
None
N

In [8]:
import re

# Magnitude words recognised after a number (matched case-insensitively).
amounts = r"thousand|million|billion"
# A number: digits, optional ",ddd" thousands groups, optional decimal part.
# At most one decimal point is accepted so the match is always float()-parsable.
number = r"\d+(?:,\d{3})*\.?\d*"

# "$<number>[<separator>][<number>] <magnitude>", e.g. "$1.5 million",
# "$3.6–4 million", "$3.5 to 4 million". Whitespace around the separator and
# before the magnitude word is optional.
word_re = rf"\${number}\s*(?:-|–|—|to)?\s*(?:{number})?\s*(?:{amounts})"
# Plain "$<number>", e.g. "$790,000".
value_re = rf"\${number}"

# Multiplier for each magnitude word; built once instead of on every call.
_WORD_VALUES = {"thousand": 1000, "million": 1000000, "billion": 1000000000}


def word_to_value(word):
    """Return the multiplier for a lower-case magnitude word.

    Raises KeyError for any word other than "thousand", "million" or "billion".
    """
    return _WORD_VALUES[word]


def parse_word_syntax(string):
    """Convert a string like "$12.2 million" to a float.

    The first number found in ``string`` is multiplied by the first magnitude
    word found, so for a range such as "$3.6–4 million" the lower bound is used.
    """
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    return value * word_to_value(word)


def parse_value_syntax(string):
    """Convert a string like "$790,000" to a float (thousands commas removed)."""
    value_string = re.search(number, string).group()
    return float(value_string.replace(",", ""))


def money_conversion(money):
    """Convert a raw money field to a float amount.

    Examples
    --------
    money_conversion("$12.2 million") --> 12200000   (word syntax)
    money_conversion("$790,000")      --> 790000     (value syntax)

    Parameters
    ----------
    money : str, list of str, or None
        Raw field value. When a list is given only its first entry is used.

    Returns
    -------
    float or None
        The parsed amount, or ``None`` when the value is missing ("N/A",
        ``None``, empty list) or contains no "$<number>" pattern. The currency
        prefix is not interpreted: "AU$1 million" parses as 1000000.0.
    """
    if isinstance(money, list):
        # Several figures may be listed; keep the first one.
        money = money[0] if money else None

    if money is None or money == "N/A":
        return None

    # Word syntax is tried first because a word-syntax string always contains
    # a value-syntax prefix as well ("$1.5" inside "$1.5 million").
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None

In [9]:
# Attach numeric versions of the two money fields to every record; a missing
# field is passed through as "N/A".
for movie in movie_info_list:
    raw_budget = movie.get('Budget', "N/A")
    raw_box_office = movie.get('Box office', "N/A")
    movie['Budget (float)'] = money_conversion(raw_budget)
    movie['Box office (float)'] = money_conversion(raw_box_office)

In [10]:
# Spot-check the parser on the first record's budget (coerced to str first).
money_conversion(str(movie_info_list[0]["Budget"]))

1500000.0

### convert dates into datetime objects

In [11]:
# Print every raw 'Release date' value, with 'N/A' where that key is absent.
# NOTE(review): the first record displayed earlier uses the plural key
# 'Release dates', so such records show up as 'N/A' here.
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

['N/A', 'N/A', ['November 13, 1940'], ['June 27, 1941'], 'N/A', 'N/A', 'N/A', ['July 17, 1943'], 'N/A', 'N/A', 'N/A', ['September 27, 1947'], 'May 27, 1948', 'N/A', ['October 5, 1949'], 'N/A', 'N/A', 'N/A', 'N/A', ['February 5, 1953'], ['July 23, 1953 (United States)'], ['November 10, 1953'], 'N/A', ['August 17, 1954'], ['December 23, 1954'], 'May 25, 1955', ['June 22, 1955'], ['September 14, 1955'], 'December 22, 1955', 'June 8, 1956', ['July 18, 1956'], ['September 4, 1956'], ['December 20, 1956'], 'June 19, 1957', 'August 28, 1957', ['December 25, 1957'], ['July 8, 1958'], ['August 12, 1958'], ['December 25, 1958'], ['January 29, 1959'], ['March 19, 1959'], 'N/A', ['November 10, 1959'], 'January 21, 1960 ( Sarasota, FL )', ['February 24, 1960'], 'May 19, 1960', 'N/A', ['November 1, 1960'], ['December 21, 1960'], ['January 25, 1961'], 'March 16, 1961', ['June 21, 1961'], ['July 12, 1961'], ['July 17, 1961'], ['December 14, 1961'], 'April 5, 1962', ['May 17, 1962'], ['June 6, 1962'], 

In [12]:
from datetime import datetime

# Raw 'Release date' values ('N/A' when the key is absent).
# NOTE(review): `dates` is not referenced again in the visible cells — confirm
# whether it is still needed.
dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    """Drop everything from the first "(" onwards and trim whitespace.

    "July 23, 1953 (United States)" --> "July 23, 1953"
    """
    return date.split("(")[0].strip()


def date_conversion(date):
    """Convert a raw release-date value to a ``datetime``.

    Parameters
    ----------
    date : str, list of str, or None
        Raw field value. When a list is given only its first entry is used.

    Returns
    -------
    datetime.datetime or None
        The parsed date, or ``None`` when the value is missing ("N/A",
        ``None``, empty list) or matches neither "Month D, YYYY" nor
        "D Month YYYY".
    """
    if isinstance(date, list):
        # Several dates may be listed; keep the first one.
        date = date[0] if date else None

    if date is None or date == "N/A":
        return None

    date_str = clean_date(date)

    for fmt in ("%B %d, %Y", "%d %B %Y"):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # Format mismatch only; other errors (e.g. KeyboardInterrupt) are
            # no longer swallowed as they were by the bare `except`.
            continue
    return None


In [13]:
# Some records store the field under the plural key 'Release dates' (the first
# record does); fall back to it so those movies are not silently left with
# None. date_conversion uses the first entry when the value is a list.
for movie in movie_info_list:
    raw_release_date = movie.get('Release date', movie.get('Release dates', 'N/A'))
    movie['Release date (datetime)'] = date_conversion(raw_release_date)

In [14]:
# Inspect one record to confirm the derived fields were added.
movie_info_list[50]

{'title': 'The Absent-Minded Professor',
 'Directed by': 'Robert Stevenson',
 'Screenplay by': 'Bill Walsh',
 'Based on': ['"A Situation of Gravity"', 'by', 'Samuel W. Taylor'],
 'Produced by': ['Walt Disney'],
 'Starring': ['Fred MacMurray',
  'Nancy Olson',
  'Keenan Wynn',
  'Tommy Kirk',
  'Leon Ames',
  'Elliott Reid',
  'Edward Andrews',
  'Wally Brown',
  'Alan Carney',
  'Forrest Lewis',
  'James Westerfield',
  'Ed Wynn'],
 'Cinematography': 'Edward Colman',
 'Edited by': 'Cotton Warburton',
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'March 16, 1961',
 'Running time': '97 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$25.3 million',
 'Running time (int)': 97,
 'Budget (float)': None,
 'Box office (float)': 25300000.0,
 'Release date (datetime)': datetime.datetime(1961, 3, 16, 0, 0)}

In [15]:
import pickle


In [17]:
# Persist the records, including the derived fields added above, as a pickle.
with open('data.pkl', 'wb') as pickle_file:
    pickle.dump(movie_info_list, pickle_file)