# Cleaning the data + feature engineering

## Imports

In [1]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)

In [2]:
os.getcwd()

'/Users/ryanrappa/Desktop/dsi/film-profit-prediction/jupyter_notebooks'

In [3]:
info_file = '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/info_10k.csv'  #redacted
credits_file = '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/credits_10k.csv'  #redacted
releases_file = '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/releases_10k.csv'  #redacted

In [4]:
info_df = pd.read_csv(info_file)
credits_df = pd.read_csv(credits_file)
releases_df = pd.read_csv(releases_file)

## Cleaning + feature engineering

### Because of the way data was collected, the last row contains the headers. Fixing that:

In [5]:
info_df.columns = info_df.iloc[-1:, :].squeeze()
info_df = info_df.iloc[:-1, :]

In [6]:
credits_df.columns = credits_df.iloc[-1:, :].squeeze()
credits_df = credits_df.iloc[:-1, :]

In [7]:
releases_df.columns = releases_df.iloc[-1:, :].squeeze()
releases_df = releases_df.iloc[:-1, :]

### Appending columns with top billed actors to the credits dataframe:

In [40]:
def get_cast_member(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    
    actor (str): name of the actor corresponding to the billing 
    number in this function (lower billing number = more prominent
    role; 0 = star of the movie, 1 = costar, etc.)
    '''
    lst_of_dicts = eval(lod_str)
    billing = 5  #increment this line manually and rerun apply method for desired # of actors
    if len(lst_of_dicts) > billing:
        dict_of_interest = lst_of_dicts[billing]
        actor = dict_of_interest['name']
        return actor
    else:
        pass

In [41]:
credits_df['cast5'] = credits_df['cast'].apply(get_cast_member)

### Appending columns with director and screenwriters to the credits dataframe:

In [65]:
def get_director(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    
    (str): name of the first director from the list of dicts
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        for i in range(len(lst_of_dicts)):
            if lst_of_dicts[i]['job'] == 'Director':
                return lst_of_dicts[i]['name']
    else:
        pass

In [66]:
credits_df['director'] = credits_df['crew'].apply(get_director)

In [85]:
def get_first_writer(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    
    (str): name of the first screenwriter from the list of dicts
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        for i in range(len(lst_of_dicts)):
            if lst_of_dicts[i]['job'] == 'Screenplay':
                return lst_of_dicts[i]['name']
    else:
        pass

In [94]:
def get_second_writer(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    
    (str): name of the second screenwriter from the list of dicts
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        count = 0
        for i in range(len(lst_of_dicts)):
            if lst_of_dicts[i]['job'] == 'Screenplay' and count == 0:
                count += 1
            elif lst_of_dicts[i]['job'] == 'Screenplay' and count == 1:
                return lst_of_dicts[i]['name']
    else:
        pass

In [86]:
credits_df['writer0'] = credits_df['crew'].apply(get_first_writer)

In [95]:
credits_df['writer1'] = credits_df['crew'].apply(get_second_writer)

In [99]:
credits_df.head()

4933,id,cast,crew,cast0,cast1,cast2,cast3,cast4,cast5,director,writer0,writer1
0,3,"[{'cast_id': 5, 'character': 'Nikander', 'cred...","[{'credit_id': '52fe420dc3a36847f8000071', 'de...",Matti Pellonpää,Kati Outinen,Sakari Kuosmanen,Esko Nikkari,Kylli Köngäs,Pekka Laiho,Aki Kaurismäki,Aki Kaurismäki,
1,5,"[{'cast_id': 42, 'character': 'Ted the Bellhop...","[{'credit_id': '52fe420dc3a36847f800011b', 'de...",Tim Roth,Antonio Banderas,Jennifer Beals,Madonna,Marisa Tomei,Bruce Willis,Allison Anders,,
2,6,"[{'cast_id': 7, 'character': 'Frank Wyatt', 'c...","[{'credit_id': '52fe420dc3a36847f800023d', 'de...",Emilio Estevez,Cuba Gooding Jr.,Denis Leary,Jeremy Piven,Peter Greene,Michael DeLorenzo,Stephen Hopkins,Lewis Colick,
3,8,[],"[{'credit_id': '52fe420dc3a36847f80002b5', 'de...",,,,,,,Timo Novotny,Michael Glawogger,Timo Novotny
4,9,"[{'cast_id': 7, 'character': 'Alexandra', 'cre...","[{'credit_id': '52fe420dc3a36847f8000311', 'de...",Rita Lengyel,Milton Welsh,,,,,Marc Meyer,,


In [98]:
credits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4933 entries, 0 to 4932
Data columns (total 12 columns):
id          4933 non-null object
cast        4933 non-null object
crew        4933 non-null object
cast0       4884 non-null object
cast1       4844 non-null object
cast2       4802 non-null object
cast3       4770 non-null object
cast4       4703 non-null object
cast5       4606 non-null object
director    4908 non-null object
writer0     3701 non-null object
writer1     1715 non-null object
dtypes: object(12)
memory usage: 462.5+ KB


### Next step...

#### Seeing what some of the data looks like:

In [78]:
crew_lod = eval(credits_df.iat[250, 2])

In [79]:
for i in range(len(crew_lod)):
    print(crew_lod[i]['job'])
    print(crew_lod[i]['name'])
    print("")

Director
Lewis Milestone

Producer
Lewis Milestone

Screenplay
Harry Brown

Screenplay
Charles Lederer

Director of Photography
William H. Daniels

Editor
Philip W. Anderson

Original Music Composer
Nelson Riddle

Associate Producer
Henry W. Sanicola

Art Direction
Nicolai Remisoff

Set Decoration
Howard Bristol

Costume Design
Howard Shoup

Story
George Clayton Johnson

Story
Jack Golden Russell

Sound
M.A. Merrick

Producer's Assistant
Richard Benedict

Production Manager
Jack R. Berne

Makeup Supervisor
Gordon Bau

Assistant Director
Ray Gosnell Jr.

Title Designer
Saul Bass

Orchestrator
Arthur Morton

Conductor
Nelson Riddle

Songs
Sammy Cahn

Songs
Jimmy Van Heusen

Camera Operator
A. Lindsley Lane

