# Cleaning + feature engineering

## Imports

In [131]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)

In [2]:
os.getcwd();

In [3]:
info_file = '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/info_10k.csv'  #redacted
credits_file = '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/credits_10k.csv'  #redacted
releases_file = '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/releases_10k.csv'  #redacted

In [4]:
#info table contains basic film details like genre, budget, revenue, language, runtime, etc.
info_df = pd.read_csv(info_file)

#credits table contains cast and crew
credits_df = pd.read_csv(credits_file)

#releases table contains info on which countries each film was released in
releases_df = pd.read_csv(releases_file)

## Cleaning + new features

#### Because of the way data was collected, last row of each table has the headers. Fixing that:

In [5]:
info_df.columns = info_df.iloc[-1:, :].squeeze()
info_df = info_df.iloc[:-1, :]

In [6]:
credits_df.columns = credits_df.iloc[-1:, :].squeeze()
credits_df = credits_df.iloc[:-1, :]

In [7]:
releases_df.columns = releases_df.iloc[-1:, :].squeeze()
releases_df = releases_df.iloc[:-1, :]

### Cleaning info table

#### Getting each film's genre:

In [8]:
def get_first_genre(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (str): name of the first genre from the list of dicts
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        for i in range(len(lst_of_dicts)):
            return lst_of_dicts[i]['name']
    else:
        pass

In [9]:
info_df['genre0'] = info_df['genres'].apply(get_first_genre)  #func in hidden cell above

#### Dropping unnecessary rows and columns from info table:

In [10]:
#making sure relevant numerical columns are int/float type
info_df = info_df.astype({"budget": float, "id": int, "revenue": float, "runtime": float})

In [11]:
#filtering for only non-adult films,
# with revenue & budget > 0,
# original language english,
# runtime >= 80 minutes,
# and genre is not documentary.

info_df = info_df[(info_df['adult'] == 'False') & \
        (info_df['budget'] > 0) & \
        (info_df['revenue'] > 0) & \
        (info_df['original_language'] == 'en') & \
        (info_df['runtime'] >= 80.0) & \
        (info_df['genre0'] != 'Documentary')]

In [12]:
#dropping columns from info_df that won't be needed for EDA
drop_cols = ['adult', 'backdrop_path', 'belongs_to_collection', 
             'genres', 'homepage', 'imdb_id', 'original_language', 'original_title', 
             'overview', 'popularity', 'poster_path', 'production_companies', 
             'production_countries', 'spoken_languages', 'status', 'tagline', 'video', 
             'vote_average', 'vote_count']

info_df = info_df.drop(drop_cols, axis=1)

### a few more cols to be dropped later for modelling, after EDA, feat. engineering, & joins: 
### ['id', 'release_date', 'runtime' (possibly), 'title', 'genre' (possibly)]

#### Creating columns for (a) amount of profit or loss (revenue minus budget) and (b) whether each movie profited (binary, whether revenue > budget):

In [13]:
info_df['profit'] = info_df['revenue'] - info_df['budget']

In [14]:
info_df['made_money'] = info_df['profit'] > 0
info_df['made_money'] = info_df['made_money'].astype('int')

In [56]:
# print(info_df.info())
info_df.head()

4962,budget,id,release_date,revenue,runtime,title,genre0,profit,made_money
1,4000000.0,5,1995-12-09,4300000.0,98.0,Four Rooms,Crime,300000.0,1
2,21.0,6,1993-10-15,12.0,110.0,Judgment Night,Action,-9.0,0
5,11000000.0,11,1977-05-25,775398007.0,121.0,Star Wars,Adventure,764398007.0,1
6,94000000.0,12,2003-05-30,940335536.0,100.0,Finding Nemo,Animation,846335536.0,1
7,55000000.0,13,1994-07-06,677945399.0,142.0,Forrest Gump,Comedy,622945399.0,1


### Cleaning credits table

#### Appending columns with top billed actors to the credits table:

In [24]:
def get_cast_member(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    actor (str): name of the actor corresponding to the billing 
    number in this function (lower billing number = more prominent
    role; 0 = star of the movie, 1 = costar, etc.)
    '''
    lst_of_dicts = eval(lod_str)
    billing = 4  #increment this line manually and rerun for desired # of actors, starting at 0
    if len(lst_of_dicts) > billing:
        dict_of_interest = lst_of_dicts[billing]
        actor = dict_of_interest['name']
        return actor
    else:
        pass

In [25]:
# DO NOT FORGET to manually increment the new col name here, e.g. 'cast0', 'cast1', etc.
credits_df['cast4'] = credits_df['cast'].apply(get_cast_member)

#### Appending columns with director and screenwriters to the credits table:

In [26]:
def get_director(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (str): name of the first director from the list of dicts
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        for i in range(len(lst_of_dicts)):
            if lst_of_dicts[i]['job'] == 'Director':
                return lst_of_dicts[i]['name']
    else:
        pass

In [27]:
credits_df['director0'] = credits_df['crew'].apply(get_director)  #func in hidden cell above

In [28]:
def get_first_writer(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (str): name of the first screenwriter from the list of dicts
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        for i in range(len(lst_of_dicts)):
            if lst_of_dicts[i]['job'] == 'Screenplay' or \
            lst_of_dicts[i]['job'] == 'Writer' or \
            lst_of_dicts[i]['job'] == 'Author':
                return lst_of_dicts[i]['name']
    else:
        pass

In [29]:
credits_df['writer0'] = credits_df['crew'].apply(get_first_writer)  #func in hidden cell above

In [41]:
creds_drop_cols = ['cast', 'crew']
credits_df = credits_df.drop(creds_drop_cols, axis=1)

In [99]:
#making sure id column is int so dataframes can be merged later
credits_df = credits_df.astype({"id": int})

In [None]:
# def get_second_writer(lod_str):
#     '''
#     Function for use with pandas apply method.
#     --------
#     INPUT
#     lod_str (str): a list of dictionaries in the form of a string
#     --------
#     OUTPUT
#     (str): name of the second screenwriter from the list of dicts
#     '''
#     lst_of_dicts = eval(lod_str)
#     if len(lst_of_dicts) > 0:
#         count = 0
#         for i in range(len(lst_of_dicts)):
#             if lst_of_dicts[i]['job'] == 'Screenplay' and count == 0:
#                 count += 1
#             elif lst_of_dicts[i]['job'] == 'Screenplay' and count == 1:
#                 return lst_of_dicts[i]['name']
#     else:
#         pass

In [None]:
# credits_df['writer1'] = credits_df['crew'].apply(get_second_writer)
### decided not to get second writer becuse relatively few films have one in the data

#### Dropping rows without at least 5 actors, or missing director, or missing writer:

In [47]:
credits_df = credits_df.dropna(subset=['cast4', 'director0', 'writer0'])

### Cleaning releases table

#### Getting number of countries each film was released in and dropping nulls:

In [75]:
def get_releases_count(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (int): number of countries/territories the film was released in
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        return len(lst_of_dicts)
    else:
        pass

In [76]:
releases_df['releases'] = releases_df['countries'].apply(get_releases_count)  #func in hidden cell above

In [80]:
releases_df = releases_df.dropna()

In [86]:
releases_df = releases_df.drop('countries', axis=1)

In [100]:
#making sure id column is int so dataframes can be merged later
releases_df = releases_df.astype({"id": int})

### Joining the tables

#### Note: when the keys to join on are pd columns, not indices, better to use merge method

In [101]:
step1 = info_df.merge(credits_df, on='id', how='inner')

In [148]:
df = step1.merge(releases_df, on='id', how='inner')

### Checking for outliers/faulty data (HAVE TO REEXAMINE THIS ON FULL DATA)

In [129]:
pd.set_option('float_format', '{:f}'.format)  #so that floats will print w/o scientific notat.

In [149]:
df.describe()  #checking for outliers/faulty data, looks like we have an issue in budget/revenue

Unnamed: 0,budget,id,revenue,runtime,profit,made_money,releases
count,1546.0,1546.0,1546.0,1546.0,1546.0,1546.0,1546.0
mean,38622310.793661,4280.022639,128294032.535576,114.152005,89671721.741915,0.801423,17.402975
std,40247799.140231,3758.514757,173168777.950504,21.962564,150251482.734078,0.399058,17.390917
min,1.0,5.0,5.0,80.0,-98301101.0,0.0,1.0
25%,10000000.0,790.25,22000000.0,99.0,3019205.0,1.0,6.0
50%,26000000.0,2469.5,65730000.0,110.0,36760682.0,1.0,10.0
75%,55000000.0,8871.75,163718070.25,124.0,115568858.5,1.0,19.0
max,380000000.0,9992.0,1845034188.0,248.0,1645034188.0,1.0,118.0


In [150]:
df = df[df['budget'] > 1000] #dropping rows with faulty budget numbers

In [166]:
df = df[df['revenue'] > 10300] #dropping rows with faulty revenue numbers

In [167]:
df.describe()  #much better (the 12k revenue and 12k budget are not mistakes)

Unnamed: 0,budget,id,revenue,runtime,profit,made_money,releases
count,1535.0,1535.0,1535.0,1535.0,1535.0,1535.0,1535.0
mean,38841753.859935,4281.883388,129138410.224104,114.202606,90296656.364169,0.803909,17.463844
std,40272976.340912,3758.806706,173477195.271823,21.999348,150567591.003754,0.397168,17.426354
min,12000.0,5.0,12000.0,80.0,-98301101.0,0.0,1.0
25%,10000000.0,791.0,22430661.5,99.0,3684759.5,1.0,6.0
50%,26000000.0,2486.0,66000000.0,110.0,37410000.0,1.0,10.0
75%,55000000.0,8873.0,163938407.5,124.0,118302504.5,1.0,19.0
max,380000000.0,9992.0,1845034188.0,248.0,1645034188.0,1.0,118.0


In [169]:
df.head()

Unnamed: 0,budget,id,release_date,revenue,runtime,title,genre0,profit,made_money,cast0,cast1,cast2,cast3,cast4,director,writer0,releases
0,4000000.0,5,1995-12-09,4300000.0,98.0,Four Rooms,Crime,300000.0,1,Tim Roth,Antonio Banderas,Jennifer Beals,Madonna,Marisa Tomei,Allison Anders,Allison Anders,30.0
2,11000000.0,11,1977-05-25,775398007.0,121.0,Star Wars,Adventure,764398007.0,1,Mark Hamill,Harrison Ford,Carrie Fisher,Peter Mayhew,Anthony Daniels,George Lucas,George Lucas,67.0
3,94000000.0,12,2003-05-30,940335536.0,100.0,Finding Nemo,Animation,846335536.0,1,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe,Brad Garrett,Andrew Stanton,Andrew Stanton,62.0
4,55000000.0,13,1994-07-06,677945399.0,142.0,Forrest Gump,Comedy,622945399.0,1,Tom Hanks,Robin Wright,Gary Sinise,Mykelti Williamson,Sally Field,Robert Zemeckis,Eric Roth,39.0
5,15000000.0,14,1999-09-15,356296601.0,122.0,American Beauty,Drama,341296601.0,1,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley,Mena Suvari,Sam Mendes,Alan Ball,54.0


### Next step... get cast & crew lifetime revenue, profits, # of films, # of profitable films, see notes for other feature ideas

#### Sandbox/seeing what some of the data looks like:

In [None]:
exp_credits_df = pd.read_csv(credits_file)

In [None]:
# 109, 83, 82, 5, 17, 18, 21 
# Author, Writer

crew_lod = eval(exp_credits_df.iat[21, 2])

In [None]:
for i in range(len(crew_lod)):
    print(crew_lod[i]['job'])
    print(crew_lod[i]['name'])
    print("")

In [40]:
credits_df.iat[21, 9]

'Michael Herr'

In [117]:
exp_info_df = pd.read_csv(info_file)