# Cleaning the data + feature engineering

## Imports

In [1]:
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)

In [2]:
os.getcwd();

In [3]:
info_file = '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/info_10k.csv'  #redacted
credits_file = '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/credits_10k.csv'  #redacted
releases_file = '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/releases_10k.csv'  #redacted

In [4]:
#info table contains basic film details like genre, budget, revenue, language, runtime, etc.
info_df = pd.read_csv(info_file)

#credits table contains cast and crew
credits_df = pd.read_csv(credits_file)

#releases table contains info on which countries each film was released in
releases_df = pd.read_csv(releases_file)

## Cleaning + adding new features

### Because of the way data was collected, the last row of each table contains the headers. Fixing that:

In [5]:
info_df.columns = info_df.iloc[-1:, :].squeeze()
info_df = info_df.iloc[:-1, :]

In [6]:
credits_df.columns = credits_df.iloc[-1:, :].squeeze()
credits_df = credits_df.iloc[:-1, :]

In [7]:
releases_df.columns = releases_df.iloc[-1:, :].squeeze()
releases_df = releases_df.iloc[:-1, :]

### Getting each film's genre:

In [8]:
def get_first_genre(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (str): name of the first genre from the list of dicts
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        for i in range(len(lst_of_dicts)):
            return lst_of_dicts[i]['name']
    else:
        pass

In [9]:
info_df['genre0'] = info_df['genres'].apply(get_first_genre)  #func in hidden cell above

### Dropping unnecessary rows and columns from info table:

In [10]:
#making sure relevant numrerical columns are int/float type
info_df = info_df.astype({"budget": float, "id": int, "revenue": float, "runtime": float})

In [11]:
#filtering for only non-adult films,
# with revenue & budget > 0,
# original language english,
# runtime >= 80 minutes,
# and genre is not documentary.

info_df = info_df[(info_df['adult'] == 'False') & \
        (info_df['budget'] > 0) & \
        (info_df['revenue'] > 0) & \
        (info_df['original_language'] == 'en') & \
        (info_df['runtime'] >= 80.0) & \
        (info_df['genre0'] != 'Documentary')]

In [12]:
#dropping columns from info_df that won't be needed for EDA
drop_cols = ['adult', 'backdrop_path', 'belongs_to_collection', 
             'genres', 'homepage', 'imdb_id', 'original_language', 'original_title', 
             'overview', 'popularity', 'poster_path', 'production_companies', 
             'production_countries', 'spoken_languages', 'status', 'tagline', 'video', 
             'vote_average', 'vote_count']

info_df = info_df.drop(drop_cols, axis=1)

### a few more cols to be dropped later for modelling, after EDA, feat. engineering, & joins: 
### ['id', 'release_date', 'runtime' (possibly), 'title', 'genre' (possibly)]

### Creating columns for (a) amount of profit or loss (revenue minus budget) and (b) whether each movie profited (binary, whether revenue > budget):

In [13]:
info_df['profit'] = info_df['revenue'] - info_df['budget']

In [14]:
info_df['made_money'] = info_df['profit'] > 0
info_df['made_money'] = info_df['made_money'].astype('int')

In [56]:
# print(info_df.info())
info_df.head()

4962,budget,id,release_date,revenue,runtime,title,genre0,profit,made_money
1,4000000.0,5,1995-12-09,4300000.0,98.0,Four Rooms,Crime,300000.0,1
2,21.0,6,1993-10-15,12.0,110.0,Judgment Night,Action,-9.0,0
5,11000000.0,11,1977-05-25,775398007.0,121.0,Star Wars,Adventure,764398007.0,1
6,94000000.0,12,2003-05-30,940335536.0,100.0,Finding Nemo,Animation,846335536.0,1
7,55000000.0,13,1994-07-06,677945399.0,142.0,Forrest Gump,Comedy,622945399.0,1


### Appending columns with top billed actors to the credits table:

In [24]:
def get_cast_member(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    actor (str): name of the actor corresponding to the billing 
    number in this function (lower billing number = more prominent
    role; 0 = star of the movie, 1 = costar, etc.)
    '''
    lst_of_dicts = eval(lod_str)
    billing = 4  #increment this line manually and rerun for desired # of actors, starting at 0
    if len(lst_of_dicts) > billing:
        dict_of_interest = lst_of_dicts[billing]
        actor = dict_of_interest['name']
        return actor
    else:
        pass

In [25]:
# DO NOT FORGET to manually increment the new col name here, e.g. 'cast0', 'cast1', etc.
credits_df['cast4'] = credits_df['cast'].apply(get_cast_member)

### Appending columns with director and screenwriters to the credits table:

In [26]:
def get_director(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (str): name of the first director from the list of dicts
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        for i in range(len(lst_of_dicts)):
            if lst_of_dicts[i]['job'] == 'Director':
                return lst_of_dicts[i]['name']
    else:
        pass

In [27]:
credits_df['director0'] = credits_df['crew'].apply(get_director)  #func in hidden cell above

In [28]:
def get_first_writer(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (str): name of the first screenwriter from the list of dicts
    '''
    lst_of_dicts = eval(lod_str)
    if len(lst_of_dicts) > 0:
        for i in range(len(lst_of_dicts)):
            if lst_of_dicts[i]['job'] == 'Screenplay' or \
            lst_of_dicts[i]['job'] == 'Writer' or \
            lst_of_dicts[i]['job'] == 'Author':
                return lst_of_dicts[i]['name']
    else:
        pass

In [29]:
credits_df['writer0'] = credits_df['crew'].apply(get_first_writer)  #func in hidden cell above

In [41]:
creds_drop_cols = ['cast', 'crew']
credits_df = credits_df.drop(creds_drop_cols, axis=1)

In [None]:
# def get_second_writer(lod_str):
#     '''
#     Function for use with pandas apply method.
#     --------
#     INPUT
#     lod_str (str): a list of dictionaries in the form of a string
#     --------
#     OUTPUT
#     (str): name of the second screenwriter from the list of dicts
#     '''
#     lst_of_dicts = eval(lod_str)
#     if len(lst_of_dicts) > 0:
#         count = 0
#         for i in range(len(lst_of_dicts)):
#             if lst_of_dicts[i]['job'] == 'Screenplay' and count == 0:
#                 count += 1
#             elif lst_of_dicts[i]['job'] == 'Screenplay' and count == 1:
#                 return lst_of_dicts[i]['name']
#     else:
#         pass

In [None]:
# credits_df['writer1'] = credits_df['crew'].apply(get_second_writer)
### decided not to get second writer becuse relatively few films have one in the data

### Dropping rows without at least 5 actors, or missing director, or missing writer:

In [47]:
credits_df = credits_df.dropna(subset=['cast4', 'director0', 'writer0'])

In [58]:
credits_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4499 entries, 0 to 4932
Data columns (total 8 columns):
id          4499 non-null object
cast0       4499 non-null object
cast1       4499 non-null object
cast2       4499 non-null object
cast3       4499 non-null object
cast4       4499 non-null object
director    4499 non-null object
writer0     4499 non-null object
dtypes: object(8)
memory usage: 316.3+ KB


### Next step...

#### Sandbox/seeing what some of the data looks like:

In [None]:
exp_credits_df = pd.read_csv(credits_file)

In [None]:
# 109, 83, 82, 5, 17, 18, 21 
# Author, Writer

crew_lod = eval(exp_credits_df.iat[21, 2])

In [None]:
for i in range(len(crew_lod)):
    print(crew_lod[i]['job'])
    print(crew_lod[i]['name'])
    print("")

In [40]:
credits_df.iat[21, 9]

'Michael Herr'