# Drafting functions/workflow for data cleaning + feature engineering, to be used on full data

## Imports

In [2]:
import ast
import os

import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 100)

In [3]:
os.getcwd();

In [4]:
# Directory holding the three raw CSV files. It can be overridden by setting the
# FILM_DATA_DIR environment variable; the default is the original local path, so the
# resulting file paths are unchanged when the variable is not set.
data_dir = os.environ.get(
    'FILM_DATA_DIR',
    '/Users/ryanrappa/Desktop/dsi/film-profit-prediction/csv_files/for_prelim_cleaning')

info_file = os.path.join(data_dir, 'info_10k.csv')
credits_file = os.path.join(data_dir, 'credits_10k.csv')
releases_file = os.path.join(data_dir, 'releases_10k.csv')

In [5]:
# NOTE(review): pd.read_csv treats the first line of each file as the header by default.
# The notebook states that the real header is in the LAST row of each file; if the first
# line is a film rather than a header, that film is being consumed as column labels here.
# TODO confirm against the raw files (reading with header=None would keep that row).

#info table contains basic film details like genre, budget, revenue, language, runtime, etc.
info_df = pd.read_csv(info_file)

#credits table contains cast and crew
credits_df = pd.read_csv(credits_file)

#releases table contains info on which countries each film was released in
releases_df = pd.read_csv(releases_file)

## Cleaning + new features

#### Because of the way data was collected, last row of each table has the headers. Fixing that:

In [6]:
# Promote the final row (which holds the column names) to the header, then drop it
# from the data. copy() makes the result an independent frame, so adding columns to
# it later does not trigger pandas' SettingWithCopyWarning.
info_df.columns = info_df.iloc[-1]
info_df = info_df.iloc[:-1].copy()

In [7]:
# Promote the final row (which holds the column names) to the header, then drop it
# from the data. copy() makes the result an independent frame, so adding columns to
# it later does not trigger pandas' SettingWithCopyWarning.
credits_df.columns = credits_df.iloc[-1]
credits_df = credits_df.iloc[:-1].copy()

In [8]:
# Promote the final row (which holds the column names) to the header, then drop it
# from the data. copy() makes the result an independent frame, so adding columns to
# it later does not trigger pandas' SettingWithCopyWarning.
releases_df.columns = releases_df.iloc[-1]
releases_df = releases_df.iloc[:-1].copy()

### Cleaning info table

#### Getting each film's genre:

In [9]:
def get_first_genre(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (str): name of the first genre from the list of dicts, or None when the
    list is empty or the value is not a string (e.g. a missing cell / NaN)
    '''
    # A missing cell arrives as NaN (a float), which cannot be parsed.
    if not isinstance(lod_str, str):
        return None
    # literal_eval only accepts Python literals, so text coming from the CSV is
    # parsed without being executed as code (which eval would do).
    lst_of_dicts = ast.literal_eval(lod_str)
    if not lst_of_dicts:
        return None
    return lst_of_dicts[0]['name']

In [10]:
info_df['genre'] = info_df['genres'].apply(get_first_genre)  #func in cell above

#### Dropping unnecessary rows and columns from info table:

In [11]:
#making sure relevant numerical columns are int/float type
info_df = info_df.astype({"budget": float, "id": int, "revenue": float, "runtime": float})

In [12]:
# Keep a film only when every one of these holds:
#   - it is not flagged as adult (the flag is compared as the string 'False'),
#   - budget and revenue are both greater than 0,
#   - the original language is English ('en'),
#   - runtime is at least 80 minutes,
#   - its genre is not 'Documentary'.
keep_mask = (
    (info_df['adult'] == 'False')
    & (info_df['budget'] > 0)
    & (info_df['revenue'] > 0)
    & (info_df['original_language'] == 'en')
    & (info_df['runtime'] >= 80.0)
    & (info_df['genre'] != 'Documentary')
)

info_df = info_df[keep_mask]

In [13]:
# Columns of info_df that are not needed for EDA and are removed here.
drop_cols = [
    'adult', 'backdrop_path', 'belongs_to_collection',
    'genres', 'homepage', 'imdb_id', 'original_language', 'original_title',
    'overview', 'popularity', 'poster_path', 'production_companies',
    'production_countries', 'spoken_languages', 'status', 'tagline', 'video',
    'vote_average', 'vote_count',
]

info_df = info_df.drop(columns=drop_cols)

### a few more cols to be dropped later for modelling, after EDA, feat. engineering, & joins: 
### ['id', 'release_date', 'runtime' (possibly), 'title', 'genre' (possibly)]

#### Creating columns for (a) amount of profit or loss (revenue minus budget) and (b) whether each movie profited (binary, whether revenue > budget):

In [14]:
info_df['profit'] = info_df['revenue'] - info_df['budget']

In [15]:
# Binary target: 1 when profit is strictly positive, otherwise 0.
info_df['made_money'] = (info_df['profit'] > 0).astype('int')

In [16]:
#converting release date column from object type to datetime type
info_df['release_date'] = pd.to_datetime(info_df['release_date'])

### Cleaning credits table

#### Appending columns with top billed actors to the credits table:

In [17]:
def get_top_cast(df, lod_col, billings):
    '''
    Adds one column per billing position holding that cast member's name.
    --------
    INPUT
    df: pandas dataframe w/ stringified lists of dictionaries containing cast
    lod_col (str): name of col w/ stringified lists of dictionaries
    billings (int): number of cast members to extract from each lod
    --------
    OUTPUT
    original df (modified in place and returned) w/ new columns 'cast_0' ...
    'cast_<billings-1>' having actor names corresponding to each billing
    number (lower billing number = more prominent role; 0 = star of the
    movie, 1 = costar, etc.). A film with fewer cast members than a given
    billing number gets NaN in that column; every column is always created,
    even if no film has that many cast members.
    '''
    def _parse(cell):
        # literal_eval parses the stringified list without executing it as code;
        # a non-string cell (e.g. NaN for a missing value) is treated as "no cast".
        return ast.literal_eval(cell) if isinstance(cell, str) else []

    # Parse each row once and work positionally, so the result does not depend on
    # the dataframe having a default 0..n-1 index.
    casts = [_parse(cell) for cell in df[lod_col]]

    for billing in range(billings):
        df['cast_' + str(billing)] = [
            cast[billing]['name'] if len(cast) > billing else np.nan
            for cast in casts
        ]
    return df

In [18]:
credits_df = get_top_cast(credits_df, 'cast', 5)  #func in cell above

#### Appending columns with director and screenwriters to the credits table:

In [19]:
def get_director(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (str): name of the first director from the list of dicts, or None when
    no entry has job 'Director' or the value is not a string (e.g. NaN)
    '''
    # A missing cell arrives as NaN (a float), which cannot be parsed.
    if not isinstance(lod_str, str):
        return None
    # literal_eval parses the stringified list without executing it as code.
    for crew_member in ast.literal_eval(lod_str):
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return None

In [20]:
credits_df['director'] = credits_df['crew'].apply(get_director)  #func in cell above

In [21]:
def get_first_writer(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (str): name of the first screenwriter from the list of dicts (the first
    entry whose job is 'Screenplay', 'Writer' or 'Author'), or None when there
    is no such entry or the value is not a string (e.g. NaN)
    '''
    writer_jobs = ('Screenplay', 'Writer', 'Author')
    # A missing cell arrives as NaN (a float), which cannot be parsed.
    if not isinstance(lod_str, str):
        return None
    # literal_eval parses the stringified list without executing it as code.
    for crew_member in ast.literal_eval(lod_str):
        if crew_member['job'] in writer_jobs:
            return crew_member['name']
    return None

In [22]:
credits_df['writer'] = credits_df['crew'].apply(get_first_writer)  #func in cell above

In [23]:
#decided not to get second writer because relatively few films have one in the dataset

In [24]:
creds_drop_cols = ['cast', 'crew']
credits_df = credits_df.drop(creds_drop_cols, axis=1)

In [25]:
#making sure id column is int so dataframes can be merged later
credits_df = credits_df.astype({"id": int})

#### Dropping rows without at least 5 actors, or missing director, or missing writer:

In [26]:
credits_df = credits_df.dropna(subset=['cast_4', 'director', 'writer'])

### Cleaning releases table

#### Getting number of countries each film was released in and dropping nulls:

In [27]:
def get_releases_count(lod_str):
    '''
    Function for use with pandas apply method.
    --------
    INPUT
    lod_str (str): a list of dictionaries in the form of a string
    --------
    OUTPUT
    (int): number of countries/territories the film was released in, or None
    when the list is empty or the value is not a string (e.g. NaN)
    '''
    # A missing cell arrives as NaN (a float), which cannot be parsed.
    if not isinstance(lod_str, str):
        return None
    # literal_eval parses the stringified list without executing it as code.
    lst_of_dicts = ast.literal_eval(lod_str)
    if not lst_of_dicts:
        # an empty list is reported as "no value" rather than as a count of 0
        return None
    return len(lst_of_dicts)

In [28]:
releases_df['releases'] = releases_df['countries'].apply(get_releases_count)  #func in cell above

In [29]:
releases_df = releases_df.dropna()

In [30]:
releases_df = releases_df.drop('countries', axis=1)

In [31]:
#making sure id column is int so dataframes can be merged later
releases_df = releases_df.astype({"id": int})

### Joining the tables

#### Note: when the keys to join on are pd columns, not indices, better to use merge method

In [32]:
step1 = info_df.merge(credits_df, on='id', how='inner')

In [33]:
df = step1.merge(releases_df, on='id', how='inner')

### Checking for outliers/faulty data (HAVE TO REEXAMINE THIS ON FULL DATA)

In [34]:
pd.set_option('float_format', '{:f}'.format)  #so that floats will print w/o scientific notat.

In [35]:
df.describe()  #checking for outliers/faulty data, looks like we have an issue in budget/revenue

Unnamed: 0,budget,id,revenue,runtime,profit,made_money,releases
count,1546.0,1546.0,1546.0,1546.0,1546.0,1546.0,1546.0
mean,38622310.793661,4280.022639,128294032.535576,114.152005,89671721.741915,0.801423,17.402975
std,40247799.140231,3758.514757,173168777.950504,21.962564,150251482.734078,0.399058,17.390917
min,1.0,5.0,5.0,80.0,-98301101.0,0.0,1.0
25%,10000000.0,790.25,22000000.0,99.0,3019205.0,1.0,6.0
50%,26000000.0,2469.5,65730000.0,110.0,36760682.0,1.0,10.0
75%,55000000.0,8871.75,163718070.25,124.0,115568858.5,1.0,19.0
max,380000000.0,9992.0,1845034188.0,248.0,1645034188.0,1.0,118.0


In [36]:
df = df[df['budget'] > 1000] #dropping rows with faulty budget numbers

In [37]:
df = df[df['revenue'] > 10300] #dropping rows with faulty revenue numbers

In [38]:
df.describe()  #much better (the 12k revenue and 12k budget are not mistakes)

Unnamed: 0,budget,id,revenue,runtime,profit,made_money,releases
count,1535.0,1535.0,1535.0,1535.0,1535.0,1535.0,1535.0
mean,38841753.859935,4281.883388,129138410.224104,114.202606,90296656.364169,0.803909,17.463844
std,40272976.340912,3758.806706,173477195.271823,21.999348,150567591.003754,0.397168,17.426354
min,12000.0,5.0,12000.0,80.0,-98301101.0,0.0,1.0
25%,10000000.0,791.0,22430661.5,99.0,3684759.5,1.0,6.0
50%,26000000.0,2486.0,66000000.0,110.0,37410000.0,1.0,10.0
75%,55000000.0,8873.0,163938407.5,124.0,118302504.5,1.0,19.0
max,380000000.0,9992.0,1845034188.0,248.0,1645034188.0,1.0,118.0


In [39]:
# resetting index for easier for-looping later
# drop=True discards the old index directly instead of first turning it into a
# column and then dropping a column named 'index' (which fails if the index has a name).
df = df.reset_index(drop=True)

### Getting cast & crew lifetime revenue, profits, # of films, # of profitable films

In [40]:
def get_cast_metrics(df, cast_cols):
    '''
    Iterates over columns of cast members
    and generates new columns with the total
    revenue, profit, number of movies, and number of
    profitable movies that cast collectively has been in
    up until the date of release in each row.
    --------
    INPUT
    df: dataframe with cast columns plus 'release_date', 'revenue',
        'profit' and 'made_money' columns
    cast_cols: list of column names (str) with the cast
    --------
    OUTPUT
    copy of df (the input frame is not modified) with four new float columns:
    'cast_rev', 'cast_prof', 'cast_films', 'cast_prof_films'. Each is the sum,
    over the cast members listed in cast_cols for that row, of that member's
    totals across films with a strictly earlier release_date in which the
    member appears in any of the cast_cols.
    '''
    cast_cols = list(cast_cols)
    out = df.copy()
    release_dates = out['release_date']
    cast_frame = out[cast_cols]

    n_rows = len(out)
    cast_rev = np.zeros(n_rows)
    cast_prof = np.zeros(n_rows)
    cast_films = np.zeros(n_rows)
    cast_prof_films = np.zeros(n_rows)

    for col_name in cast_cols:
        # Rows are visited by position, so any index (not only 0..n-1) works.
        for pos, (name, release_date) in enumerate(zip(out[col_name], release_dates)):
            # The actor may be billed in any of the cast columns of an earlier film.
            in_cast = cast_frame.eq(name).any(axis=1)
            earlier_films = out[(release_dates < release_date) & in_cast]

            cast_rev[pos] += earlier_films['revenue'].sum()
            cast_prof[pos] += earlier_films['profit'].sum()
            cast_films[pos] += earlier_films['made_money'].count()
            cast_prof_films[pos] += earlier_films['made_money'].sum()

    out['cast_rev'] = cast_rev
    out['cast_prof'] = cast_prof
    out['cast_films'] = cast_films
    out['cast_prof_films'] = cast_prof_films

    return out

In [41]:
cast_cols = ['cast_0', 'cast_1', 'cast_2', 'cast_3', 'cast_4']

#this may take a while to run, reason to use Spark on full data:
df = get_cast_metrics(df, cast_cols)  #func from cell above

In [42]:
def get_director_metrics(df, director_col):
    '''
    Iterates over column with director
    and generates new columns with the total
    revenue, profit, number of movies, and number of
    profitable movies the director has directed
    up until the date of release for each row.
    --------
    INPUT
    df: dataframe with director column plus 'release_date', 'revenue',
        'profit' and 'made_money' columns
    director_col: column name (str) with director
    --------
    OUTPUT
    df itself (modified in place and returned) with four new float columns:
    'dir_rev', 'dir_prof', 'dir_films', 'dir_prof_films', each computed over
    films by the same director with a strictly earlier release_date.
    '''
    directors = df[director_col]
    release_dates = df['release_date']

    rev, prof, films, prof_films = [], [], [], []
    # Rows are visited by position, so any index (not only 0..n-1) works.
    for name, release_date in zip(directors, release_dates):
        #1. getting a dataframe with director's movies to date only
        earlier_films = df[(release_dates < release_date) & (directors == name)]

        #2. collecting the director's lifetime rev, prof, etc. to date
        rev.append(earlier_films['revenue'].sum())
        prof.append(earlier_films['profit'].sum())
        films.append(earlier_films['made_money'].count())
        prof_films.append(earlier_films['made_money'].sum())

    # The columns are assigned once, after the loop, as floats.
    df['dir_rev'] = np.asarray(rev, dtype=float)
    df['dir_prof'] = np.asarray(prof, dtype=float)
    df['dir_films'] = np.asarray(films, dtype=float)
    df['dir_prof_films'] = np.asarray(prof_films, dtype=float)

    return df

In [43]:
df = get_director_metrics(df, 'director')  #func from cell above

In [44]:
def get_writer_metrics(df, writer_col):
    '''
    Iterates over column with writer
    and generates new columns with the total
    revenue, profit, number of movies, and number of
    profitable movies the writer has written
    up until the date of release for each row.
    --------
    INPUT
    df: dataframe with writer column plus 'release_date', 'revenue',
        'profit' and 'made_money' columns
    writer_col: column name (str) with writer
    --------
    OUTPUT
    df itself (modified in place and returned) with four new float columns:
    'writ_rev', 'writ_prof', 'writ_films', 'writ_prof_films', each computed
    over films by the same writer with a strictly earlier release_date.
    '''
    writers = df[writer_col]
    release_dates = df['release_date']

    rev, prof, films, prof_films = [], [], [], []
    # Rows are visited by position, so any index (not only 0..n-1) works.
    for name, release_date in zip(writers, release_dates):
        #1. getting a dataframe with writer's movies to date only
        earlier_films = df[(release_dates < release_date) & (writers == name)]

        #2. collecting the writer's lifetime rev, prof, etc. to date
        rev.append(earlier_films['revenue'].sum())
        prof.append(earlier_films['profit'].sum())
        films.append(earlier_films['made_money'].count())
        prof_films.append(earlier_films['made_money'].sum())

    # The columns are assigned once, after the loop, as floats.
    df['writ_rev'] = np.asarray(rev, dtype=float)
    df['writ_prof'] = np.asarray(prof, dtype=float)
    df['writ_films'] = np.asarray(films, dtype=float)
    df['writ_prof_films'] = np.asarray(prof_films, dtype=float)

    return df

In [45]:
df = get_writer_metrics(df, 'writer')  #func from cell above

In [46]:
df.head()

Unnamed: 0,budget,id,release_date,revenue,runtime,title,genre,profit,made_money,cast_0,cast_1,cast_2,cast_3,cast_4,director,writer,releases,cast_rev,cast_prof,cast_films,cast_prof_films,dir_rev,dir_prof,dir_films,dir_prof_films,writ_rev,writ_prof,writ_films,writ_prof_films
0,4000000.0,5,1995-12-09,4300000.0,98.0,Four Rooms,Crime,300000.0,1,Tim Roth,Antonio Banderas,Jennifer Beals,Madonna,Marisa Tomei,Allison Anders,Allison Anders,30.0,695985089.0,471985089.0,7.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11000000.0,11,1977-05-25,775398007.0,121.0,Star Wars,Adventure,764398007.0,1,Mark Hamill,Harrison Ford,Carrie Fisher,Peter Mayhew,Anthony Daniels,George Lucas,George Lucas,67.0,4420000.0,2820000.0,1.0,1.0,142437000.0,140883000.0,2.0,2.0,142437000.0,140883000.0,2.0,2.0
2,94000000.0,12,2003-05-30,940335536.0,100.0,Finding Nemo,Animation,846335536.0,1,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe,Brad Garrett,Andrew Stanton,Andrew Stanton,62.0,1700465487.0,1222665487.0,12.0,9.0,0.0,0.0,0.0,0.0,1423441984.0,1098441984.0,3.0,3.0
3,55000000.0,13,1994-07-06,677945399.0,142.0,Forrest Gump,Comedy,622945399.0,1,Tom Hanks,Robin Wright,Gary Sinise,Mykelti Williamson,Sally Field,Robert Zemeckis,Eric Roth,39.0,1239738313.0,1060738313.0,9.0,8.0,1523036191.0,1289036191.0,6.0,6.0,0.0,0.0,0.0,0.0
4,15000000.0,14,1999-09-15,356296601.0,122.0,American Beauty,Drama,341296601.0,1,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley,Mena Suvari,Sam Mendes,Alan Ball,54.0,1799337971.0,1159337971.0,13.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Getting average of cast & crew revenue, profit, etc. for each film's competitors (i.e. all other films released within +/- 2 weeks of each film):

In [49]:
def get_competitor_metrics(df, id_col, window_days=14):
    '''
    Iterates over each film (each row)
    and generates new columns with the average
    revenue, profit, number of movies, and number of
    profitable movies that the casts/crews of competing movies
    (i.e. those released within +/- window_days) have been in.
    --------
    INPUT
    df: dataframe with 'release_date' and the cast/dir/writ metric columns
        ('cast_rev', 'cast_prof', 'cast_films', 'cast_prof_films', and the
        same four for the 'dir' and 'writ' prefixes)
    id_col: column name (str) with film id
    window_days (int): half-width of the competition window in days;
        defaults to 14 (two weeks either side of the release date)
    --------
    OUTPUT
    df itself (modified in place and returned) with one 'compet_<metric>'
    column per metric column, holding the mean of that metric over the
    competing films. A film with no competitors gets NaN.
    '''
    window = pd.to_timedelta(window_days, unit='d')
    metric_cols = [group + '_' + stat
                   for group in ('cast', 'dir', 'writ')
                   for stat in ('rev', 'prof', 'films', 'prof_films')]

    film_ids = df[id_col]
    release_dates = df['release_date']
    competitor_means = {col: [] for col in metric_cols}

    # Rows are visited by position, so any index (not only 0..n-1) works.
    for film_id, release_date in zip(film_ids, release_dates):
        #1. getting a dataframe with competing films (window is inclusive at both ends,
        #   and the film itself is excluded via its id)
        competitors = df[(release_dates >= release_date - window) &
                         (release_dates <= release_date + window) &
                         (film_ids != film_id)]

        #2. collecting competing cast/crews' avg lifetime rev, prof, etc.
        for col in metric_cols:
            competitor_means[col].append(competitors[col].mean())

    # The columns are assigned once, after the loop, as floats.
    for col in metric_cols:
        df['compet_' + col] = np.asarray(competitor_means[col], dtype=float)

    return df

In [50]:
df = get_competitor_metrics(df, 'id')

In [53]:
df.head()

Unnamed: 0,budget,id,release_date,revenue,runtime,title,genre,profit,made_money,cast_0,cast_1,cast_2,cast_3,cast_4,director,writer,releases,cast_rev,cast_prof,cast_films,cast_prof_films,dir_rev,dir_prof,dir_films,dir_prof_films,writ_rev,writ_prof,writ_films,writ_prof_films,compet_cast_rev,compet_cast_prof,compet_cast_films,compet_cast_prof_films,compet_dir_rev,compet_dir_prof,compet_dir_films,compet_dir_prof_films,compet_writ_rev,compet_writ_prof,compet_writ_films,compet_writ_prof_films
0,4000000.0,5,1995-12-09,4300000.0,98.0,Four Rooms,Crime,300000.0,1,Tim Roth,Antonio Banderas,Jennifer Beals,Madonna,Marisa Tomei,Allison Anders,Allison Anders,30.0,695985089.0,471985089.0,7.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1420161956.333333,1124717062.5,11.833333,10.0,135950088.666667,94200088.666667,1.0,0.666667,12920277.666667,5670277.666667,0.333333,0.166667
1,11000000.0,11,1977-05-25,775398007.0,121.0,Star Wars,Adventure,764398007.0,1,Mark Hamill,Harrison Ford,Carrie Fisher,Peter Mayhew,Anthony Daniels,George Lucas,George Lucas,67.0,4420000.0,2820000.0,1.0,1.0,142437000.0,140883000.0,2.0,2.0,142437000.0,140883000.0,2.0,2.0,,,,,,,,,,,,
2,94000000.0,12,2003-05-30,940335536.0,100.0,Finding Nemo,Animation,846335536.0,1,Albert Brooks,Ellen DeGeneres,Alexander Gould,Willem Dafoe,Brad Garrett,Andrew Stanton,Andrew Stanton,62.0,1700465487.0,1222665487.0,12.0,9.0,0.0,0.0,0.0,0.0,1423441984.0,1098441984.0,3.0,3.0,1027995470.0,664456581.111111,8.666667,6.888889,100159453.444444,68126120.111111,1.111111,1.0,4447986.555556,3025764.333333,0.111111,0.111111
3,55000000.0,13,1994-07-06,677945399.0,142.0,Forrest Gump,Comedy,622945399.0,1,Tom Hanks,Robin Wright,Gary Sinise,Mykelti Williamson,Sally Field,Robert Zemeckis,Eric Roth,39.0,1239738313.0,1060738313.0,9.0,8.0,1523036191.0,1289036191.0,6.0,6.0,0.0,0.0,0.0,0.0,767653967.0,595353967.0,9.0,8.0,21456340.0,-24543660.0,2.0,0.0,185538662.0,100538662.0,2.0,2.0
4,15000000.0,14,1999-09-15,356296601.0,122.0,American Beauty,Drama,341296601.0,1,Kevin Spacey,Annette Bening,Thora Birch,Wes Bentley,Mena Suvari,Sam Mendes,Alan Ball,54.0,1799337971.0,1159337971.0,13.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,879012772.4,672532772.4,6.6,6.0,24103649.6,20203649.6,0.6,0.4,24103649.6,20203649.6,0.6,0.4


### Next step: run this on full data (see next notebook - "4_full_data_cleaning.ipynb")