In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib
from matplotlib import pyplot as plt
import nltk

# Run from the project folder so the relative file names used below resolve there
project_dir = "/Users/Sam Edds/Desktop/Stats_551"
os.chdir(project_dir)

Connect to Bechdel Test API and pull down all movies

In [None]:
# Pull down all movies from the Bechdel Test API
import json
import requests
url = "http://bechdeltest.com/api/v1/getAllMovies"
# Bound the wait so a stalled connection cannot hang the notebook
r = requests.get(url, timeout=60)
# Stop here on an HTTP error status instead of trying to parse an error page as JSON
r.raise_for_status()
all_movies = r.json()

# Put into pandas df and output text file
df_movies = pd.DataFrame(all_movies)
df_movies.to_csv("all_bechdel.txt")

Now, with the Bechdel and IMDb information combined, look at the variables for aberrations

In [None]:
# Load the merged file from Katherine (all_bechdel is merged with IMDb data)
merged_path = "bechdel_full.csv"
bechdel_full = pd.read_csv(merged_path)

# Summary of the full data set: column types and non-null counts (missingness)
bechdel_full.info()

In [None]:
# Remove anything that is not movies (7,521 total obs)
# .copy() makes the filtered result an independent frame, so the column added
# in the next cell is not written into a slice of the original
# (which triggers pandas' SettingWithCopyWarning)
bechdel_full = bechdel_full[bechdel_full['titleType'] == 'movie'].copy()


In [None]:
# Flag whether `title` equals `primaryTitle`, stored as the strings 'True' / 'False'
titles_agree = bechdel_full['title'].eq(bechdel_full['primaryTitle'])
bechdel_full['title_match'] = titles_agree.astype('str')
# Show the rows where the two titles differ
bechdel_full[bechdel_full['title_match'].str.contains("False")]

In [None]:
# summarize 'isAdult'
# A cell only displays its last expression, and this cell ends with an
# assignment, so the two inspection results are printed explicitly
print(bechdel_full.groupby('isAdult').size())
print(bechdel_full[bechdel_full['isAdult'] == 1])
# Remove from dataset- just 1 adult content movie
bechdel_full = bechdel_full[bechdel_full['isAdult'] != 1]

In [None]:
# Cast both year columns to strings so they can be compared as text
for year_col in ('startYear', 'year'):
    bechdel_full[year_col] = bechdel_full[year_col].astype('str')

# is startYear different from year? Yes...keep the IMDb variable
years_differ = bechdel_full['year'].ne(bechdel_full['startYear'])
bechdel_full[years_differ]

In [None]:
# Keep only rows that have a genre (missing ones carry the '\\N' placeholder)
has_genre = bechdel_full['genres'].ne('\\N')
bechdel_full = bechdel_full[has_genre]

# One row per imdbid, keeping the first occurrence
bechdel_full = bechdel_full.drop_duplicates(subset=['imdbid'], keep='first')

# Force 'Terms of Endearment' to a 3 (Bechdel test website note confirmed)
is_terms = bechdel_full['primaryTitle'] == 'Terms of Endearment'
bechdel_full['rating'] = np.where(is_terms, 3, bechdel_full['rating'])

Make genre variable into a wide dataframe

In [None]:
# Parse genre into a wide dataframe of variables

# Renumber the rows from 0; drop=False keeps the previous labels as a column
bechdel_full = bechdel_full.reset_index(drop=False)

# Tokenize genres and count the tokens of each movie
def tokenize(df):
    """Build a wide table of per-movie token counts from ``df['genres']``.

    Each genre string is split with ``nltk.word_tokenize``. Every distinct
    token becomes a column holding how often it occurs in that movie's
    string, with 0 where the movie does not have it.

    NOTE(review): every token gets a column, so any separator characters the
    tokenizer emits as tokens presumably show up as columns too -- confirm.

    Args:
        df: DataFrame with a ``genres`` column of strings.

    Returns:
        DataFrame with one row per input row, in input order, and one
        column per token.
    """
    token_lists = [nltk.word_tokenize(text) for text in df['genres'].tolist()]

    per_movie_counts = []
    for tokens in token_lists:
        # Fresh tally for this movie
        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        per_movie_counts.append(counts)

    # Tokens a movie lacks come out as NaN in the wide frame; store 0 instead
    return pd.DataFrame(per_movie_counts).fillna(0)

# Build the wide genre table from the cleaned movie frame
genre_df = tokenize(df=bechdel_full)


In [None]:
# Attach the genre columns to the movie rows, matching on the row index
bechdel_wide = bechdel_full.join(genre_df, how='left')
# Check number of obs
print(bechdel_wide.shape[0])

Add Decade and Pass variables

In [None]:
# Add decade variables: first three characters of startYear followed by '0'
decade = bechdel_wide['startYear'].str[:3] + '0'
# Pool the '1900', '1910' and '1920' decades under the single label '1900-1920'
early_decades = ['1900', '1910', '1920']
bechdel_wide['decade'] = np.where(decade.isin(early_decades), '1900-1920', decade)

In [None]:
# Binary pass / not-pass indicators: a rating of 3 counts as a pass
rating_is_three = bechdel_wide['rating'] == 3
bechdel_wide['pass'] = np.where(rating_is_three, 1, 0)
bechdel_wide['notpass'] = np.where(rating_is_three, 0, 1)

In [None]:
# Rename unicode column names
# NOTE(review): the column is picked by position (15), so this depends on the
# current column order -- confirm it still targets the intended column
positional_name = bechdel_wide.columns[15]
bechdel_wide = bechdel_wide.rename(columns={positional_name: 'n_genre'})

In [None]:
# Remove missing start date
bechdel_wide = bechdel_wide[bechdel_wide['startYear'] != '\\N']
# The frame already has a 'year' column. Drop it before renaming so the result
# does not carry two columns both labelled 'year'; startYear's values are the
# ones kept under that name.
bechdel_wide = (bechdel_wide
                .drop(columns=['year'])
                .rename(columns={'startYear': 'year'}))

In [None]:
# Clean up runtime Minutes: discard the '\\N' rows, then cast to int
has_runtime = bechdel_wide['runtimeMinutes'].ne('\\N')
bechdel_wide = bechdel_wide[has_runtime]
bechdel_wide['runtimeMinutes'] = bechdel_wide['runtimeMinutes'].astype(int)

In [None]:
# Drop useless variables
# 'year' is deliberately left in: drop() removes every column carrying a label,
# and the genre-proportion cell further down still drops 'year' by name, which
# raises KeyError once the column is gone.
unused_cols = ['tconst', 'V1', 'id', 'title',
               'isAdult', 'genres', 'titleType', 'n_genre',
               'title_match', 'News']
bechdel_wide = bechdel_wide.drop(unused_cols, axis = 1)

In [None]:
# Write the cleaned data set to disk
cleaned_path = "bechdel_cleaned.csv"
bechdel_wide.to_csv(cleaned_path)

In [None]:
# Want genre proportions information

# Columns removed before summing (shared by both the overall and passed-only totals)
# NOTE(review): drop() raises KeyError if any of these is absent from bechdel_wide
non_indicator_cols = ['index','imdbid', 'year', 'runtimeMinutes', 'decade',
                      'rating', 'primaryTitle']

# All movies: transpose so each remaining column becomes a row, then sum across movies
bechdel_trans = bechdel_wide.drop(columns = non_indicator_cols).T
total = bechdel_trans.sum(axis = 1)

# Same sums, restricted to the movies with pass == 1
passed = bechdel_wide.drop(columns = non_indicator_cols)
passed = passed[passed['pass'] == 1].T
total_passed = passed.sum(axis = 1)

# Combine into dataframe and output for Olivia
total = pd.DataFrame(total)
total_passed = pd.DataFrame(total_passed)
passed_data = pd.concat([total, total_passed], axis = 1)
passed_data.to_csv("genre_passed.csv")
