In [21]:
# import dependencies
import requests
import pandas as pd
from config import api_key

In [22]:
# Run configuration.
#   name         -- label for this run; it is passed to get_movies, which uses
#                   it in the output file name.
#   keywords     -- path of the CSV listing the keywords a movie must mention.
#   antikeywords -- path of the CSV listing the words that exclude a movie.
name, keywords, antikeywords = (
    'BC',
    'resources/BC_keywords.csv',
    'resources/BC_antikeywords.csv',
)

In [23]:
def _fetch_year_movies(year, max_pages, timeout=30):
    """Download the discover-API result pages for one release year.

    Pages are requested in order, starting at page 1, until a page comes back
    with an empty ``results`` list or ``max_pages`` pages have been requested.

    Args:
        year: Release year; the query covers ``{year}-01-01`` to ``{year}-12-31``.
        max_pages: Upper bound on the number of pages requested.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``(records, pages_requested)`` tuple: the raw result dicts in the
        order received, and the number of requests made (a trailing empty page
        is counted).

    Raises:
        requests.HTTPError: if the API answers with an error status.
        KeyError: if a response payload has no ``results`` entry.
    """
    url = 'https://api.themoviedb.org/3/discover/movie'
    records = []
    pages_requested = 0

    for page in range(1, max_pages + 1):
        # Passing the query as ``params`` lets requests do the URL encoding
        # and keeps the API key out of a hand-built URL string.
        query = {
            'api_key': api_key,
            'primary_release_date.gte': f'{year}-01-01',
            'primary_release_date.lte': f'{year}-12-31',
            'without_genres': '99,878,16,27',
            'vote_count.gte': 0,
            'vote_average.gte': 3.9,
            'with_runtime.gte': 55,
            'sort_by': 'release_date.asc',
            'page': page,
        }
        response = requests.get(url, params=query, timeout=timeout)
        response.raise_for_status()
        results = response.json()['results']
        pages_requested += 1

        # An empty page means the year is exhausted.
        if not results:
            break
        records.extend(results)

    return records, pages_requested


# Function for extracting movies using a keyword and antikeyword list
def get_movies(name, keywords, antikeywords, start_year=1945, end_year=2021, max_pages=499):
    """Download movies year by year and keep those matching the keyword lists.

    A movie is kept when its overview contains at least one keyword and none
    of the anti-keywords (plain, case-sensitive substring tests). The result
    is written to ``resources/{name}_movie_list.csv`` and returned.

    Args:
        name: Label for the run; used in the output file name and as a
            fallback prefix for the anti-keyword column.
        keywords: Path of a CSV with a ``Topic/Keyword`` (or ``keyword``)
            column and a ``Start Date`` column.
        antikeywords: Path of a CSV with a ``BC_anti_keywords`` column, or a
            ``{name}_anti_keywords`` column when that one is absent.
        start_year: First release year to download (inclusive).
        end_year: Last release year to download (inclusive).
        max_pages: Maximum number of result pages requested per year.

    Returns:
        DataFrame of the kept movies with two extra columns: ``keyword`` (list
        of the keywords found in the overview, in keyword-file order) and
        ``est_date`` (the matching ``Start Date`` values).

    Raises:
        KeyError: if a required CSV column is missing.
        requests.HTTPError: if the API answers with an error status.
    """
    from pathlib import Path

    # Load and validate both word lists before the long download so a bad CSV
    # fails immediately rather than after thousands of requests.
    keyword_df = (
        pd.read_csv(keywords)
        .rename(columns={'Topic/Keyword': 'keyword'})
        .dropna(subset=['keyword'])  # blank rows would break the substring test
    )
    keyword_dates = [
        (str(keyword), start_date)
        for keyword, start_date in zip(keyword_df['keyword'], keyword_df['Start Date'])
    ]

    antikeyword_df = pd.read_csv(antikeywords)
    anti_column = 'BC_anti_keywords'
    if anti_column not in antikeyword_df.columns:
        anti_column = f'{name}_anti_keywords'
    antikey_list = [str(antikey) for antikey in antikeyword_df[anti_column].dropna()]

    # Collect raw records in a plain list and build the frame once;
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    records = []
    for year in range(start_year, end_year + 1):
        year_records, pages_requested = _fetch_year_movies(year, max_pages)
        records.extend(year_records)

        if year % 5 == 0 or pages_requested > 70:
            print(f'Currently passing year:{year}, with: {pages_requested} pages and {len(records)} results')

    print(f'Raw Results: {len(records)}')

    if records:
        movie_df = pd.DataFrame(records)
    else:
        # Keep the columns the rest of the pipeline relies on.
        movie_df = pd.DataFrame(columns=['title', 'overview'])

    # NOTE(review): de-duplicating on title alone also discards different
    # films that share a title (e.g. remakes); kept as in the original.
    movie_df = (
        movie_df
        .drop(
            columns=['adult', 'backdrop_path', 'original_language',
                     'original_title', 'poster_path', 'video'],
            errors='ignore',
        )
        .drop_duplicates(subset='title')
        .reset_index(drop=True)
    )

    # Single pass over the overviews: record keyword hits and skip any movie
    # that also mentions an anti-keyword.
    keep_positions = []
    keyword_column = []
    dates_column = []
    for position, overview in enumerate(movie_df['overview']):
        if not isinstance(overview, str):
            # Missing overviews (None/NaN) cannot match anything.
            continue
        hits = [(keyword, start_date) for keyword, start_date in keyword_dates
                if keyword in overview]
        if not hits:
            continue
        if any(antikey in overview for antikey in antikey_list):
            continue
        keep_positions.append(position)
        keyword_column.append([keyword for keyword, _ in hits])
        dates_column.append([start_date for _, start_date in hits])

    # found keywords and estimated dates (based on event, location, or person) are added to the new df
    revised_movie_df = movie_df.iloc[keep_positions].copy()
    revised_movie_df['keyword'] = keyword_column
    revised_movie_df['est_date'] = dates_column
    revised_movie_df = revised_movie_df.reset_index(drop=True)

    # Make sure the target directory exists so the download is not lost at
    # the very last step.
    output_path = Path(f'resources/{name}_movie_list.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    revised_movie_df.to_csv(output_path)

    return revised_movie_df

In [24]:
# Run the download-and-filter pipeline with the configured run label and
# keyword / anti-keyword CSV paths.
revised_movie_df = get_movies(
    name=name,
    keywords=keywords,
    antikeywords=antikeywords,
)

Currently passing year:1945, with: 22 pages and 410 results
Currently passing year:1950, with: 33 pages and 3073 results
Currently passing year:1955, with: 38 pages and 6355 results
Currently passing year:1960, with: 41 pages and 10316 results
Currently passing year:1965, with: 43 pages and 14423 results
Currently passing year:1970, with: 55 pages and 19290 results
Currently passing year:1975, with: 49 pages and 24495 results
Currently passing year:1980, with: 54 pages and 29693 results
Currently passing year:1985, with: 60 pages and 35212 results
Currently passing year:1990, with: 64 pages and 41259 results
Currently passing year:1995, with: 69 pages and 47700 results
Currently passing year:1996, with: 71 pages and 49096 results
Currently passing year:1997, with: 74 pages and 50543 results
Currently passing year:1998, with: 81 pages and 52133 results
Currently passing year:1999, with: 83 pages and 53754 results
Currently passing year:2000, with: 89 pages and 55500 results
Currently pa