In [1]:
import os
import traceback
import warnings

import pandas as pd
import requests
warnings.filterwarnings("ignore")
%matplotlib inline

##### Load the previously webscraped clean data

In [2]:
# Read the cleaned web-scraped dataset from the project datasets folder.
scraped_csv_path = "./project_datasets/clean-webscraped.csv"
web_scraped_data = pd.read_csv(scraped_csv_path)

##### Step 1: Reading API data using "requests" library available in Python

Before invoking the API, you need to sign up for a [TMDB](https://www.themoviedb.org/) account and get an API key to work with the TMDB API.

In [3]:
# Read the TMDB API key from the TMDB_API_KEY environment variable so the
# secret never has to be typed into (and saved with) the notebook.
# Falls back to an empty string when the variable is not set.
api_key = os.environ.get('TMDB_API_KEY', '')

In [4]:
# Download the TMDB movie genre list and build an {id: name} lookup.
genre_url=f'https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}'
genre_dict = dict()
try:
    # The timeout keeps the cell from hanging indefinitely if the server never answers.
    response = requests.get(genre_url, timeout=30)
    if response.status_code == 200:
        genre_dict = { genre_json['id']:genre_json['name'] for genre_json in response.json()['genres'] }
    else:
        # Report a failed request instead of silently continuing with an empty mapping.
        print('Error downloading genres: HTTP status', response.status_code)
except Exception as e:
    # Best effort: report the problem and carry on with whatever mapping we have.
    print('Error downloading genres: ',e)
    traceback.print_exc()

#Lets add a unique key for movies without any genre
genre_dict.setdefault(0, 'unknown')


In [5]:
# Genre names: the values of the genre id -> genre name mapping built above
genre_dict.values()

dict_values(['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western', 'unknown'])

In [6]:
import json
#remove later
# Cache the genre mapping on disk. The `with` block guarantees the file is
# flushed and closed once the dump finishes.
with open("./project_datasets/genre_dict.json", 'w') as genre_file:
    json.dump(genre_dict, genre_file)


In [120]:
# Read data from file:
#remove later
# The `with` block closes the file handle after loading.
# Note: after this reload the mapping keys are strings (see the output below).
with open("./project_datasets/genre_dict.json") as genre_file:
    genre_dict = json.load(genre_file)
genre_dict

{'28': 'Action',
 '12': 'Adventure',
 '16': 'Animation',
 '35': 'Comedy',
 '80': 'Crime',
 '99': 'Documentary',
 '18': 'Drama',
 '10751': 'Family',
 '14': 'Fantasy',
 '36': 'History',
 '27': 'Horror',
 '10402': 'Music',
 '9648': 'Mystery',
 '10749': 'Romance',
 '878': 'Science Fiction',
 '10770': 'TV Movie',
 '53': 'Thriller',
 '10752': 'War',
 '37': 'Western',
 '0': 'unknown'}

In [7]:
# Distinct movie titles from the scraped data; each one is searched on TMDB below.
scraped_movie_list = pd.unique(web_scraped_data['movie_title'])
# Number of distinct titles
len(scraped_movie_list)

353

In [177]:
#remove later
# NOTE(review): temporary shortcut that loads a previously pickled copy of the
# raw API results. api_df is rebuilt from the TMDB API by the fetch cell below,
# so this cell is not needed for a clean top-to-bottom run.
# pd.read_pickle should only be used on files you trust.
api_df = pd.read_pickle("raw_api_df_pickle")

In [8]:
%%time
# We will first create an empty dataframe to store all the movie detail
api_df = pd.DataFrame()
# Our for loop will iterate through each page, get json data convert it into dataframe and append it to original dataframe
for title in scraped_movie_list:
    url = f"https://api.themoviedb.org/3/search/movie?api_key={api_key}&query={title}"
    response = requests.get(url)
    if 'results' in response.json():
        temporary_df = pd.DataFrame(response.json()['results'])
        api_df = pd.concat([api_df,temporary_df],ignore_index=True)

CPU times: total: 438 ms
Wall time: 40.2 s


In [9]:
# Preview the first five rows of the raw API results
api_df.head(n=5)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/npCPnwDyWfQltGfIZKN6WqeUXGI.jpg,"[14, 12, 28]",57158,en,The Hobbit: The Desolation of Smaug,"The Dwarves, Bilbo and Gandalf have successful...",68.486,/xQYiXsheRCDBA39DOrmaw1aSpbk.jpg,2013-12-11,The Hobbit: The Desolation of Smaug,False,7.575,13167.0
1,False,/u2bZhH3nTf0So0UIC1QxAqBvC07.jpg,"[16, 10751, 12, 14]",109445,en,Frozen,Young princess Anna of Arendelle dreams about ...,136.92,/mmWheq3cFI4tYrZDiATOkCNTqgK.jpg,2013-11-20,Frozen,False,7.2,16539.0
2,False,/vD1yKObsRS2cvpmtuaCaMhr4zxe.jpg,[53],44363,en,Frozen,When three skiers find themselves stranded on ...,21.316,/2J3URUnDrIpNvh0uVqINQvr4HhW.jpg,2010-02-05,Frozen,False,6.0,1820.0
3,False,/sGjhRHNiQSkgVec18D3oX45hPmz.jpg,[53],26041,en,Frozen,It's two years since the mysterious disappeara...,2.648,/a6RlPQUerliQLkAieku5B8Loamk.jpg,2005-03-12,Frozen,False,5.7,18.0
4,False,,"[18, 27]",170986,hi,Frozen,This is a touching and somber journey of Lasya...,1.448,/2GL9yZtrgbYKCeKBc3TF9gGfZpX.jpg,2007-07-21,Frozen,False,7.2,4.0


In [10]:
#remove later
#api_df.to_pickle("raw_api_df_pickle")

In [11]:
# (rows, columns) of the raw API results
api_df.shape

(2994, 14)

##### Check if the genre_ids have any empty data like [] as genre_ids column is a list

In [12]:
# Frequency of each distinct genre-id list; an empty list [] means no genre was returned
api_df['genre_ids'].value_counts()

genre_ids
[]                          325
[99]                        262
[18]                        188
[35]                        132
[27]                        123
                           ... 
[99, 36, 80]                  1
[80, 53, 28]                  1
[10770, 28, 12, 14, 878]      1
[28, 16, 14, 878]             1
[16, 14, 35]                  1
Name: count, Length: 766, dtype: int64

The response object now holds all the details we need, in JSON form. We can call `response.json()` to view them.

#### If you look at the JSON closely, it has four keys

- page: the number of the current page
- results: a list of dictionaries, one per movie (up to 20 per page)
- total_pages: the total number of result pages available for the request
- total_results: the total number of movies matching the request

#### We can convert this JSON into pandas dataframe
- Note: we are converting only 'results' dictionary into dataframe as we only need movie details

##### Replace empty genre lists (`[]`) with `[0]`; genre id 0 maps to 'unknown', which we added to `genre_dict` earlier

In [13]:
# Keep non-empty genre lists as they are; substitute [0] (the 'unknown' genre id) for empty ones
api_df['genre_ids'] = api_df['genre_ids'].apply(lambda ids: ids if ids else [0])

In [14]:
# Re-check the frequencies: the former [] bucket should now appear as [0]
api_df['genre_ids'].value_counts()

genre_ids
[0]                         325
[99]                        262
[18]                        188
[35]                        132
[27]                        123
                           ... 
[99, 36, 80]                  1
[80, 53, 28]                  1
[10770, 28, 12, 14, 878]      1
[28, 16, 14, 878]             1
[16, 14, 35]                  1
Name: count, Length: 766, dtype: int64

##### First drop rows whose `genre_ids` is missing, then map the genre ids to genre names

In [16]:
# First drop rows whose genre_ids is missing; .copy() gives an independent
# frame so the column assignment below does not write into a slice.
api_df = api_df.dropna(subset=['genre_ids']).copy()
# Normalize the mapping keys to strings so the lookup works whether genre_dict
# was built from the API response (integer ids) or reloaded from the JSON file
# (string ids).
genre_names_by_id = {str(genre_id): name for genre_id, name in genre_dict.items()}
# Then map every genre id in each row's list to its genre name.
api_df['genre_ids'] = api_df['genre_ids'].apply(
    lambda ids: [genre_names_by_id[str(genre_id)] for genre_id in ids]
)

In [17]:
# Flatten each row's list of genre names into one comma-separated string
api_df['genre_ids'] = api_df.genre_ids.str.join(sep=',')

In [18]:
# Frequency of each comma-separated genre-name combination
api_df.genre_ids.value_counts()

genre_ids
unknown                                              325
Documentary                                          262
Drama                                                188
Comedy                                               132
Horror                                               123
                                                    ... 
Documentary,History,Crime                              1
Crime,Thriller,Action                                  1
TV Movie,Action,Adventure,Fantasy,Science Fiction      1
Action,Animation,Fantasy,Science Fiction               1
Animation,Fantasy,Comedy                               1
Name: count, Length: 766, dtype: int64

In [31]:
# Remaining clean-up steps:
# - rename columns such as genre_ids to genre names
# - drop unnecessary columns
# - extract the year from release_date into a separate column
# - rearrange the columns
# - save the cleaned data


In [19]:
# Parse release_date once into datetimes (rows without a date show up as NaT below).
api_df['release_date'] = pd.to_datetime(api_df['release_date'], format="%Y-%m-%d")
# Derive the year from the already-parsed column instead of parsing it a second
# time; rows with NaT get NaN, which is why the column is float at this point.
api_df["release_year"] = api_df['release_date'].dt.year
api_df.head()

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,release_year
0,False,/npCPnwDyWfQltGfIZKN6WqeUXGI.jpg,"Fantasy,Adventure,Action",57158,en,The Hobbit: The Desolation of Smaug,"The Dwarves, Bilbo and Gandalf have successful...",68.486,/xQYiXsheRCDBA39DOrmaw1aSpbk.jpg,2013-12-11,The Hobbit: The Desolation of Smaug,False,7.575,13167.0,2013.0
1,False,/u2bZhH3nTf0So0UIC1QxAqBvC07.jpg,"Animation,Family,Adventure,Fantasy",109445,en,Frozen,Young princess Anna of Arendelle dreams about ...,136.92,/mmWheq3cFI4tYrZDiATOkCNTqgK.jpg,2013-11-20,Frozen,False,7.2,16539.0,2013.0
2,False,/vD1yKObsRS2cvpmtuaCaMhr4zxe.jpg,Thriller,44363,en,Frozen,When three skiers find themselves stranded on ...,21.316,/2J3URUnDrIpNvh0uVqINQvr4HhW.jpg,2010-02-05,Frozen,False,6.0,1820.0,2010.0
3,False,/sGjhRHNiQSkgVec18D3oX45hPmz.jpg,Thriller,26041,en,Frozen,It's two years since the mysterious disappeara...,2.648,/a6RlPQUerliQLkAieku5B8Loamk.jpg,2005-03-12,Frozen,False,5.7,18.0,2005.0
4,False,,"Drama,Horror",170986,hi,Frozen,This is a touching and somber journey of Lasya...,1.448,/2GL9yZtrgbYKCeKBc3TF9gGfZpX.jpg,2007-07-21,Frozen,False,7.2,4.0,2007.0


In [20]:
# List the rows whose release_year is missing (NaN)
api_df[api_df['release_year'].isna()]

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,release_year
5,False,,unknown,950554,en,Frozen,A film by Adonia Bouchehri.,0.437,,NaT,Frozen,False,0.0,0.0,
32,False,,unknown,566990,en,Gravity,Three boys power play with a gun. Gravity is a...,0.001,,NaT,Gravity,False,0.0,0.0,
74,False,/xcIqtToUFrie1o4g4ZtYVKj5R1f.jpg,"Science Fiction,Action",374771,en,Riddick: Furya,"Riddick finally returns to his home world, a p...",9.035,,NaT,Riddick: Furya,False,0.0,0.0,
88,False,,"Comedy,Romance",887567,en,The Butler,A story of a butler who is in love with their ...,0.001,,NaT,The Butler,False,0.0,0.0,
113,False,,unknown,1159811,ko,극락전,"Srey Na, a female immigrant from Cambodia, who...",0.021,,NaT,The Road to Elysium,False,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,False,,"Action,Adventure,Science Fiction",939345,en,Transformers: Rise of the Beasts 2,The first of two planned sequels to the 2023 f...,18.436,/f4PFiwOHVcNUXRcOmxX2hUYdAx7.jpg,NaT,Transformers: Rise of the Beasts 2,False,0.0,0.0,
2912,False,,"Action,Adventure,Science Fiction,Fantasy",939347,en,Transformers: Rise of the Beasts 3,The second of two planned sequels to the 2023 ...,13.508,/zjDGpjRj9M9pLqVVZPpaFhG6BLx.jpg,NaT,Transformers: Rise of the Beasts 3,False,0.0,0.0,
2934,False,,"Action,Crime,Thriller",755679,en,Fast X: Part 2,"The eleventh installment in The Fast Saga, whi...",73.241,/lD8V3DBban95Mxz4sjuA81Tw771.jpg,NaT,Fast X: Part 2,False,0.0,0.0,
2961,False,,unknown,1195165,en,Dungeons & Derrick,"Derrick and Tori, best friends and avid player...",0.001,,NaT,Dungeons & Derrick,False,0.0,0.0,


In [21]:
# Replace missing years with the placeholder 1900 so the integer conversion
# in the next cell does not fail on NaN
api_df['release_year'] = api_df['release_year'].fillna(value=1900)

In [22]:
# Convert the float years to 64-bit integers now that no NaN remains in the column
api_df["release_year"] = api_df["release_year"].astype("int64")

In [23]:
# True when at least one column still contains a null value
# (the remaining nulls are dropped in the next cell)
api_df.isna().any().any()

True

In [24]:
# Drop every row that has a null in any column.
# NOTE(review): this also removes rows whose only nulls are in columns that are
# discarded later (e.g. backdrop_path, poster_path) - confirm that is intended.
api_df = api_df.dropna(axis=0, how="any")

In [25]:
# (rows, columns) after dropping rows that contain nulls
api_df.shape

(1896, 15)

In [26]:
# Preview the first five rows after the null rows were removed
api_df.head(n=5)

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,release_year
0,False,/npCPnwDyWfQltGfIZKN6WqeUXGI.jpg,"Fantasy,Adventure,Action",57158,en,The Hobbit: The Desolation of Smaug,"The Dwarves, Bilbo and Gandalf have successful...",68.486,/xQYiXsheRCDBA39DOrmaw1aSpbk.jpg,2013-12-11,The Hobbit: The Desolation of Smaug,False,7.575,13167.0,2013
1,False,/u2bZhH3nTf0So0UIC1QxAqBvC07.jpg,"Animation,Family,Adventure,Fantasy",109445,en,Frozen,Young princess Anna of Arendelle dreams about ...,136.92,/mmWheq3cFI4tYrZDiATOkCNTqgK.jpg,2013-11-20,Frozen,False,7.2,16539.0,2013
2,False,/vD1yKObsRS2cvpmtuaCaMhr4zxe.jpg,Thriller,44363,en,Frozen,When three skiers find themselves stranded on ...,21.316,/2J3URUnDrIpNvh0uVqINQvr4HhW.jpg,2010-02-05,Frozen,False,6.0,1820.0,2010
3,False,/sGjhRHNiQSkgVec18D3oX45hPmz.jpg,Thriller,26041,en,Frozen,It's two years since the mysterious disappeara...,2.648,/a6RlPQUerliQLkAieku5B8Loamk.jpg,2005-03-12,Frozen,False,5.7,18.0,2005
6,False,/9PxXSAnbVfvFacsGTJu1aXEWVg7.jpg,"Animation,Adventure,Comedy,Family",573171,es,Huevitos Congelados,"In the final Huevos adventure, Toto and his fa...",33.602,/8xCO3IarklLD4tK1rPn0e4gSMoV.jpg,2022-12-14,Little Eggs: A Frozen Rescue,False,7.7,345.0,2022


In [27]:
# Column names currently in the frame, as a reference for the drop/rename/reorder steps below
api_df.columns

Index(['adult', 'backdrop_path', 'genre_ids', 'id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'title', 'video', 'vote_average', 'vote_count',
       'release_year'],
      dtype='object')

In [None]:
#### Step : Convert key column title to lowercase

In [28]:
# Lower-case the title column
api_df['title'] = api_df['title'].str.lower()

#### Step : Drop unused columns

In [29]:
# Columns that are not kept in the cleaned dataset
unused_columns = ['id', 'adult', 'backdrop_path', 'poster_path', 'video']
api_df = api_df.drop(columns=unused_columns)

In [30]:
# genre_ids now holds genre names rather than ids, so give the column a matching name
api_df = api_df.rename(columns={'genre_ids': 'genres'})

In [31]:
# Desired leading column order for the cleaned dataset.
new_col_order = ['title', 'release_year', 'genres', 'popularity', 'vote_average', 'vote_count',
                 'original_title', 'overview', 'release_date', 'original_language']

# Move the listed columns to the front in a single selection; any column not
# listed keeps its relative order after them.
remaining_cols = [col for col in api_df.columns if col not in new_col_order]
api_df = api_df[new_col_order + remaining_cols]

# Remove exact duplicate rows.
api_df = api_df.drop_duplicates()

In [32]:
# Preview the first five rows of the final, reordered dataset
api_df.head(n=5)

Unnamed: 0,title,release_year,genres,popularity,vote_average,vote_count,original_title,overview,release_date,original_language
0,the hobbit: the desolation of smaug,2013,"Fantasy,Adventure,Action",68.486,7.575,13167.0,The Hobbit: The Desolation of Smaug,"The Dwarves, Bilbo and Gandalf have successful...",2013-12-11,en
1,frozen,2013,"Animation,Family,Adventure,Fantasy",136.92,7.2,16539.0,Frozen,Young princess Anna of Arendelle dreams about ...,2013-11-20,en
2,frozen,2010,Thriller,21.316,6.0,1820.0,Frozen,When three skiers find themselves stranded on ...,2010-02-05,en
3,frozen,2005,Thriller,2.648,5.7,18.0,Frozen,It's two years since the mysterious disappeara...,2005-03-12,en
6,little eggs: a frozen rescue,2022,"Animation,Adventure,Comedy,Family",33.602,7.7,345.0,Huevitos Congelados,"In the final Huevos adventure, Toto and his fa...",2022-12-14,es


In [33]:
# Write the cleaned API data to CSV without the index column
api_df.to_csv(path_or_buf=r'./project_datasets/clean-api_data.csv', index=False)