# Webscraping from IMDB

* Goal: Scrape each movie's title, synopsis, and genre(s) from IMDb.
* Ensure that every genre is well sampled.

## Set-up

### Import Libraries

In [25]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Initializing 

In [1]:
# Per-movie output columns: position i in each list refers to the same movie
movie_title_list, movie_genre_list, movie_synopsis_list = [], [], []

# Bookkeeping for sampling: titles already recorded, and a genre -> count tally
movie_title_set, movie_genre_dict = set(), dict()

### Helper Functions to Fetch and Clean Movie Data

In [27]:
# Get Movie Title
def getMovieTitle(movie):
    """Return the title text of one search-result entry.

    Looks up the element with class ``lister-item-header`` inside ``movie``
    and returns the text of the first ``<a>`` found within it.

    Args:
        movie: A parsed result element exposing a BeautifulSoup-style
            ``find`` method.

    Returns:
        The link text, or the placeholder string ``"NA"`` when the lookup
        fails (for example because an expected element is missing).
    """
    try:
        header = movie.find(class_="lister-item-header")
        return header.find("a").getText()
    except Exception:
        # Best-effort by design: any lookup failure becomes "NA". Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        return "NA"

# Gets Movie Synopsis
def getMovieSynopsis(movie):
    """Return the synopsis text of one search-result entry.

    Finds the ``div`` with class ``lister-item-content`` inside ``movie`` and
    returns the text of its second ``<p>`` element with surrounding
    whitespace removed.

    Args:
        movie: A parsed result element exposing a BeautifulSoup-style
            ``find`` method.

    Returns:
        The stripped paragraph text, or the placeholder string ``"NA"`` when
        the lookup fails (missing container, fewer than two paragraphs, ...).
    """
    try:
        content = movie.find("div", class_="lister-item-content")
        # Index 1: the synopsis is read from the second paragraph of the block.
        synopsis_paragraph = content.find_all("p")[1]
        # strip() already trims both ends, so no extra rstrip() is needed.
        return synopsis_paragraph.getText().strip()
    except Exception:
        # Best-effort by design: any lookup failure becomes "NA". Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        return "NA"

# Gets Movie Genre
def getMovieGenre(movie):
    """Return the raw genre string of one search-result entry.

    Reads the text of the element with class ``genre`` inside ``movie`` and
    removes surrounding whitespace. The string is returned as-is, without
    being split into individual genres.

    Args:
        movie: A parsed result element exposing a BeautifulSoup-style
            ``find`` method.

    Returns:
        The stripped genre text, or the placeholder string ``"NA"`` when the
        lookup fails (for example because no genre element is present).
    """
    try:
        genre_element = movie.find(class_="genre")
        # strip() already trims both ends, so no extra rstrip() is needed.
        return genre_element.getText().strip()
    except Exception:
        # Best-effort by design: any lookup failure becomes "NA". Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        return "NA"

### Main Function to Fetch Movie Data

In [55]:
# Fetch Movie Data and adds them to the data fields lists
def fetchData(url, timeout=30):
    """Scrape one search-result page into the module-level accumulators.

    Downloads ``url``, finds every ``div`` with class
    ``lister-item mode-advanced`` and, for each movie whose title has not
    been recorded yet, appends its title, genre string and synopsis to
    ``movie_title_list``, ``movie_genre_list`` and ``movie_synopsis_list``.
    The title is also added to ``movie_title_set`` and every genre in the
    ", "-separated genre string is tallied in ``movie_genre_dict``.

    Args:
        url: Address of the search-result page to scrape.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        None. Results are written to the module-level lists, set and dict.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # The context manager releases the connection even if a later step raises.
    with requests.get(url, timeout=timeout) as response:
        # Fail loudly: an error page would otherwise parse to zero movies and
        # silently leave a genre under-sampled.
        response.raise_for_status()
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.text, "html.parser")

    try:
        for movie in soup.find_all("div", class_="lister-item mode-advanced"):
            movie_title = getMovieTitle(movie)
            # De-duplicate by title: the same movie shows up under several genres.
            if movie_title in movie_title_set:
                continue

            movie_title_set.add(movie_title)
            movie_title_list.append(movie_title)

            movie_genres_string = getMovieGenre(movie)
            for genre in movie_genres_string.split(", "):
                movie_genre_dict[genre] = movie_genre_dict.get(genre, 0) + 1
            movie_genre_list.append(movie_genres_string)

            movie_synopsis_list.append(getMovieSynopsis(movie))
    finally:
        # Free the parse tree whether or not extraction succeeded.
        soup.decompose()

## Fetching Movies Data

Data was fetched iteratively: after each run, the genre frequency distribution was inspected to see which genres still needed more titles, and the URL list was extended with additional result pages for those genres.

In [82]:
# URL of a genre's first result page, as captured from IMDb's genre browser.
# Only the genre slug and the numeric suffix of `ref_` differ between genres.
FIRST_PAGE_URL = (
    "https://www.imdb.com/search/title/?genres={genre}&title_type=feature&explore=genres"
    "&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=facfbd0c-6f3d-4c05-9348-22eebd58852e"
    "&pf_rd_r=WG06R93M90RYQS1FJYVC&pf_rd_s=center-6&pf_rd_t=15051&pf_rd_i=genre"
    "&ref_=ft_gnr_mvpop_{ref_index}"
)

# URL of a follow-up result page; `start` is the 1-based offset of its first result.
NEXT_PAGE_URL = (
    "https://www.imdb.com/search/title/?title_type=feature&genres={genre}"
    "&start={start}&explore=genres&ref_=adv_nxt"
)

# Follow-up pages start at offsets 51, 101, 151, ... i.e. they advance by 50.
RESULTS_PER_PAGE = 50

# (genre slug, `ref_` suffix of the first-page link, number of result pages to fetch).
# Raise the page count of a genre here when its tally in movie_genre_dict is too low.
GENRE_SAMPLING_PLAN = [
    ("action", 1, 1),
    ("adventure", 2, 1),
    ("animation", 3, 2),
    ("biography", 4, 1),
    ("comedy", 5, 1),
    ("family", 9, 1),
    ("fantasy", 10, 3),
    ("film-noir", 11, 3),
    ("history", 12, 2),
    ("horror", 13, 2),
    ("music", 14, 2),
    ("musical", 15, 4),
    ("mystery", 16, 3),
    ("romance", 17, 1),
    ("sci-fi", 18, 3),
    ("sport", 20, 3),
    ("thriller", 22, 1),
    ("war", 23, 4),
    ("western", 24, 2),
]

# Expand the plan into concrete URLs: first page, then follow-up pages in order.
url_list = []
for genre, ref_index, page_count in GENRE_SAMPLING_PLAN:
    url_list.append(FIRST_PAGE_URL.format(genre=genre, ref_index=ref_index))
    for page in range(1, page_count):
        url_list.append(
            NEXT_PAGE_URL.format(genre=genre, start=page * RESULTS_PER_PAGE + 1)
        )

for url in url_list:
    fetchData(url)

In [121]:
# Show the per-genre tallies accumulated during scraping, to judge how evenly genres are sampled
movie_genre_dict

{'Action': 363,
 'Adventure': 405,
 'Fantasy': 127,
 'Thriller': 139,
 'Crime': 217,
 'Drama': 806,
 'Sci-Fi': 106,
 'Comedy': 391,
 'History': 112,
 'Horror': 121,
 'Animation': 158,
 'Mystery': 137,
 'Biography': 155,
 'Family': 112,
 'War': 81,
 'Music': 85,
 'Romance': 162,
 'Sport': 94,
 'Western': 59,
 'Musical': 78,
 'Film-Noir': 137}

In [122]:
# Number of synopsis entries collected so far
len(movie_synopsis_list)

1431

## Convert to CSV

In [123]:
# Combine the three parallel lists into a single table, one row per movie
movie_df = pd.DataFrame(
    data={
        "title": movie_title_list,
        "genre": movie_genre_list,
        "synopsis": movie_synopsis_list,
    }
)

# Write the table to disk as CSV, leaving out the positional row index
movie_df.to_csv(path_or_buf="movies.csv", index=False)