# Dataset Creation
## Netflix Original Movies 
### Imdb website

In [1]:
# imports 
from bs4 import BeautifulSoup as bs
import requests as req
import pandas as pd

In [2]:
base_url = "https://www.imdb.com/list/ls043455037/?sort=list_order,asc&st_dt=&mode=detail&page={}"
base_url_1 = base_url.format(1)
base_url_2 = base_url.format(2)
base_url_3 = base_url.format(3)
base_url_4 = base_url.format(4)

In [16]:
web_page_content = req.get(base_url_1,headers={"User-Agent": "Requests"}).content
# convert into beautifulsoup object
bsoup = bs(web_page_content)
# important content 
card_blocks = bsoup.find_all('div',{'class':'lister-item-content'})


In [3]:
def get_movie_category(p):
    certificate = ''
    runtime = ''
    genre = ''
    if p.find('span',{'class':'certificate'}):
        certificate = p.find('span',{'class':'certificate'}).text.strip()
    if p.find('span',{'class':'runtime'}):
        runtime = p.find('span',{'class':'runtime'}).text.strip()
    if p.find('span',{'class':'genre'}):
        genre = p.find('span',{'class':'genre'}).text.strip()
    return (certificate,runtime,genre)

In [65]:
movies = []
for card in card_blocks:
    movies.append(get_movie_info(card))

In [6]:
# Scrape an one movie information and put into dictionary
def get_movie_info(card):
    movie ={}
    if card.find('h3'):
        movie_name = card.h3.text.split('.')
        movie["Movie Name"] = movie_name[1]
    if card.find('p'):
        for i,e in enumerate(get_movie_category(card.find('p'))):
            if i==0: movie['Certificate'] = e
            elif i==1: movie['RunTime'] = e  
            elif i==2: movie['Genre'] = e 
    if card.find('div',{'class':'ipl-rating-widget'}):
        rating = card.find('div',{'class':'ipl-rating-widget'}).find('span',{'class':'ipl-rating-star__rating'}).text
        movie['Rating'] = rating
    if card.find('p',{'class':'text-muted text-small'}):
        paragraphs = card.find_all('p',{'class':'text-muted text-small'})
        for p in paragraphs:
            if p.find('span',{'name':'nv'}):
                votes = p.find('span',{'name':'nv'}).text
                movie["Votes"] = votes
            elif p.find('a'):
                anchors = p.find_all('a')
                stars =[]
                for i,a in enumerate(anchors):
                    if i==0:
                        movie["Director"] = a.text
                    else : stars.append(a.text)
                movie["Stars"] = stars
    return movie

## Generalized function that takes page url and return array of Netflix Movies

In [5]:
# Scrape all movies from web page by passing url to the function
def get_all_movies_from_web_page(url,movies):
    web_page_content = req.get(url,headers={"User-Agent": "Requests"}).content
    # convert into beautifulsoup object
    bsoup = bs(web_page_content)
    # important content 
    card_blocks = bsoup.find_all('div',{'class':'lister-item-content'})
    for card in card_blocks:
        movies.append(get_movie_info(card))

In [7]:
movies = []
get_all_movies_from_web_page(base_url_1,movies)
get_all_movies_from_web_page(base_url_2,movies)
get_all_movies_from_web_page(base_url_3,movies)
get_all_movies_from_web_page(base_url_4,movies)
movies[:3]

[{'Movie Name': '\nBeasts of No Nation\n(2015)\n',
  'Certificate': '',
  'RunTime': '137 min',
  'Genre': 'Drama, War',
  'Rating': '7.7',
  'Director': 'Cary Joji Fukunaga',
  'Stars': ['Abraham Attah',
   'Emmanuel Affadzi',
   'Ricky Adelayitor',
   'Andrew Adote'],
  'Votes': '73,983'},
 {'Movie Name': '\nCrouching Tiger, Hidden Dragon: Sword of Destiny\n(2016)\n',
  'Certificate': 'PG-13',
  'RunTime': '96 min',
  'Genre': 'Action, Adventure, Drama',
  'Rating': '6.1',
  'Director': 'Woo-Ping Yuen',
  'Stars': ['Donnie Yen',
   'Michelle Yeoh',
   'Harry Shum Jr.',
   'Natasha Liu Bordizzo'],
  'Votes': '17,882'},
 {'Movie Name': '\nThe Fundamentals of Caring\n(2016)\n',
  'Certificate': '16',
  'RunTime': '97 min',
  'Genre': 'Comedy, Drama',
  'Rating': '7.3',
  'Director': 'Rob Burnett',
  'Stars': ['Craig Roberts', 'Paul Rudd', 'Selena Gomez', 'Alex Huff'],
  'Votes': '60,034'}]

In [9]:
# how many movies scraped
len(movies)

357

In [10]:
# movie information labels
object_keys = list(movies[0].keys())
object_keys

['Movie Name',
 'Certificate',
 'RunTime',
 'Genre',
 'Rating',
 'Director',
 'Stars',
 'Votes']

In [12]:
# Lets format our data to objects to lists for feeding into pandas
def format_data(movies,keys):
    movies_name = []
    certificates =[]
    run_times = []
    genres = []
    ratings = []
    directors = []
    stars =[]
    votes =[]
    for movie in movies:
        for key in keys:
            if key == 'Stars' :
                if key in movie.keys():
                    stars_string = ""
                    for star in movie["Stars"]:
                        stars_string = stars_string + star +","
                    stars.append(stars_string)
                else : stars.append(None)
            elif key == "Movie Name":
                if key in movie.keys():
                    name_filter = movie[key].strip()
                    movies_name.append(name_filter.replace("\n"," "))
                else : movies_name.append(None)
            elif key == "Certificate":
                if key in movie.keys():
                    certificates.append(movie[key])
                else : certificates.append(None)
            elif key == "RunTime":
                if key in movie.keys():
                    run_times.append(movie[key])
                else : run_times.append(None)
            elif key == "Genre":
                if key in movie.keys():
                    genres.append(movie[key])
                else : genres.append(None)
            elif key == "Rating":
                if key in movie.keys():
                    ratings.append(movie[key])
                else : ratings.append(None)
            elif key == "Director":
                if key in movie.keys():
                    directors.append(movie[key])
                else : directors.append(None)
            elif key == "Votes":
                if key in movie.keys():
                    votes.append(movie[key])
                else : votes.append(None)
    return {"Movie Name":movies_name,"Imdb Ratings":ratings,"Genre":genres,"Run Time":run_times,"Director":directors,"Stars":stars,"Votes":votes,"Certificate":certificates}

In [13]:
# lets convert our movies into dictionaries to lists in order to convert into pandas dataframe
movies_dictionary = format_data(movies,object_keys)

In [14]:
# convert movies into pandas data frame
data_frame = pd.DataFrame(movies_dictionary)
data_frame

Unnamed: 0,Movie Name,Imdb Ratings,Genre,Run Time,Director,Stars,Votes,Certificate
0,Beasts of No Nation (2015),7.7,"Drama, War",137 min,Cary Joji Fukunaga,"Abraham Attah,Emmanuel Affadzi,Ricky Adelayito...",73983,
1,"Crouching Tiger, Hidden Dragon: Sword of Desti...",6.1,"Action, Adventure, Drama",96 min,Woo-Ping Yuen,"Donnie Yen,Michelle Yeoh,Harry Shum Jr.,Natash...",17882,PG-13
2,The Fundamentals of Caring (2016),7.3,"Comedy, Drama",97 min,Rob Burnett,"Craig Roberts,Paul Rudd,Selena Gomez,Alex Huff,",60034,16
3,Rebirth (I) (2016),5,Thriller,100 min,Karl Mueller,"Fran Kranz,Adam Goldberg,Nicky Whelan,Kat Foster,",4444,
4,Tallulah (2016),6.7,"Comedy, Drama, Romance",111 min,Sian Heder,"Elliot Page,Allison Janney,Tammy Blanchard,Eva...",18309,
...,...,...,...,...,...,...,...,...
352,Sahara (2017),5.6,"Animation, Adventure, Comedy",86 min,Pierre Coré,"Omar Sy,Louane Emera,Franck Gastambide,Vincent...",2448,
353,Croc-Blanc (2018),6.9,"Animation, Adventure",85 min,Alexandre Espigares,"Raphaël Personnaz,Virginie Efira,Dominique Pin...",3151,
354,An Afternoon with SCTV (TV Special),,Talk-Show,,Martin Scorsese,"Catherine O'Hara,Eugene Levy,Rick Moranis,Mart...",,
355,Derren Brown: Miracle (2016 TV Special),6.6,Documentary,73 min,Julia Knowles,"John L. Spencer,Derren Brown,",924,


### Saving Dataset into csv file

In [15]:
# convert our dataframe into csv file in order to create dataset
data_frame.to_csv('./data/netflix-original-movies.csv')

### Done scrape all Netflix Original movies from Imdb website