# Dataset Creation
## Disney movies
### Source : Imdb website

In [1]:
# imports 
from bs4 import BeautifulSoup as bs
import requests as req
import pandas as pd

In [2]:
base_url = "https://www.imdb.com/list/ls068561553/?sort=list_order,asc&st_dt=&mode=detail&page={}"
url_1 = base_url.format(1)
url_2 = base_url.format(2)
url_3 = base_url.format(3)
url_4 = base_url.format(4)
url_5 = base_url.format(5)
url_6 = base_url.format(6)

In [4]:
web_page_content = req.get(url_1,headers={"User-Agent": "Requests"}).content
# convert into beautifulsoup object
bsoup = bs(web_page_content)
# important content 
card_blocks = bsoup.find_all('div',{'class':'lister-item-content'})

In [7]:
# for certificate runtime and genre extraction
def get_movie_category(p):
    certificate = ''
    runtime = ''
    genre = ''
    if p.find('span',{'class':'certificate'}):
        certificate = p.find('span',{'class':'certificate'}).text.strip()
    if p.find('span',{'class':'runtime'}):
        runtime = p.find('span',{'class':'runtime'}).text.strip()
    if p.find('span',{'class':'genre'}):
        genre = p.find('span',{'class':'genre'}).text.strip()
    return (certificate,runtime,genre)

In [47]:
# Scrape all movie information from given card
def get_movie_info(card):
    movie ={}
    if card.find('h3'):
        movie_name = card.h3.text.split('.')
        movie["Movie Name"] = movie_name[1]
    if card.find('p'):
        for i,e in enumerate(get_movie_category(card.find('p'))):
            if i==0: movie['Certificate'] = e
            elif i==1: movie['RunTime'] = e  
            elif i==2: movie['Genre'] = e 
    if card.find('div',{'class':'ipl-rating-widget'}):
        rating = card.find('div',{'class':'ipl-rating-widget'}).find('span',{'class':'ipl-rating-star__rating'}).text
        movie['Rating'] = rating
    if card.find('p',{'class':'text-muted text-small'}):
        paragraphs = card.find_all('p',{'class':'text-muted text-small'})
        for p in paragraphs:
            if p.find('span',{'name':'nv'}):
                votes =''
                votes = p.find('span',{'name':'nv'}).text
                movie["Votes"] = votes
            elif p.find('a'):
                anchors = p.find_all('a')
                stars =[]
                for i,a in enumerate(anchors):
                    if i==0:
                        movie["Director"] = a.text
                    else : stars.append(a.text)
                movie["Stars"] = stars
    return movie

In [12]:
# Scrape all movies from given page by passing page of url
def get_all_movies_from_web_page(url,movies):
    web_page_content = req.get(url,headers={"User-Agent": "Requests"}).content
    # convert into beautifulsoup object
    bsoup = bs(web_page_content)
    # important content 
    card_blocks = bsoup.find_all('div',{'class':'lister-item-content'})
    for card in card_blocks:
        movies.append(get_movie_info(card))

In [48]:
# Lets pass all the souce url collected from imdb website to the above function i.e. get_all_movies_from_web_page
movies = []
# this urls contains all disney movies page 1 to page 6
get_all_movies_from_web_page(url_1,movies)
get_all_movies_from_web_page(url_2,movies)
get_all_movies_from_web_page(url_3,movies)
get_all_movies_from_web_page(url_4,movies)
get_all_movies_from_web_page(url_5,movies)
get_all_movies_from_web_page(url_6,movies)
movies[:3]

[{'Movie Name': '\nSnow White and the Seven Dwarfs\n(1937)\n',
  'Certificate': 'U',
  'RunTime': '83 min',
  'Genre': 'Animation, Family, Fantasy',
  'Rating': '7.6',
  'Director': 'William Cottrell',
  'Stars': ['David Hand',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen',
   'Adriana Caselotti',
   'Harry Stockwell',
   'Lucille La Verne',
   'Roy Atwell'],
  'Votes': '181,664'},
 {'Movie Name': '\nPinocchio\n(1940)\n',
  'Certificate': 'G',
  'RunTime': '88 min',
  'Genre': 'Animation, Comedy, Family',
  'Rating': '7.4',
  'Director': 'Norman Ferguson',
  'Stars': ['T. Hee',
   'Wilfred Jackson',
   'Jack Kinney',
   'Hamilton Luske',
   'Bill Roberts',
   'Ben Sharpsteen',
   'Dickie Jones',
   'Christian Rub',
   'Mel Blanc',
   'Billy Bletcher'],
  'Votes': '131,447'},
 {'Movie Name': '\nFantasia\n(1940)\n',
  'Certificate': '',
  'RunTime': '125 min',
  'Genre': 'Animation, Family, Fantasy',
  'Rating': '7.7',
  'Director': 'James Algar',
  'Star

In [33]:
len(movies)

542

In [15]:
object_keys = list(movies[0].keys())
object_keys

['Movie Name',
 'Certificate',
 'RunTime',
 'Genre',
 'Rating',
 'Director',
 'Stars',
 'Votes']

In [49]:
# Lets format our data to objects to lists for feeding into pandas
def format_data(movies,keys):
    movies_name = []
    certificates =[]
    run_times = []
    genres = []
    ratings = []
    directors = []
    stars =[]
    votes =[]
    for movie in movies:
        for key in keys:
            if key == 'Stars' :
                if key in movie.keys():
                    stars_string = ""
                    for star in movie["Stars"]:
                        stars_string = stars_string + star +","
                    stars.append(stars_string)
                else : stars.append(None)
            elif key == "Movie Name":
                if key in movie.keys():
                    name_filter = movie[key].strip()
                    movies_name.append(name_filter.replace("\n"," "))
                else : movies_name.append(None)
            elif key == "Certificate":
                if key in movie.keys():
                    certificates.append(movie[key])
                else : certificates.append(None)
            elif key == "RunTime":
                if key in movie.keys():
                    run_times.append(movie[key])
                else : run_times.append(None)
            elif key == "Genre":
                if key in movie.keys():
                    genres.append(movie[key])
                else : genres.append(None)
            elif key == "Rating":
                if key in movie.keys():
                    ratings.append(movie[key])
                else : ratings.append(None)
            elif key == "Director":
                if key in movie.keys():
                    directors.append(movie[key])
                else : directors.append(None)
            elif key == "Votes":
                if key in movie.keys():
                    votes.append(movie[key])
                else : votes.append(None)
    return {"Movie Name":movies_name,"Imdb Ratings":ratings,"Genre":genres,"Run Time":run_times,"Director":directors,"Stars":stars,"Votes":votes,"Certificate":certificates}

In [50]:
movies_dictionary = format_data(movies,object_keys)

In [54]:
data_frame = pd.DataFrame(movies_dictionary)
data_frame

Unnamed: 0,Movie Name,Imdb Ratings,Genre,Run Time,Director,Stars,Votes,Certificate
0,Snow White and the Seven Dwarfs (1937),7.6,"Animation, Family, Fantasy",83 min,William Cottrell,"David Hand,Wilfred Jackson,Larry Morey,Perce P...",181664,U
1,Pinocchio (1940),7.4,"Animation, Comedy, Family",88 min,Norman Ferguson,"T. Hee,Wilfred Jackson,Jack Kinney,Hamilton Lu...",131447,G
2,Fantasia (1940),7.7,"Animation, Family, Fantasy",125 min,James Algar,"Samuel Armstrong,Ford Beebe Jr.,Norman Ferguso...",88692,
3,The Reluctant Dragon (I) (1941),6.9,"Animation, Comedy, Family",74 min,Alfred L. Werker,"Hamilton Luske,Jack Cutting,Ub Iwerks,Jack Kin...",2791,
4,Dumbo (1941),7.2,"Animation, Drama, Family",64 min,Samuel Armstrong,"Norman Ferguson,Wilfred Jackson,Jack Kinney,Bi...",119841,G
...,...,...,...,...,...,...,...,...
537,Coco (I) (2017),8.4,"Animation, Adventure, Family",105 min,Lee Unkrich,"Adrian Molina,Anthony Gonzalez,Gael García Ber...",384649,U
538,Pirates of the Caribbean: Dead Men Tell No Tal...,6.5,"Action, Adventure, Fantasy",129 min,Joachim Rønning,"Espen Sandberg,Johnny Depp,Geoffrey Rush,Javie...",263960,UA
539,Descendants 2 (2017 TV Movie),6.5,"Action, Adventure, Family",111 min,Kenny Ortega,"Dove Cameron,Cameron Boyce,Sofia Carson,Booboo...",8544,
540,Beauty and the Beast (I) (2017),7.1,"Family, Fantasy, Musical",129 min,Bill Condon,"Emma Watson,Dan Stevens,Luke Evans,Josh Gad,",274633,UA


In [56]:
data_frame.to_csv('./data/Disney-Movies.csv')

### Done Scape All 542 Movies and store into Disney-movies.csv