# Preparing Data Set IMDB

Analysis of Movies released in the cinema between 2000 and 2019 using IMDB DATA


Analyze the data available on the IMDb website for movies released in the cinema between 2000 and 2019.

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [2]:
url = "https://www.imdb.com/search/title/?release_date=2019&sort=num_votes,desc&page=1"

In [3]:
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
movie_containers = soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

<class 'bs4.element.ResultSet'>
50


In [4]:
names = []
genres = []
years = []
imdb_ratings = []
metascores = []
votes = []
grossmill = []
runtimes = []


start_time = time()
pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000,2020)]


# For every year in the interval 2000-2017
for year_url in years_url:

    # For every page in the interval 1-4
    for page in pages:

        # Make a get request
        response = requests.get('http://www.imdb.com/search/title?release_date=' + year_url +
        '&sort=num_votes,desc&page=' + page)

        

        

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'lxml')

        # Select all the 50 movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie of these 50
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # Scrape the name
                name = container.h3.a.text
                names.append(name)
                
                # Scrape the genre
                genre = container.p.find('span', class_ = 'genre').text.rstrip().replace("\n","").split(",")
                genres.append(genre)
                
                
                # Scrape the runtime
                runtime = container.p.find('span', class_ = 'runtime').text
                runtimes.append(runtime)
            
                
                # Scrape the year
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)
                
                # Scrape the IMDB rating
                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)

                # Scrape the Metascore
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # Scrape the number of votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))
                
                # Scrape the GrossMill
                gross = gross = container.find('span', text='Gross:')
                if gross:
                    gross = int(gross.find_next('span')['data-value'].replace(',', ''))
                    
                grossmill.append(gross)
                
                
                
                

In [5]:
movie_ratings = pd.DataFrame({'movie': names,
                              'year': years,
                              'imdb': imdb_ratings,
                              'metascore': metascores,
                              'genre' : genres,
                              'runtime': runtimes,
                              'gross' : grossmill,
                              'votes': votes
                              })
print(movie_ratings.info())
movie_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3616 entries, 0 to 3615
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movie      3616 non-null   object 
 1   year       3616 non-null   object 
 2   imdb       3616 non-null   float64
 3   metascore  3616 non-null   int64  
 4   genre      3616 non-null   object 
 5   runtime    3616 non-null   object 
 6   gross      3536 non-null   float64
 7   votes      3616 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 226.1+ KB
None


Unnamed: 0,movie,year,imdb,metascore,genre,runtime,gross,votes
0,Gladiator,(2000),8.5,67,"[Action, Adventure, Drama]",155 min,187705427.0,1265139
1,Memento,(2000),8.4,80,"[Mystery, Thriller]",113 min,25544867.0,1065249
2,Snatch,(2000),8.3,55,"[Comedy, Crime]",104 min,30328156.0,744029
3,Requiem for a Dream,(2000),8.3,68,[Drama],102 min,3635482.0,725875
4,X-Men,(2000),7.4,64,"[Action, Adventure, Sci-Fi]",104 min,157299717.0,550968
5,Cast Away,(2000),7.8,73,"[Adventure, Drama, Romance]",143 min,233632142.0,491129
6,American Psycho,(2000),7.6,64,"[Comedy, Crime, Drama]",101 min,15070285.0,453294
7,Unbreakable,(2000),7.3,62,"[Drama, Mystery, Sci-Fi]",106 min,95011339.0,370098
8,Mission: Impossible II,(2000),6.1,59,"[Action, Adventure, Thriller]",123 min,215409889.0,299974
9,Meet the Parents,(2000),7.0,73,"[Comedy, Romance]",108 min,166244045.0,299610


In [6]:
movie_ratings = movie_ratings[['movie', 'year', 'imdb', 'metascore', 'votes', 'genre', 'runtime', 'gross']]
movie_ratings.head()

Unnamed: 0,movie,year,imdb,metascore,votes,genre,runtime,gross
0,Gladiator,(2000),8.5,67,1265139,"[Action, Adventure, Drama]",155 min,187705427.0
1,Memento,(2000),8.4,80,1065249,"[Mystery, Thriller]",113 min,25544867.0
2,Snatch,(2000),8.3,55,744029,"[Comedy, Crime]",104 min,30328156.0
3,Requiem for a Dream,(2000),8.3,68,725875,[Drama],102 min,3635482.0
4,X-Men,(2000),7.4,64,550968,"[Action, Adventure, Sci-Fi]",104 min,157299717.0


In [7]:
movie_ratings['year'].unique()

array(['(2000)', '(I) (2000)', '(2001)', '(2002)', '(2003)', '(2004)',
       '(I) (2004)', '(2005)', '(I) (2005)', '(2006)', '(I) (2006)',
       '(2007)', '(I) (2007)', '(2008)', '(I) (2008)', '(2009)',
       '(I) (2009)', '(2010)', '(I) (2010)', '(2011)', '(I) (2011)',
       '(2012)', '(I) (2012)', '(2013)', '(I) (2013)', '(2014)',
       '(I) (2014)', '(II) (2014)', '(2015)', '(I) (2015)', '(II) (2015)',
       '(2016)', '(II) (2016)', '(I) (2016)', '(IX) (2016)', '(2017)',
       '(I) (2017)', '(2018)', '(I) (2018)', '(III) (2018)', '(2019)',
       '(II) (2019)', '(I) (2019)'], dtype=object)

In [8]:
movie_ratings.loc[:, 'year'] = movie_ratings['year'].str[-5:-1].astype(int)

In [9]:
movie_ratings['year'].head(3)

0    2000
1    2000
2    2000
Name: year, dtype: int32

In [10]:
movie_ratings['genre'].head(3)

0    [Action,  Adventure,  Drama]
1            [Mystery,  Thriller]
2                [Comedy,  Crime]
Name: genre, dtype: object

In [11]:
movie_ratings.describe().loc[['min', 'max'], ['imdb', 'metascore']]

Unnamed: 0,imdb,metascore
min,4.1,24.0
max,9.0,100.0


In [12]:
movie_ratings['n_imdb'] = movie_ratings['imdb'] * 10
movie_ratings.head(3)

Unnamed: 0,movie,year,imdb,metascore,votes,genre,runtime,gross,n_imdb
0,Gladiator,2000,8.5,67,1265139,"[Action, Adventure, Drama]",155 min,187705427.0,85.0
1,Memento,2000,8.4,80,1065249,"[Mystery, Thriller]",113 min,25544867.0,84.0
2,Snatch,2000,8.3,55,744029,"[Comedy, Crime]",104 min,30328156.0,83.0


In [14]:
movie_ratings.to_csv('df_movie_ratings.csv')

In [15]:
print("There are 3616 data set")

There are 3616 data set
