## importing core libraries

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

## making a request to the target url; we send a 'User-agent' header in order to bypass the forbidden error

In [2]:
# Address of the IMDb list page that the rest of the notebook parses.
url = 'https://www.imdb.com/list/ls048276758/'
# Browser-like User-agent header sent with the request.
headers = {
    'User-agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# A timeout stops the cell from hanging indefinitely on a stalled connection.
# No sleep is needed here: this is the only request the notebook makes.
response = requests.get(url, headers=headers, timeout=30)
# Stop on an HTTP error status instead of silently parsing an error page,
# which would leave every later find_all() result empty.
response.raise_for_status()
soup = BeautifulSoup(response.text,'html.parser')

## extracting the names of the movies

In [3]:
# All <h3> elements carrying the 'ipc-title__text' class.
# Searching the already-downloaded soup is purely local work, so no delay
# is required before it.
names = soup.find_all('h3',class_ = 'ipc-title__text')

In [4]:
# Text content of every element in `names`, in document order.
movie_names = [heading.text for heading in names]

In [5]:
# Sanity check: how many heading texts were collected into movie_names.
len(movie_names)

27

## scraping the overviews of the movies

In [6]:
# All <div> elements whose class is 'ipc-html-content-inner-div'.
descriptions = soup.find_all('div', attrs={'class': 'ipc-html-content-inner-div'})

In [7]:
# Text content of every element in `descriptions`, in document order.
movie_description = [block.text for block in descriptions]

In [8]:
# Peek at the last collected description text.
movie_description[-1]

'When the police in a German city are unable to catch a child-murderer, other criminals join in the manhunt.'

In [9]:
# Drop the first matched div: it is not a movie overview
# (presumably the list's own description - confirm against the page).
# The list is rebuilt from `descriptions` instead of re-slicing
# movie_description, so re-running this cell never removes more than that
# one leading entry.
movie_description = [dec.text for dec in descriptions][1:]
# Show the new first entry to confirm it is the first movie's overview.
movie_description[0]

'Don Vito Corleone, head of a mafia family, decides to hand over his empire to his youngest son, Michael. However, his decision unintentionally puts the lives of his loved ones in grave danger.'

## scraping the duration and year

In [10]:
# Match on the descriptive 'dli-title-metadata-item' class only. Passing the
# full "sc-b189961a-8 kLaxqf dli-title-metadata-item" string makes
# BeautifulSoup compare the exact class attribute value, which stops matching
# as soon as the generated class names change or are reordered.
contents = soup.find_all('span',class_='dli-title-metadata-item')

In [11]:
# Flat list with the text of every metadata span in `contents`.
movie_contents = [item.text for item in contents]

In [12]:
# Preview the first ten metadata strings to see how the flat list is laid out.
movie_contents[:10]

['1972',
 '2h 55m',
 'R',
 '1953',
 '2h 16m',
 'Not Rated',
 '1954',
 '3h 27m',
 'Not Rated',
 '1974']

In [13]:
# Every third entry starting at index 0 - the year slot in the preview output.
# NOTE(review): assumes each title contributes exactly three metadata entries;
# a title missing one would shift every later value - confirm.
movie_year = movie_contents[0::3]

In [14]:
# Preview the first ten extracted years.
movie_year[:10]

['1972',
 '1953',
 '1954',
 '1974',
 '1942',
 '1941',
 '1962',
 '1957',
 '1950',
 '1985']

In [15]:
# Every third entry starting at index 1 - the duration slot in the preview output.
# NOTE(review): relies on the same three-entries-per-title layout as the year
# extraction - confirm.
movie_duration = movie_contents[1::3]

In [16]:
# Preview the first five extracted durations.
movie_duration[:5]

['2h 55m', '2h 16m', '3h 27m', '3h 22m', '1h 42m']

## extracting the ratings

In [17]:
# All <span> elements whose class attribute is exactly the rating-star string below.
ratings = soup.find_all(
    'span',
    attrs={'class': "ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating"},
)

In [18]:
# Keep only the part of each span's text that precedes the first
# non-breaking space; the remainder of the text is discarded.
movie_ratings = [star.text.partition("\xa0")[0] for star in ratings]

In [19]:
# Preview the first five extracted rating strings.
movie_ratings[:5]

['9.2', '8.1', '8.6', '9.0', '8.5']

## extracting the poster urls of the movies

In [20]:
# All <img> elements whose class is 'ipc-image'.
imageURLs = soup.find_all('img', attrs={'class': 'ipc-image'})

In [21]:
# Collect the src attribute of every image element.
# A comprehension keeps its loop variable local, so the page address stored
# in the notebook-level `url` variable is no longer overwritten by this cell.
movie_poster_url = [img['src'] for img in imageURLs]

## creating a dataframe

In [22]:
# Column labels, in the same order as the lists passed to zip() below.
columns = ['Name','Overview','Poster_imG_url','Year','Duration','Ratings']
# One tuple per row. zip() stops at the shortest input list, so surplus
# entries in any longer list are dropped without warning.
movie_data = list(
    zip(
        movie_names,
        movie_description,
        movie_poster_url,
        movie_year,
        movie_duration,
        movie_ratings,
    )
)
df = pd.DataFrame(data=movie_data, columns=columns)

In [23]:
# Display the first rows to check that the columns line up per movie.
df.head()

Unnamed: 0,Name,Overview,Poster_imG_url,Year,Duration,Ratings
0,1. The Godfather,"Don Vito Corleone, head of a mafia family, dec...",https://m.media-amazon.com/images/M/MV5BM2MyNj...,1972,2h 55m,9.2
1,2. Tôkyô monogatari,An old couple visit their children and grandch...,https://m.media-amazon.com/images/M/MV5BM2E1Zm...,1953,2h 16m,8.1
2,3. Shichinin no samurai,Farmers from a village exploited by bandits hi...,https://m.media-amazon.com/images/M/MV5BNTkwY2...,1954,3h 27m,8.6
3,4. The Godfather Part II,The early life and career of Vito Corleone in ...,https://m.media-amazon.com/images/M/MV5BMWMwMG...,1974,3h 22m,9.0
4,5. Casablanca,A cynical expatriate American cafe owner strug...,https://m.media-amazon.com/images/M/MV5BY2IzZG...,1942,1h 42m,8.5


## saving data

In [24]:
# Write the table to disk. index=True is the pandas default, spelled out to make
# explicit that the row index is saved as the first, unnamed column.
df.to_csv('IMDB_MOVIES.csv', index=True)