In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np

In [3]:
url = 'https://www.imdb.com/search/title/?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=f11158cc-b50b-4c4d-b0a2-40b32863395b&pf_rd_r=XZ8X52H1R40B7KG5SNZ9&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_1'

csv1 = []

csv2 = []

page = 1
while True:
    response = requests.get(url + f'&start={page}')

    soup = BeautifulSoup(response.content, 'html.parser')

    movies = soup.find_all('div', class_='lister-item mode-advanced')

    if not movies:
        break

    for count in movies:
        # Extract data for the first CSV
        title = count.h3.a.text.strip()
        directors = count.find('p', class_='').find_all('a')
        director_names = [director.text.strip() for director in directors]
        duration = count.find('span', class_='runtime').text.strip().split()[0]
        year = count.find('span', class_='lister-item-year').text.strip('()')
        rating = count.find('strong').text.strip()
        metascore = count.find('span', class_='metascore').text.strip() if count.find('span', class_='metascore') else ''

        csv1.append({
            'S.no': len(csv1) + 1,
            'Movie Name': title,
            'Director Name': ', '.join(director_names),
            'Duration': duration,
            'Year': year,
            'Ratings': rating,
            'Metascore': metascore
        })

        stars = count.find('p', class_='').find_all('a')[1:]
        star_names = [star.text.strip() for star in stars]
        votes = count.find('span', attrs={'name': 'nv'})['data-value']
        genres = count.find('span', class_='genre').text.strip().split(', ')
        gross = count.find('p', class_='sort-num_votes-visible').find_all('span')[-1]['data-value'] if count.find('p', class_='sort-num_votes-visible').find_all('span') else ''
        popularity = count.find('div', class_='lister-item-content').find('div', class_='inline-block ratings-imdb-rating')['data-value']
        certification = count.find('span', class_='certificate').text.strip() if count.find('span', class_='certificate') else ''

        csv2.append({
            'Movie Name': title,
            'Stars': ', '.join(star_names),
            'Votes': votes,
            'Genre': ', '.join(genres),
            'Gross collection': gross,
            'Popularity': popularity,
            'Certification': certification
        })

    page += 50

In [4]:
df1 = pd.DataFrame(csv1)

df2 = pd.DataFrame(csv2)

In [5]:
df1.shape

(1775, 7)

In [6]:
df2.shape

(1775, 7)

Bifurcating the Directors column

In [7]:
directors1 = df1['Director Name'].str.split(', ', expand=True)

directors_cnt = directors1.apply(lambda row: len(row.dropna()), axis=1).max()
directors_cnt

director_col = [f'Director{i}' for i in range(1, directors_cnt + 1)]
df1[director_col] = directors1.iloc[:, :directors_cnt]

In [8]:
df1.drop("Director Name", axis=1, inplace=True)

In [9]:
df1.head()

Unnamed: 0,S.no,Movie Name,Duration,Year,Ratings,Metascore,Director1,Director2,Director3,Director4,Director5,Director6,Director7,Director8,Director9,Director10
0,1,The Dark Knight,152,2008,9.0,84,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,,,,,
1,2,The Lord of the Rings: The Return of the King,201,2003,9.0,94,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,,,,,
2,3,Spider-Man: Across the Spider-Verse,140,2023,8.8,86,Joaquim Dos Santos,Kemp Powers,Justin K. Thompson,Shameik Moore,Hailee Steinfeld,Brian Tyree Henry,Luna Lauren Velez,,,
3,4,Inception,148,2010,8.8,74,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,,,,,
4,5,The Lord of the Rings: The Fellowship of the Ring,178,2001,8.8,92,Peter Jackson,Elijah Wood,Ian McKellen,Orlando Bloom,Sean Bean,,,,,


Bifurcating the Stars & Genre columns

In [10]:
stars1 = df2['Stars'].str.split(', ', expand=True)

stars_cnt = stars1.apply(lambda row: len(row.dropna()), axis=1).max()

star_col = [f'Star{i}' for i in range(1, stars_cnt + 1)]
df2[star_col] = stars1.iloc[:, :stars_cnt]

genre1 = df2['Genre'].str.split(', ', expand=True)

genre_cnt = genre1.apply(lambda row: len(row.dropna()), axis=1).max()

# Create new columns for stars
genre_col = [f'Genre{i}' for i in range(1, genre_cnt + 1)]
df2[genre_col] = genre1.iloc[:, :genre_cnt]

In [11]:
df2.drop(["Stars", "Genre"], axis=1, inplace=True)

In [12]:
df2.head()

Unnamed: 0,Movie Name,Votes,Gross collection,Popularity,Certification,Star1,Star2,Star3,Star4,Star5,Star6,Star7,Star8,Star9,Genre1,Genre2,Genre3
0,The Dark Knight,2759160,534858444,9.0,UA,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,,,,,,Action,Crime,Drama
1,The Lord of the Rings: The Return of the King,1906339,377845905,9.0,U,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,,,,,,Action,Adventure,Drama
2,Spider-Man: Across the Spider-Verse,208178,208178,8.8,U,Kemp Powers,Justin K. Thompson,Shameik Moore,Hailee Steinfeld,Brian Tyree Henry,Luna Lauren Velez,,,,Animation,Action,Adventure
3,Inception,2449482,292576195,8.8,UA,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,,,,,,Action,Adventure,Sci-Fi
4,The Lord of the Rings: The Fellowship of the Ring,1934546,315544750,8.8,U,Elijah Wood,Ian McKellen,Orlando Bloom,Sean Bean,,,,,,Action,Adventure,Drama


Transfering data into csvs

In [13]:
df1.head()

Unnamed: 0,S.no,Movie Name,Duration,Year,Ratings,Metascore,Director1,Director2,Director3,Director4,Director5,Director6,Director7,Director8,Director9,Director10
0,1,The Dark Knight,152,2008,9.0,84,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,,,,,
1,2,The Lord of the Rings: The Return of the King,201,2003,9.0,94,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,,,,,
2,3,Spider-Man: Across the Spider-Verse,140,2023,8.8,86,Joaquim Dos Santos,Kemp Powers,Justin K. Thompson,Shameik Moore,Hailee Steinfeld,Brian Tyree Henry,Luna Lauren Velez,,,
3,4,Inception,148,2010,8.8,74,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,,,,,
4,5,The Lord of the Rings: The Fellowship of the Ring,178,2001,8.8,92,Peter Jackson,Elijah Wood,Ian McKellen,Orlando Bloom,Sean Bean,,,,,


In [14]:
df2.head()

Unnamed: 0,Movie Name,Votes,Gross collection,Popularity,Certification,Star1,Star2,Star3,Star4,Star5,Star6,Star7,Star8,Star9,Genre1,Genre2,Genre3
0,The Dark Knight,2759160,534858444,9.0,UA,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,,,,,,Action,Crime,Drama
1,The Lord of the Rings: The Return of the King,1906339,377845905,9.0,U,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,,,,,,Action,Adventure,Drama
2,Spider-Man: Across the Spider-Verse,208178,208178,8.8,U,Kemp Powers,Justin K. Thompson,Shameik Moore,Hailee Steinfeld,Brian Tyree Henry,Luna Lauren Velez,,,,Animation,Action,Adventure
3,Inception,2449482,292576195,8.8,UA,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,,,,,,Action,Adventure,Sci-Fi
4,The Lord of the Rings: The Fellowship of the Ring,1934546,315544750,8.8,U,Elijah Wood,Ian McKellen,Orlando Bloom,Sean Bean,,,,,,Action,Adventure,Drama


In [15]:
df1.to_csv("imdb_data1.csv")
df2.to_csv("imdb_data2.csv")