# In [None]:
#web scraper (start)
#importing necessary libraries
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

#ask the server for English content so that foreign-language movies come back with English text
headers = {"Accept-Language": "en-US, en;q=0.5"}

#Initialize lists to store data.
#Each list receives exactly one entry per movie container, so they all stay the
#same length and can be used as columns of one table.
titles = []
years = []
#NOTE(review): this name shadows the stdlib module name `time`; it is kept because
#code further down the file reads this list by that name
time = []
imdb_ratings = []
metascores = []
votes = []
us_gross = []
director = []

#offsets of the first result on each page (10 pages x 100 results).
#The later pages start at 101, 201, ... so the offsets are 1-based; the first page
#therefore starts at 1, not 0.
start = [str(offset) for offset in range(1, 1001, 100)]

for start_itr in start:
    #requesting URL contents; the timeout keeps a stalled connection from hanging the run forever
    response = get('https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start='+start_itr+'&ref_=adv_prv',headers = headers, timeout=30)
    #stop on an HTTP error instead of silently parsing an error page into zero rows
    response.raise_for_status()

    #reading the content using beautiful soup
    soup = BeautifulSoup(response.text,"html.parser")

    #storing all div containers (one per movie)
    movie_div = soup.find_all('div', class_='lister-item mode-advanced')

    #looping through all div containers to get useful data
    for container in movie_div:
        #Fetching titles
        name = container.h3.a.text
        titles.append(name)

        #Fetching year
        year = container.h3.find('span', class_='lister-item-year').text
        years.append(year)

        #fetching duration; '-' marks a missing or empty runtime.
        #The element itself is tested before reading .text, because find() returns
        #None when the span is absent and None has no .text attribute.
        runtime_span = container.find('span', class_='runtime')
        length = runtime_span.text if runtime_span is not None and runtime_span.text else '-'
        time.append(length)

        #fetching ratings
        ratings = float(container.strong.text)
        imdb_ratings.append(ratings)

        #fetching metascore; '-' marks a movie without one
        metascore_span = container.find('span', class_='metascore')
        score = metascore_span.text if metascore_span is not None else '-'
        metascores.append(score)

        #fetching votes and earnings: the first 'nv' span is read as the vote count,
        #the second one (when present) as the gross earnings
        nv = container.find_all('span',attrs={'name':'nv'})
        vote = nv[0].text
        votes.append(vote)
        grosses = nv[1].text if len(nv)>1 else '-'
        us_gross.append(grosses)

        #fetching directors name (first link inside the first <p> that has an empty class)
        dirName = container.find('p',class_='').a.text
        director.append(dirName)
#Web scraper complete (end)

#Creating a dataframe with one row per scraped movie
moviesDf = pd.DataFrame({
    'movie':titles,
    'year':years,
    'timeMin':time,
    'imdb':imdb_ratings,
    'metascore':metascores,
    'votes':votes,
    'us_grossMillions':us_gross,
    'director':director
})

#data cleaning start
#checking for duplicate rows (kept for inspection; nothing is dropped here)
duplicateRowsDF = moviesDf[moviesDf.duplicated()]

#Data cleaning to convert various object datatype to int and float.
#Raw-string patterns avoid the invalid "\d" escape warning, and expand=False makes
#str.extract return a Series instead of a one-column DataFrame.
moviesDf['year'] = moviesDf['year'].str.extract(r'(\d+)', expand=False).astype(int)

#runtime and metascore may hold the '-' placeholder; to_numeric(errors='coerce')
#turns those into NaN instead of raising the way astype(int) would
moviesDf['timeMin'] = pd.to_numeric(moviesDf['timeMin'].str.extract(r'(\d+)', expand=False), errors='coerce')
moviesDf['metascore'] = pd.to_numeric(moviesDf['metascore'].str.extract(r'(\d+)', expand=False), errors='coerce')

#strip the thousands separators before converting the vote counts
moviesDf['votes'] = moviesDf['votes'].str.replace(',', '', regex=False).astype(int)

#drop the leading '$' and trailing 'M'; the '-' placeholder becomes NaN
moviesDf['us_grossMillions'] = pd.to_numeric(
    moviesDf['us_grossMillions'].str.lstrip('$').str.rstrip('M'),
    errors='coerce',
)

#some insights
def _ranked_by(column):
    """Return moviesDf sorted by `column`, largest values first."""
    return moviesDf.sort_values(by=column, ascending=False)

#top 10 movies by imdb ratings: the first ten rows as they stand, no sort is applied here
imdb_top10 = moviesDf.head(10)

#top 10 movies by metascores
metascoreDf = _ranked_by('metascore')
metascoreDf_top10 = metascoreDf.head(10)

#top 10 movies by earnings
us_grossMillionsDf = _ranked_by('us_grossMillions')
us_grossMillionsDf_top10 = us_grossMillionsDf.head(10)

#top 10 movies by votes
votesDf = _ranked_by('votes')
votesDf_top10 = votesDf.head(10)

#Number of movies made by each director
director_movie_count = moviesDf.director.value_counts()

#handling missing values: every gap is replaced by the mean of its own column.
#This runs after the rankings above, so those were built on the unfilled data.
mean_mscore = moviesDf['metascore'].mean()
mean_gross = moviesDf['us_grossMillions'].mean()
moviesDf['metascore'] = moviesDf['metascore'].fillna(value=mean_mscore)
moviesDf['us_grossMillions'] = moviesDf['us_grossMillions'].fillna(value=mean_gross)

#writing the data frame to a csv file
moviesDf.to_csv('top_1000_movies.csv')

#movies by directors: one sub-frame per director, selected by exact name match
director_names = moviesDf['director']
spielberg_movies = moviesDf.loc[director_names.eq('Steven Spielberg')]
hitchcock_movies = moviesDf.loc[director_names.eq('Alfred Hitchcock')]
martin_movies = moviesDf.loc[director_names.eq('Martin Scorsese')]
stanley_movies = moviesDf.loc[director_names.eq('Stanley Kubrick')]
akira_movies = moviesDf.loc[director_names.eq('Akira Kurosawa')]
nolan_movies = moviesDf.loc[director_names.eq('Christopher Nolan')]
tarantino_movies = moviesDf.loc[director_names.eq('Quentin Tarantino')]

#dataframe for directors with most movies(top5)+2extra, stacked in the order listed
selected_director_frames = [
    spielberg_movies,
    hitchcock_movies,
    martin_movies,
    stanley_movies,
    akira_movies,
    nolan_movies,
    tarantino_movies,
]
directorsDf = pd.concat(selected_director_frames)

#for plotting graphs
import plotly.express as px
import plotly.graph_objects as go

#sunburst for the selected directors: inner ring = director, outer ring = movie
fig = px.sunburst(
    directorsDf,
    path=['director', 'movie'],
    hover_data=['movie'],
)
fig.show()

#finding relation between votes and years (bubble chart).
#Only movies rated 8 or higher are plotted; bubble size follows the gross earnings.
#NOTE(review): the x axis (year) uses a log scale - confirm that this is intended.
highly_rated = directorsDf.query("imdb >= 8")
bubble_options = dict(
    x="year",
    y="votes",
    size="us_grossMillions",
    color="director",
    hover_name="movie",
    log_x=True,
    size_max=60,
)
fig = px.scatter(highly_rated, **bubble_options)
fig.show()

def _blues_heatmap(matrix):
    """Build a heatmap figure of `matrix`, labelling both axes with its column names."""
    trace = go.Heatmap(
        z=matrix,
        x=matrix.columns,
        y=matrix.columns,
        hoverongaps=True,
        colorscale="blues",
    )
    return go.Figure(data=trace)

#columns that take part in the correlation matrices
numeric_columns = ["year", "imdb", "timeMin", "metascore", "votes", "us_grossMillions"]

#heatmap for top5+2 extra highest movie making directors
corr = directorsDf[numeric_columns].corr()
fig = _blues_heatmap(corr)
fig.show()

#heatmap for entire movies dataset
corr = moviesDf[numeric_columns].corr()
fig = _blues_heatmap(corr)
fig.show()
