# Web scraping 

In [1]:
# https://www.dataquest.io/blog/web-scraping-beautifulsoup/
from requests import get
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
response = get(url)
print(response.text[:500])




<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle"


In [2]:
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
html_soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Released between 2017-01-01 and 2017-12-31
(Sorted by Number of Votes Descending) - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1

In [3]:
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

<class 'bs4.element.ResultSet'>
50


In [4]:
first_movie = html_soup('div', class_ = 'lister-item mode-advanced')[0]
first_movie.h3.a.get_text()

'Logan'

In [5]:
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold')
first_year.get_text()

'(2017)'

In [6]:
first_rating = float(first_movie.strong.text)
first_rating

8.1

In [7]:
first_meta_score = first_movie.find('span', class_ = 'metascore favorable')
first_meta_score = float(first_meta_score.text)
first_meta_score

77.0

In [8]:
first_votes = first_movie.find('span', attrs = {'name':'nv'})
first_votes = float(first_votes['data-value'])
first_votes

609971.0

In [9]:
first_genre = first_movie.find('span', class_ = 'genre')
first_genre = first_genre.text
first_genre

'\nAction, Drama, Sci-Fi            '

In [10]:
first_Director_Stars = first_movie.find_all("p")
first_Director_Stars = first_Director_Stars[2]
first_Director_Stars = first_Director_Stars.find_all("a")
stars = []
for i in range(len(first_Director_Stars)):
    stars.append(first_Director_Stars[i].get_text())

stars

['James Mangold',
 'Hugh Jackman',
 'Patrick Stewart',
 'Dafne Keen',
 'Boyd Holbrook']

# Combining all of the above methods inside a single for loop 

In [11]:
# Lists to store the scraped data in

names = []
years = []
imdb_ratings = []
metascores = []
votes = []
genre = []
Dstars = []

# Extract data from individual movie container
for container in movie_containers:
# If the movie has Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        stars = []
# The name
        name = container.h3.a.text
        names.append(name)
# The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
# The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
# The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))
# The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))
# Genres
        genres = container.find('span', class_ = 'genre').text
        genres = genres.replace("\n","")
        genre.append(genres)
# Director and stars
        first_Director_Stars = container.find_all("p")
        first_Director_Stars = first_Director_Stars[2]
        first_Director_Stars = first_Director_Stars.find_all("a")
        for i in range(len(first_Director_Stars)):
            stars.append(first_Director_Stars[i].get_text())
        Dstars.append(stars)
            

# Making a dataframe using the data we got in the previous step

In [12]:
import pandas as pd
test_df = pd.DataFrame({'Movie': names,
'Year': years,
'Rating': imdb_ratings,
'Metascore': metascores,
'Votes': votes,
'Genre': genre,
'Director and stars':Dstars
})
test_df

Unnamed: 0,Movie,Year,Rating,Metascore,Votes,Genre,Director and stars
0,Logan,(2017),8.1,77,609971,"Action, Drama, Sci-Fi","[James Mangold, Hugh Jackman, Patrick Stewart,..."
1,Thor: Ragnarok,(2017),7.9,74,544280,"Action, Adventure, Comedy","[Taika Waititi, Chris Hemsworth, Tom Hiddlesto..."
2,Guardians of the Galaxy Vol. 2,(2017),7.6,67,533717,"Action, Adventure, Comedy","[James Gunn, Chris Pratt, Zoe Saldana, Dave Ba..."
3,Star Wars: The Last Jedi,(2017),7.0,85,532955,"Action, Adventure, Fantasy","[Rian Johnson, Daisy Ridley, John Boyega, Mark..."
4,Wonder Woman,(2017),7.4,76,523851,"Action, Adventure, Fantasy","[Patty Jenkins, Gal Gadot, Chris Pine, Robin W..."
5,Dunkirk,(2017),7.9,94,511617,"Action, Drama, History","[Christopher Nolan, Fionn Whitehead, Barry Keo..."
6,Spider-Man: Homecoming,(2017),7.4,73,479132,"Action, Adventure, Sci-Fi","[Jon Watts, Tom Holland, Michael Keaton, Rober..."
7,Get Out,(I) (2017),7.7,85,454426,"Horror, Mystery, Thriller","[Jordan Peele, Daniel Kaluuya, Allison William..."
8,It,(I) (2017),7.3,69,429420,Horror,"[Andy Muschietti, Bill Skarsgård, Jaeden Marte..."
9,Blade Runner 2049,(2017),8.0,81,424692,"Action, Drama, Mystery","[Denis Villeneuve, Harrison Ford, Ryan Gosling..."


In [13]:
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
genre = []
Dstars = []

for i in range(0,347330,50): 
    url = 'https://www.imdb.com/search/title/?release_date=1990-01-01,2020-12-31&sort=num_votes,desc&start={}&ref_=adv_nxt'.format(i)
    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
    for container in movie_containers:
        if container.find('div', class_ = 'ratings-metascore') is not None:
            stars = []
            #name of the movie
            name = container.h3.a.text
            names.append(name)
            
            #year of release
            year = container.h3.find('span', class_ = 'lister-item-year').text
            years.append(year)
            
            #rating it got
            imdb = float(container.strong.text)
            imdb_ratings.append(imdb)
            
            #metascore of the movie
            m_score = container.find('span', class_ = 'metascore').text
            metascores.append(int(m_score))
            
            #no. of votes
            vote = container.find('span', attrs = {'name':'nv'})['data-value']
            votes.append(int(vote))
            
            #genre
            genres = container.find('span', class_ = 'genre').text
            genres = genres.replace("\n","")
            genre.append(genres)
            
            #directors and actors
            first_Director_Stars = container.find_all("p")
            first_Director_Stars = first_Director_Stars[2]
            first_Director_Stars = first_Director_Stars.find_all("a")
            for i in range(len(first_Director_Stars)):
                stars.append(first_Director_Stars[i].get_text())
            Dstars.append(stars)
            

KeyboardInterrupt: 

In [14]:
#Find the best movie among the bunch

test_df = pd.DataFrame({'Movie': names,
'Year': years,
'Rating': imdb_ratings,
'Metascore': metascores,
'Votes': votes,
'Genre': genre,
'Director and stars':Dstars,
})
test_df['Total'] = test_df["Rating"] + test_df["Metascore"] + test_df['Votes']
test_df.sort_values("Total",ascending=False)


Unnamed: 0,Movie,Year,Rating,Metascore,Votes,Genre,Director and stars,Total
0,The Shawshank Redemption,(1994),9.3,80,2223174,Drama,"[Frank Darabont, Tim Robbins, Morgan Freeman, ...",2223263.3
1,The Dark Knight,(2008),9.0,84,2197754,"Action, Crime, Drama","[Christopher Nolan, Christian Bale, Heath Ledg...",2197847.0
2,Inception,(2010),8.8,74,1949518,"Action, Adventure, Sci-Fi","[Christopher Nolan, Leonardo DiCaprio, Joseph ...",1949600.8
3,Fight Club,(1999),8.8,66,1771975,Drama,"[David Fincher, Brad Pitt, Edward Norton, Meat...",1772049.8
4,Pulp Fiction,(1994),8.9,94,1744055,"Crime, Drama","[Quentin Tarantino, John Travolta, Uma Thurman...",1744157.9
5,Forrest Gump,(1994),8.8,82,1715068,"Drama, Romance","[Robert Zemeckis, Tom Hanks, Robin Wright, Gar...",1715158.8
6,The Matrix,(1999),8.7,73,1597522,"Action, Sci-Fi","[Lana Wachowski, Lilly Wachowski, Keanu Reeves...",1597603.7
7,The Lord of the Rings: The Fellowship of the Ring,(2001),8.8,92,1588068,"Action, Adventure, Drama","[Peter Jackson, Elijah Wood, Ian McKellen, Orl...",1588168.8
8,The Lord of the Rings: The Return of the King,(2003),8.9,94,1575349,"Adventure, Drama, Fantasy","[Peter Jackson, Elijah Wood, Viggo Mortensen, ...",1575451.9
9,The Dark Knight Rises,(2012),8.4,78,1454011,"Action, Adventure","[Christopher Nolan, Christian Bale, Tom Hardy,...",1454097.4


# Recommender system(Content based)

In [15]:

def listToString(s):  
    string = ""  
    for element in s:  
        string += element   
    return string  

test_df["Director and stars"] = test_df["Director and stars"].apply(listToString)

def combine_features(df):
    return df['Genre'] +" "+ df['Director and stars']

test_df["Combined_features"] = test_df.apply(combine_features,axis=1)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

text = test_df['Combined_features']#.astype(str)
cv = CountVectorizer()

count_matrix = cv.fit_transform(text)
print(count_matrix)


  (0, 7381)	1
  (0, 3488)	1
  (0, 3004)	1
  (0, 7128)	1
  (0, 2042)	1
  (0, 2984)	1
  (0, 2354)	1
  (1, 1204)	1
  (1, 2461)	1
  (1, 4948)	1
  (1, 432)	1
  (1, 6217)	1
  (1, 1466)	1
  (1, 1800)	1
  (1, 31)	1
  (1, 2354)	1
  (2, 8835)	1
  (2, 6370)	1
  (2, 5069)	1
  (2, 3331)	1
  (2, 2248)	1
  (2, 6221)	1
  (2, 2784)	1
  (2, 7563)	1
  (2, 64)	1
  :	:
  (2003, 622)	1
  (2003, 6583)	1
  (2003, 1197)	1
  (2003, 8132)	1
  (2003, 3497)	1
  (2003, 6298)	1
  (2003, 3945)	1
  (2003, 8404)	1
  (2003, 2354)	1
  (2004, 8475)	1
  (2004, 5020)	1
  (2004, 5347)	1
  (2004, 5)	1
  (2004, 2918)	1
  (2004, 4619)	1
  (2004, 5421)	1
  (2004, 2354)	1
  (2005, 6431)	1
  (2005, 6399)	1
  (2005, 8601)	1
  (2005, 8053)	1
  (2005, 4849)	1
  (2005, 9307)	1
  (2005, 8404)	1
  (2005, 2354)	1


In [17]:
similarity_scores = cosine_similarity(count_matrix)
similarity_scores
similarity_scores[22]

array([0.11952286, 0.42163702, 0.28603878, ..., 0.10540926, 0.1118034 ,
       0.1118034 ])

In [18]:
def get_title_from_index(index):
    return test_df["Movie"][index]

def get_index_from_title(title):
    return test_df[test_df.Movie == title].index.values[0]

In [19]:
get_title_from_index(22)

'The Prestige'

In [20]:
get_index_from_title("Thor")

76

In [26]:
# Content based recommender 
movie_user_likes = input("Enter the name of a movie you like: ")
movie_index = get_index_from_title(movie_user_likes)
similar_movies =  list(enumerate(similarity_scores[movie_index]))
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)
i=0
for element in sorted_similar_movies:
        print(get_title_from_index(element[0]))
        i=i+1
        if i>10:
            break

Enter the name of a movie you like: The Wolf of Wall Street	


IndexError: index 0 is out of bounds for axis 0 with size 0