# Web scraping 

In [None]:
from requests import get
# First results page of IMDb's title search: 2017 releases, sorted by vote count (descending).
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page in later cells.
response.raise_for_status()
# Peek at the start of the document to confirm that HTML came back.
print(response.text[:500])

In [None]:
from bs4 import BeautifulSoup
# Parse the downloaded page with the 'html.parser' backend.
html_soup = BeautifulSoup(response.text, 'html.parser')
# Bare expression: shown as the cell output in the notebook.
html_soup

In [None]:
# Collect every <div> matching the class string 'lister-item mode-advanced';
# the cells that follow treat each one as a single movie entry.
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
# Sanity check: show the result's type and how many containers were found.
print(type(movie_containers))
print(len(movie_containers))

In [None]:
# Use the first movie container on the page as a worked example.
first_movie = html_soup.find_all('div', class_='lister-item mode-advanced')[0]
# The title is the text of the link nested in the container's <h3>.
first_movie.h3.a.text

In [None]:
# Year label: the <span> inside the heading that matches the full class string below.
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold')
# Display the span's text.
first_year.get_text()

In [None]:
# The rating is read from the first <strong> tag in the container and parsed as a float.
first_rating = float(first_movie.strong.text)
first_rating

In [None]:
# Match on the 'metascore' class only, as the scraping loops later in this
# notebook do. A multi-class string such as 'metascore favorable' only matches
# tags whose class attribute is exactly that string, so a movie whose score
# carries a different second class would make find() return None here.
first_meta_score = first_movie.find('span', class_ = 'metascore')
first_meta_score = float(first_meta_score.text)
first_meta_score

In [None]:
# The vote count is read from the 'data-value' attribute of the <span name="nv"> tag.
first_votes = first_movie.find('span', attrs = {'name':'nv'})
# NOTE(review): stored as a float here, while the scraping loops below use int().
first_votes = float(first_votes['data-value'])
first_votes

In [None]:
# Genre text comes from the <span class="genre"> tag.
first_genre = first_movie.find('span', class_ = 'genre')
# Raw text of the tag; the scraping loops later additionally remove "\n" characters.
first_genre = first_genre.text
first_genre

In [None]:
# The third <p> of the container is taken to be the director/stars line
# (presumably stable across entries; verify against the page markup).
credits_paragraph = first_movie.find_all("p")[2]
first_Director_Stars = credits_paragraph.find_all("a")
# Keep only the visible text of each link.
stars = [link.get_text() for link in first_Director_Stars]

stars

# Combining all of the above methods inside a single for loop 

In [None]:
# Accumulators: one list per output column, one entry per scraped movie

names = []
years = []
imdb_ratings = []
metascores = []
votes = []
genre = []
Dstars = []

# Walk the movie containers found on the page
for container in movie_containers:
    # Entries without a Metascore block are skipped entirely
    if container.find('div', class_='ratings-metascore') is None:
        continue

    heading = container.h3

    # The name
    names.append(heading.a.text)

    # The year
    years.append(heading.find('span', class_='lister-item-year').text)

    # The IMDB rating
    imdb_ratings.append(float(container.strong.text))

    # The Metascore
    metascore_tag = container.find('span', class_='metascore')
    metascores.append(int(metascore_tag.text))

    # The number of votes (taken from the tag's data-value attribute)
    votes_tag = container.find('span', attrs={'name': 'nv'})
    votes.append(int(votes_tag['data-value']))

    # Genres, with newline characters removed
    genre_tag = container.find('span', class_='genre')
    genre.append(genre_tag.text.replace("\n", ""))

    # Director and stars: link texts from the third <p> of the container
    credit_links = container.find_all("p")[2].find_all("a")
    Dstars.append([link.get_text() for link in credit_links])
            

# Making a dataframe using the data we got in the previous step

In [None]:
import pandas as pd
# One column per scraped list; the lists are filled in lockstep, so they
# line up row for row.
frame_columns = {
    'Movie': names,
    'Year': years,
    'Rating': imdb_ratings,
    'Metascore': metascores,
    'Votes': votes,
    'Genre': genre,
    'Director and stars': Dstars,
}
test_df = pd.DataFrame(frame_columns)
test_df

In [None]:
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
genre = []
Dstars = []

# Upper bound for the `start` offset and the step between successive offsets.
TOTAL_RESULTS = 347330
PAGE_SIZE = 50
SEARCH_URL = ('https://www.imdb.com/search/title/'
              '?release_date=1990-01-01,2020-12-31&sort=num_votes,desc'
              '&start={}&ref_=adv_nxt')

# NOTE(review): offsets begin at 0; confirm whether IMDb's `start` parameter
# is 0- or 1-based before relying on exact page boundaries.
for start in range(0, TOTAL_RESULTS, PAGE_SIZE):
    url = SEARCH_URL.format(start)
    response = get(url)
    if response.status_code != 200:
        # Report and skip a failed page instead of silently parsing an error
        # document; the scrape continues with the next offset.
        print('Skipping start={}: HTTP {}'.format(start, response.status_code))
        continue

    html_soup = BeautifulSoup(response.text, 'html.parser')
    movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
    for container in movie_containers:
        # Only movies that have a Metascore block are recorded.
        if container.find('div', class_ = 'ratings-metascore') is None:
            continue

        #name of the movie
        names.append(container.h3.a.text)

        #year of release
        years.append(container.h3.find('span', class_ = 'lister-item-year').text)

        #rating it got
        imdb_ratings.append(float(container.strong.text))

        #metascore of the movie
        metascores.append(int(container.find('span', class_ = 'metascore').text))

        #no. of votes (from the tag's data-value attribute)
        votes.append(int(container.find('span', attrs = {'name':'nv'})['data-value']))

        #genre, with newline characters removed
        genre.append(container.find('span', class_ = 'genre').text.replace("\n",""))

        #directors and actors: link texts from the third <p> of the container.
        # A comprehension replaces the index loop, which reused the outer
        # loop's variable name `i`.
        credit_links = container.find_all("p")[2].find_all("a")
        Dstars.append([link.get_text() for link in credit_links])
            

In [None]:
# Rank the scraped movies by a combined score

frame_columns = {
    'Movie': names,
    'Year': years,
    'Rating': imdb_ratings,
    'Metascore': metascores,
    'Votes': votes,
    'Genre': genre,
    'Director and stars': Dstars,
}
test_df = pd.DataFrame(frame_columns)

# Total is the plain sum of the rating, the Metascore and the vote count.
# NOTE(review): the three terms are on different scales; confirm that an
# unweighted sum is the intended ranking.
test_df['Total'] = test_df["Rating"].add(test_df["Metascore"]).add(test_df['Votes'])

# Displayed only: sort_values returns a new frame, so test_df stays unsorted.
test_df.sort_values("Total", ascending=False)


# Recommender system (content-based)

In [None]:

def listToString(s):
    """Join the strings in *s* into one space-separated string.

    The names are separated by a single space so that adjacent entries stay
    distinct words; concatenating them directly would fuse the end of one
    name with the start of the next into a single token.

    Args:
        s: An iterable of strings, or a string that was already joined.

    Returns:
        The joined string. A string argument is returned unchanged, so
        applying the function twice (for example by re-running a notebook
        cell) does not alter the result.
    """
    if isinstance(s, str):
        return s
    return " ".join(s)

# Collapse each row's list of names into a single string via listToString.
test_df["Director and stars"] = test_df["Director and stars"].apply(listToString)

def combine_features(df):
    """Return the 'Genre' and 'Director and stars' values joined by one space.

    *df* is indexed by those two keys; the notebook passes one DataFrame row
    at a time.
    """
    genre_text = df['Genre']
    people_text = df['Director and stars']
    return genre_text + " " + people_text

# axis=1 calls combine_features once per row and stores the resulting strings in a new column.
test_df["Combined_features"] = test_df.apply(combine_features,axis=1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Text to vectorize: the combined genre / director-and-stars string of each movie.
text = test_df['Combined_features']#.astype(str)
# Vectorizer with default settings.
cv = CountVectorizer()

# Learn the vocabulary and build the per-movie token count matrix in one step.
count_matrix = cv.fit_transform(text)
print(count_matrix)


In [None]:
# Pairwise cosine similarity between the rows of the count matrix.
similarity_scores = cosine_similarity(count_matrix)
# Only the last expression of a cell is displayed, so this line shows nothing.
similarity_scores
# Similarity of the movie at row 22 to every movie (needs at least 23 rows).
similarity_scores[22]

In [None]:
def get_title_from_index(index):
    """Return the 'Movie' entry of test_df stored under *index*."""
    titles = test_df["Movie"]
    return titles[index]

def get_index_from_title(title):
    """Return the index of the first test_df row whose 'Movie' equals *title*.

    Args:
        title: Exact movie title to look up (the comparison is case-sensitive).

    Returns:
        The index label of the first matching row.

    Raises:
        IndexError: If no row has that title. The exception type is the one
            an empty lookup has always produced; the message now names the
            missing title.
    """
    matching_indices = test_df[test_df.Movie == title].index.values
    if len(matching_indices) == 0:
        raise IndexError("No movie titled {!r} found in test_df".format(title))
    return matching_indices[0]

In [None]:
# Example lookup: the title stored at index 22.
get_title_from_index(22)

In [None]:
# Example lookup: index of the movie titled "Thor" (raises IndexError if it was not scraped).
get_index_from_title("Thor")

In [None]:
# Content based recommender
movie_user_likes = input("Enter the name of a movie you like: ")
movie_index = get_index_from_title(movie_user_likes)

# Pair every movie's position with its similarity to the chosen movie and
# order the pairs from most to least similar.
similar_movies = list(enumerate(similarity_scores[movie_index]))
sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)

# The chosen movie is always maximally similar to itself, so leave it out
# and show exactly ten other titles.
recommended_indices = [idx for idx, _ in sorted_similar_movies if idx != movie_index]
for idx in recommended_indices[:10]:
    print(get_title_from_index(idx))