In [None]:
"""
WEB SCRAPING

#To install requests-html and jupyter in virtualenv
pipenv install requests-html

#If you are creating a new folder, install Jupyter Notebook
pipenv install jupyter notebook

#To start jupyter notebook from virtualenv
pipenv run jupyter notebook

#For additional info go to: 
http://python-requests.org/

#print(response.content) - will get you the content of the website

#always add "." before the class name in a selector, e.g. ".subtext"

#Reference: https://github.com/pycampers/foundation_bootcamp/blob/master/6th_sept_web_scraping.ipynb

#first=True - means return just the first match, not the whole list

"""

In [None]:
"""
Steps to download the subtitle of a movie:

1. Ask the user movie name
2. Once we have the movie name, we go to the website and search for the movie and get the response.
3. Get all the results
4. Exit if no result is found
5. take the movie search results and convert them into a list of dictionaries
6. ask user to select the movie from the list
7. get the selected movie page
8. find a list of subtitles for that movie
9. convert html subtitles to dictionary
10. select subtitle based on language and rating
11. get the zip file url from the movie page
12. download the zip file

"""


In [2]:
from urllib.parse import quote
from requests_html import HTMLSession
import requests


class SubtitleDownloader:
    """
    Search yts-subs.com for a movie and download a subtitle zip for it.

    The methods are meant to be called in order; each step stores its
    result on the instance so the next step can use it:

    1. search_for_subtitle()               -> self.response
    2. response_to_dict()                  -> self.movie_info_list
    3. select_movie_number()               -> self.selected_movie_info
    4. get_subtitles_from_selected_movie() -> self.final_subtitle
    5. download_subtitle_zip()             -> zip file in the current directory

    An example use of this class.
    sub_downloader = SubtitleDownloader(verbose=True)


    sub_downloader.search_for_subtitle("fight club")
    sub_downloader.response_to_dict()
    sub_downloader.select_movie_number(movie_number=1)
    sub_downloader.get_subtitles_from_selected_movie()
    sub_downloader.download_subtitle_zip()
    """

    # Base URL of the site that is scraped.
    WEBSITE_URL = "https://yts-subs.com/"

    # Seconds to wait for the server before giving up on a request;
    # without a timeout a stalled connection would hang the notebook.
    REQUEST_TIMEOUT = 30

    def __init__(self, verbose=False):
        """Create a downloader.

        verbose: when truthy, progress messages are printed at each step.
        """
        self.response = None
        self.verbose = verbose
        # Initialise every piece of pipeline state up front so that calling
        # the steps out of order fails with a clear message instead of an
        # AttributeError.
        self.movie_info_list = []
        self.selected_movie_info = None
        self.final_subtitle = None

    def _log(self, message):
        """Print ``message`` only when the instance is in verbose mode."""
        if self.verbose:
            print(message)

    def url_to_response(self, url):
        """takes a url, make a request and return the response"""
        # The context manager closes the session (and its connection pool)
        # once the response has been fetched, so sessions are not leaked.
        with HTMLSession() as session:
            return session.get(url, timeout=self.REQUEST_TIMEOUT)

    def search_for_subtitle(self, query):
        """Take a query string, make the url to search and get the response.

        The response is stored in ``self.response`` and also returned.
        """
        # safe="" also escapes "/", which would otherwise be read as a path
        # separator and corrupt the search URL (e.g. for "face/off").
        query_encoded = quote(query, safe="")
        website_search_url = self.WEBSITE_URL + "search/" + query_encoded

        self.response = self.url_to_response(website_search_url)
        self._log(self.response)

        return self.response

    def search_result_to_movie_info(self, single_search_result):
        """Takes html of a search result, converts it into a dict
        and return the dict.

        The dict has the keys "title", "year" (int), "poster" and
        "movie_page" (absolute url of the movie's subtitle page).
        """
        poster = single_search_result.find('.media-object', first=True).attrs["src"]
        title = single_search_result.find(".media-heading", first=True).text

        # The first whitespace-separated token of the first info section is
        # parsed as the release year.
        year_raw = single_search_result.find(".movinfo-section")[0].text
        year = int(year_raw.split()[0])

        movie_page_raw = single_search_result.find(".media-body", first=True).find("a", first=True)
        movie_page = movie_page_raw.absolute_links.pop()

        return {"title": title,
                "year": year,
                "poster": poster,
                "movie_page": movie_page}

    def response_to_dict(self):
        """Takes the reponse page, finds all the search results
        convert search results into dict and return the list of dicts.

        Returns 0 (and empties ``self.movie_info_list``) when the search
        produced no results.
        """
        search_results = self.response.html.find('.media-movie-clickable')  # get all the results

        if not search_results:
            # Drop results of any earlier search so later steps cannot act
            # on stale data.
            self.movie_info_list = []
            print("No results found.")
            return 0  # exit if no result is found

        self._log(f'Found {len(search_results)} results.')

        # take search results convert them into a dict
        self.movie_info_list = [self.search_result_to_movie_info(single_result)
                                for single_result in search_results]
        self._log(f'Parsed {len(self.movie_info_list)} results.')

        return self.movie_info_list

    def select_movie_number(self, movie_number=None):
        """Let the user select a movie to download subtitle for.

        movie_number: 1-based position in ``self.movie_info_list``. When it
        is None the movies are listed and the user is prompted for a number.

        Returns the selected movie dict (also stored in
        ``self.selected_movie_info``).
        Raises IndexError when there are no movies to choose from or the
        number is outside 1..len(self.movie_info_list).
        """
        if not self.movie_info_list:
            raise IndexError("No movies to select from; run a search that returns results first.")

        if movie_number is None:
            for position, single_movie_info in enumerate(self.movie_info_list, start=1):
                print(f'{position} : {single_movie_info["title"]} - {single_movie_info["year"]}')

            selected_movie = int(input("Enter the movie number: "))
        else:
            selected_movie = movie_number

        # Without this check 0 or a negative number would silently pick a
        # movie counted from the end of the list.
        if not 1 <= selected_movie <= len(self.movie_info_list):
            raise IndexError(
                f"Movie number must be between 1 and {len(self.movie_info_list)}, got {selected_movie}."
            )

        self.selected_movie_info = self.movie_info_list[selected_movie - 1]
        return self.selected_movie_info

    def subtitle_html_to_dict(self, single_subtitle):
        """Takes html for single subtitle result and convert it to dict then return it.

        The dict has the keys "language", "download_page" and "rating" (int).
        """
        subtitle_language = single_subtitle.find(".sub-lang", first=True).text

        download_cell = single_subtitle.find('.download-cell', first=True)
        download_page_link = download_cell.find("a", first=True).absolute_links.pop()

        subtitle_rating = int(single_subtitle.find(".rating-cell", first=True).text)

        return {"language": subtitle_language,
                "download_page": download_page_link,
                "rating": subtitle_rating}

    def filter_subtitles(self, subtitle_info_list, filter_language="English"):
        """Takes all the subtitle dicts, keeps those in ``filter_language``
        and returns the one with the highest rating.

        Ties keep the earliest subtitle in the list; a dict without a
        "rating" key counts as rating 0. Returns None when no subtitle is
        in the requested language.
        """
        matching = [single_subtitle for single_subtitle in subtitle_info_list
                    if single_subtitle['language'] == filter_language]

        if not matching:
            return None

        # max() returns the first maximal element, so equally rated
        # subtitles resolve to the one listed first.
        return max(matching, key=lambda single_subtitle: single_subtitle.get("rating", 0))

    def get_subtitles_from_selected_movie(self):
        """Find all the sub for selected movie and select the best one.

        The choice is stored in ``self.final_subtitle`` and returned; it is
        None when no subtitle in the default language was found.
        Raises ValueError when no movie has been selected yet.
        """
        if self.selected_movie_info is None:
            raise ValueError("No movie selected; call select_movie_number() first.")

        # get the selected movie page
        movie_page_response = self.url_to_response(self.selected_movie_info["movie_page"])

        # find the list of subtitles for that movie
        subtitle_list = movie_page_response.html.find('.high-rating')
        self._log(f'Found {len(subtitle_list)} subtitles for selected movie.')

        # convert html subtitles to dict
        subtitle_info_list = [self.subtitle_html_to_dict(single_subtitle)
                              for single_subtitle in subtitle_list]

        # select subtitle based on language and rating
        self.final_subtitle = self.filter_subtitles(subtitle_info_list)
        return self.final_subtitle

    def download_page_to_zip_url(self):
        """Get the download link for the zip file from the download page.

        Raises ValueError when no subtitle has been selected, i.e.
        ``self.final_subtitle`` is None.
        """
        if self.final_subtitle is None:
            raise ValueError(
                "No subtitle selected; call get_subtitles_from_selected_movie() "
                "first and check that a matching subtitle was found."
            )

        response = self.url_to_response(self.final_subtitle["download_page"])
        download_link = response.html.find('.download-subtitle', first=True).attrs["href"]
        return download_link

    def download_zip_file(self, zip_url):
        """Downloads the zip file in the current directory.

        The file is named after the last path segment of ``zip_url``.
        Returns 0 on success; an HTTP error status raises the exception
        produced by ``raise_for_status()``.
        """
        zip_response = requests.get(zip_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly rather than saving an HTML error page as a ".zip".
        zip_response.raise_for_status()

        # Ignore any query string and fall back to a fixed name when the
        # url ends with "/" (which would otherwise give an empty filename).
        filename = zip_url.split("?", 1)[0].split("/")[-1] or "subtitle.zip"

        with open(filename, "wb") as zip_file:
            zip_file.write(zip_response.content)

        return 0

    def download_subtitle_zip(self):
        """get the zip url and download it.

        Returns 0 once the file has been written.
        """
        # get the zip file url from the movie page
        zip_url = self.download_page_to_zip_url()

        # download the zip file
        self.download_zip_file(zip_url)
        self._log("Downloaded Subtitle in the current Directory.")

        return 0
        
        
# Demo run: search for a movie, pick the first hit and download its subtitle.
sub_downloader = SubtitleDownloader(verbose=True)

sub_downloader.search_for_subtitle("fight club")

# response_to_dict() returns a falsy value when the search found nothing;
# stop there instead of crashing in the later steps.
if sub_downloader.response_to_dict():
    sub_downloader.select_movie_number(movie_number=1)
    sub_downloader.get_subtitles_from_selected_movie()

    # final_subtitle is None when the movie has no English subtitle.
    if sub_downloader.final_subtitle is not None:
        sub_downloader.download_subtitle_zip()
    else:
        print("No English subtitle found for the selected movie.")

ModuleNotFoundError: No module named 'requests_html'

In [4]:
# Create a fresh downloader with the default settings (verbose=False).
# Note: this rebinds the `sub_downloader` name used by the previous cell.
sub_downloader = SubtitleDownloader()

# Show the class documentation, including each method's docstring.
help(sub_downloader)

NameError: name 'SubtitleDownloader' is not defined