sickbeard/providers/newpct.py

# coding=utf-8
# Author: CristianBB
# Greetings to Mr. Pine-apple
# URL: https://sickrage.github.io
#
# This file is part of SickRage.
#
# SickRage is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# SickRage is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with SickRage. If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function, unicode_literals

import re

from requests.compat import urljoin

from sickbeard import helpers, logger, tvcache
from sickbeard.bs4_parser import BS4Parser
from sickrage.helper.common import convert_size
from sickbeard.show_name_helpers import allPossibleShowNames
from sickrage.providers.torrent.TorrentProvider import TorrentProvider


class newpctProvider(TorrentProvider):

    def __init__(self):
        """Set up the provider name, site URLs, RSS bookmark and result cache."""
        TorrentProvider.__init__(self, 'Newpct')

        # Option flag; when truthy, search() skips shows whose language is not 'es'.
        self.onlyspasearch = None

        self.url = 'http://www.tvsinpagar.com'

        # Series listings that are crawled. The '/series-vo' listing was
        # commented out in the original code and is still not used.
        listing_paths = ('/series', '/series-hd')

        self.urls = {
            'search': [urljoin(self.url, path) for path in listing_paths],
            'rss': urljoin(self.url, '/ultimas-descargas'),
            'letter': [urljoin(self.url, path + '/letter/{0}') for path in listing_paths],
            'downloadregex': r'[^\"]*/descargar-torrent/\d+_[^\"]*',
        }

        # Link of the newest item seen by the last RSS search.
        self.recent_url = ''
        self.cache = tvcache.TVCache(self, min_time=20)

    def _get_season_search_strings(self, ep_obj):
        """
        Build the search terms for a season search.

        :param ep_obj: episode object whose ``show`` is passed to allPossibleShowNames
        :return: single-element list holding ``{'Season': [distinct show names]}``
        """
        distinct_names = set(allPossibleShowNames(ep_obj.show))
        return [{'Season': list(distinct_names)}]

    def _get_episode_search_strings(self, ep_obj, add_string=''):
        """
        Build the search terms for an episode search.

        :param ep_obj: episode object whose ``show`` is passed to allPossibleShowNames
        :param add_string: accepted for interface compatibility; not used here
        :return: single-element list holding ``{'Episode': [distinct show names]}``
        """
        distinct_names = set(allPossibleShowNames(ep_obj.show))
        return [{'Episode': list(distinct_names)}]

    def search(self, search_strings, age=0, ep_obj=None):  # pylint: disable=too-many-locals
        """
        Search the provider for every mode in ``search_strings``.

        :param search_strings: dict mapping a mode name (e.g. 'RSS', 'Season',
            'Episode') to a list of show names
        :param age: not used by this provider
        :param ep_obj: optional episode object; its show language is checked
            when ``self.onlyspasearch`` is set
        :return: list of item dicts as produced by parseRSS() / parse()
        """
        results = []

        lang_info = '' if not ep_obj or not ep_obj.show else ep_obj.show.lang

        for mode in search_strings:
            logger.log('Search Mode: {0}'.format(mode), logger.DEBUG)

            if mode == 'RSS':
                results += self._search_rss_pages(mode)
                continue

            # Only search if user conditions are true
            if self.onlyspasearch and lang_info != 'es':
                logger.log('Show info is not spanish, skipping provider search', logger.DEBUG)
                continue

            results += self._search_series_names(search_strings[mode], mode)

        # Each helper returns every item it found exactly once, so nothing is
        # appended here again (the previous code re-added the last parsed page).
        return results

    def _search_rss_pages(self, mode):
        """Collect items from up to 10 'latest downloads' pages, stopping at the previously seen newest link."""
        found = []
        previous_recent_url = self.recent_url

        for pg in range(1, 11):
            try:
                data = self.get_url(self.urls['rss'] + '/pg/' + str(pg), params=None, returns='text')
                items = self.parseRSS(data, mode)
            except Exception:  # pylint: disable=broad-except
                logger.log('No data returned from provider', logger.DEBUG)
                break

            if not items:
                break
            found += items

            if pg == 1:
                # Remember the newest item so the next run knows where to stop.
                self.recent_url = items[0]['link']

            if any(item['link'] == previous_recent_url for item in items):
                logger.log('Previous search found in this page. Skipping next pages...', logger.DEBUG)
                break

        return found

    def _search_series_names(self, series_names, mode):
        """Locate the given series in the per-letter listings and collect the items of each match."""
        found = []
        names_lower = [name.lower() for name in series_names]

        # The listings are fetched per first character, so gather the distinct
        # first characters of the names we are looking for (order preserved).
        letters = []
        for name in names_lower:
            name = name.strip()
            if name and name[0] not in letters:
                letters.append(name[0])

        for letter in letters:
            # Names starting with a digit use the shared '0-9' listing.
            index_key = '0-9' if letter.isdigit() else letter

            for letter_url in self.urls['letter']:
                try:
                    data = self.get_url(letter_url.format(index_key), params=None, returns='text')
                    series_found = self.parse_seriestitleurl(names_lower, data)
                except Exception as error:  # pylint: disable=broad-except
                    logger.log('No data returned from provider (letter) {0}'.format(str(error)), logger.DEBUG)
                    continue

                for series in series_found:
                    found += self._search_series_pages(series, mode)

        return found

    def _search_series_pages(self, series, mode):
        """Collect items from the paginated listing of one series (pages 1-99) until a page yields nothing."""
        found = []

        for pg in range(1, 100):
            try:
                data = self.get_url(series['url'] + '/pg/' + str(pg), params=None, returns='text')
                items = self.parse(series['title'], data, mode)
            except Exception:  # pylint: disable=broad-except
                logger.log('No data returned from provider', logger.DEBUG)
                break

            if not items:
                break
            found += items

        return found
        
        
    def parse_seriestitleurl(self, series_names, data):
        """
        Find the wanted series in a listing page.

        :param series_names: collection of lower-cased series names to look for
        :param data: raw HTML of a listing page
        :return: list of ``{'title': <lower-cased title>, 'url': <href>}`` dicts,
            one per list entry whose first anchor title is in ``series_names``
        """
        matches = []

        with BS4Parser(data) as html:
            listing = html.find('ul', class_='pelilist')
            entries = listing('li') if listing else []

            for entry in entries:
                try:
                    anchor = entry.find_all('a')[0]
                    name = anchor.get('title', '').lower()
                    link = anchor.get('href', '')
                    if not name or name not in series_names:
                        continue
                    matches.append({'title': name, 'url': link})
                except Exception:
                    # Entries that cannot be read are skipped.
                    continue

        return matches

        
    def parseRSS(self, data, mode):
        """
        Parse a 'latest downloads' page for episode items.

        Rows without an episode marker ('Capitulo' in the third span) are
        skipped. Each title is rebuilt into the list format and normalised by
        _processTitle(), which may itself fetch the detail page.

        :param data: raw HTML of the page
        :param mode: the current search mode; not used by this parser
        :return: list of dicts with 'title', 'link', 'size', 'seeders', 'leechers'
        """

        results = []

        with BS4Parser(data) as html:
            torrent_table = html.find('ul', class_='noticias-series')
            torrent_rows = torrent_table('li') if torrent_table else []

            # Continue only if at least one release is found
            if not torrent_rows:
                # Logged through the module's own logger: the name `sickrage`
                # is never bound in this file, so the old call raised NameError.
                logger.log('Data returned from provider does not contain any torrents', logger.DEBUG)
                return results

            for row in torrent_rows:
                try:
                    torrent_anchor = row.find_all('a')[1]
                    title = torrent_anchor.get_text()
                    download_url = torrent_anchor.get('href', '')
                    seeders = 1  # Provider does not provide seeders
                    leechers = 0  # Provider does not provide leechers
                    if not all([title, download_url]):
                        continue

                    row_spans = row.find_all('span')
                    row_strongs = row.find_all('strong')

                    # if there's no episode text, it is not an episode
                    if len(row_spans) < 3 or 'Capitulo' not in row_spans[2].get_text():
                        continue

                    size_text = row_strongs[0].get_text()
                    # The first span holds quality and size together; strip the size part.
                    quality = row_spans[0].get_text().replace(size_text, '').strip()
                    size_text = size_text.replace(u'Tama\u00f1o', '').strip()
                    size = convert_size(size_text)
                    language = row_strongs[1].get_text().strip()
                    title = 'Serie ' + title + ' - ' + language + ' Calidad [' + quality + ']'

                    title = self._processTitle(title, None, download_url)
                    logger.log('Found: {0} # Size {1}'.format(title, size), logger.DEBUG)

                    item = {
                        'title': title,
                        'link': download_url,
                        'size': size,
                        'seeders': seeders,
                        'leechers': leechers,
                    }

                    results.append(item)

                except (AttributeError, IndexError, TypeError):
                    # IndexError: a row with fewer anchors/strongs than expected
                    # must only skip that row, not abort the whole page.
                    continue

        return results
        
        
    def parse(self, series_name, data, mode):
        """
        Parse search results for items.

        :param series_name: Name of the series the page belongs to; passed to
            _processTitle() to build the release title
        :param data: The raw response from a search
        :param mode: The current mode used to search, e.g. RSS (not used by this parser)

        :return: A list of items found
        """

        results = []

        with BS4Parser(data) as html:
            torrent_table = html.find('ul', class_='buscar-list')
            torrent_rows = torrent_table('li') if torrent_table else []

            # Continue only if at least one release is found
            if not torrent_rows:
                # Logged through the module's own logger: the name `sickrage`
                # is never bound in this file, so the old call raised NameError.
                logger.log('Data returned from provider does not contain any torrents', logger.DEBUG)
                return results

            for row in torrent_rows:
                try:
                    torrent_anchor = row.find_all('a')[1]
                    title = torrent_anchor.get_text()
                    download_url = torrent_anchor.get('href', '')

                    if not all([title, download_url]):
                        continue

                    row_spans = row.find_all('span')
                    # The size is read from the second-to-last span when present.
                    size = convert_size(row_spans[-2].get_text().strip()) if row_spans and len(row_spans) >= 2 else 0
                    seeders = 1  # Provider does not provide seeders
                    leechers = 0  # Provider does not provide leechers

                    title = self._processTitle(title, series_name, download_url)

                    logger.log('Found: {0} # Size {1}'.format(title, size), logger.DEBUG)

                    item = {
                        'title': title,
                        'link': download_url,
                        'size': size,
                        'seeders': seeders,
                        'leechers': leechers,
                    }

                    results.append(item)

                except (AttributeError, IndexError, TypeError):
                    # IndexError: a row with fewer than two anchors must only
                    # skip that row, not abort the whole page.
                    continue

        return results


    def get_url(self, url, post_data=None, params=None, timeout=30, **kwargs):  # pylint: disable=too-many-arguments
        """
        Fetch ``url`` through the parent provider, resolving the torrent link first when needed.

        With returns='content' (used when the torrent itself is wanted, e.g. for
        the torrent client) ``url`` points at a detail page: that page is
        fetched as text, the torrent link is extracted with
        ``self.urls['downloadregex']`` and that link is fetched instead.

        :return: whatever the parent get_url returns, or None when the detail
            page is empty or contains no torrent link
        """
        trickery = kwargs.pop('returns', '')
        if trickery == 'content':
            kwargs['returns'] = 'text'
            data = super(newpctProvider, self).get_url(url, post_data=post_data, params=params, timeout=timeout, **kwargs)

            # A failed fetch must not reach re.search (TypeError on None).
            if not data:
                return None

            match = re.search(self.urls['downloadregex'], data, re.DOTALL)
            if not match:
                return None
            url = match.group()

        kwargs['returns'] = trickery
        return super(newpctProvider, self).get_url(url, post_data=post_data, params=params,
                                                   timeout=timeout, **kwargs)

    def download_result(self, result):
        """
        Save the result to disk.

        Each candidate URL is a detail page, not a torrent file: the page is
        fetched, the torrent link is extracted with
        ``self.urls['downloadregex']`` and that link is downloaded.

        :param result: search result handed to ``self._make_url``
        :return: True once a download has been saved and verified, else False
        """

        # check for auth
        if not self.login():
            return False

        urls, filename = self._make_url(result)

        for url in urls:
            # Search results don't return torrent files directly, it returns show sheets so we must parse showSheet to access torrent.
            data = self.get_url(url, returns='text')

            # A failed fetch must not reach re.search (TypeError on None);
            # move on to the next candidate instead.
            if not data:
                continue

            match = re.search(self.urls['downloadregex'], data, re.DOTALL)
            if not match:
                continue
            url_torrent = match.group()

            if url_torrent.startswith('http'):
                # Referer is set to the scheme://host/ part of the torrent URL.
                self.headers.update({'Referer': '/'.join(url_torrent.split('/')[:3]) + '/'})

            logger.log('Downloading a result from {0}'.format(url))

            if helpers.download_file(url_torrent, filename, session=self.session, headers=self.headers):
                if self._verify_download(filename):
                    logger.log('Saved result to {0}'.format(filename), logger.INFO)
                    return True
                else:
                    logger.log('Could not download {0}'.format(url), logger.WARNING)
                    helpers.remove_file_failed(filename)

        if urls:
            logger.log('Failed to download any results', logger.WARNING)

        return False

    def _processTitle(self, title, series_name, url, try_download=True):
        """
        Normalise a Newpct title into a consistent release name.

        The title is matched, in order, against the standard format, the list
        format, the title of the download page (fetched only when
        ``try_download`` is true) and finally the download URL. Quality,
        original-version and proper/repack markers are then rewritten or
        appended.

        :param title: raw title text
        :param series_name: series name to use in place of the one in the
            title; may be None or empty
        :param url: detail/download page URL of the item
        :param try_download: when true and the title matches no known format,
            fetch ``url`` and retry once with the page heading
        :return: the normalised title
        """

        # Newpct titles are very very very inconsistent.

        # Check if title is well formatted (RSS titles usually are)
        # Examples:
        # FooSeries - Temporada 2 [HDTV 720p][Cap.204][AC3 5.1 Español Castellano]
        # Salvation - Temporada 1 [HDTV][Cap.104-107][Español Castellano]

        # else try to match list format
        # example
        # Serie Juego De Tronos  Temporada 7 Capitulo 5 - Español Castellano Calidad [ HDTV ]
        # Serie Juego De Tronos  Temporada [7] Capitulo [5] - Español Castellano Calidad [ HDTV ]

        # else process download page title
        # else compose from download url

        series_name = series_name or ""

        logger.log('newpct _processTitle: {} # series_name {} # url {}'.format(title, series_name, url), logger.DEBUG)

        # clean spaces
        title = self._clean_spaces(title)
        series_name = self._clean_spaces(series_name)

        title_stdformat = r'.+-.+\d{1,2}.+\[Cap.\d{2,4}([\-\_]\d{2,4})?\]'
        title_listformat = r'Serie ?(.+?) ?-? ?Temporada ?\[?(\d+)\]?.*Capitulos? ?\[?(\d+)\]? ?(al ?\[?(\d+)\]?)?.*- ?(.*) ?Calidad ?(.+)'
        title_urlformat = r'.*\/(.*)\/capitulo-(\d{2,4})\/'

        title_is_proper = re.search(r'\b(proper|repack)', title, flags=re.I)

        stdformat_match = re.search(title_stdformat, title, flags=re.I)
        if stdformat_match:
            logger.log('_processTitle: Matched by stdFormat: {}'.format(title), logger.DEBUG)
        else:
            # Try to match list format
            listformat_match = re.search(title_listformat, title, flags=re.I)
            if listformat_match:
                if series_name:
                    name = series_name + ((' (' + title_is_proper.group() + ')') if title_is_proper else "")
                else:
                    name = self._clean_spaces(listformat_match.group(1))
                season = self._clean_spaces(listformat_match.group(2))
                episode = self._clean_spaces(listformat_match.group(3)).zfill(2)
                audioquality = self._clean_spaces(listformat_match.group(6))
                quality = self._clean_spaces(listformat_match.group(7))

                if not listformat_match.group(5):
                    title = "{0} - Temporada {1} {2} [Cap.{3}{4}][{5}]".format(name, season, quality, season, episode, audioquality)
                else:
                    # Group 5 is the last episode of a range ("Capitulos X al Y").
                    episode_to = self._clean_spaces(listformat_match.group(5)).zfill(2)
                    title = "{0} - Temporada {1} {2} [Cap.{3}{4}_{5}{6}][{7}]".format(name, season, quality, season, episode, season, episode_to, audioquality)
                logger.log('_processTitle: Matched by listFormat: {}'.format(title), logger.DEBUG)
            elif try_download:
                # Get title from the download page
                try:
                    data = self.get_url(url, params=None, returns='text')
                    with BS4Parser(data) as details:
                        title = details.find('h1').get_text().split('/')[1]
                        logger.log('_processTitle: Title got from details page: {}'.format(title), logger.DEBUG)
                        # Retry once with the page heading; no further download.
                        return self._processTitle(title, series_name, url, False)
                except (AttributeError, IndexError, TypeError):
                    # IndexError: the heading has no '/' separated second part.
                    logger.log('title could not be retrived', logger.ERROR)
            else:
                # Try to compose title from url
                url_match = re.search(title_urlformat, url, flags=re.I)
                if url_match:
                    name = series_name if series_name else url_match.group(1).replace('-', ' ')
                    season, episode = self._process_season_episode(url_match.group(2))
                    title = '{} - Temporada {} [][Cap.{}{}]'.format(name, season, season, episode)
                    logger.log('_processTitle: Matched by url: {}'.format(title), logger.DEBUG)

        # Quality - Use re module to avoid case sensitive problems with replace.
        # Order matters: specific tags must be rewritten before the generic ones.
        quality_rewrites = (
            (r'\[HDTV 1080p?[^\[]*]', '1080p HDTV x264'),
            (r'\[HDTV 720p?[^\[]*]', '720p HDTV x264'),
            (r'\[ALTA DEFINICION 720p?[^\[]*]', '720p HDTV x264'),
            (r'\[HDTV]', 'HDTV x264'),
            (r'\[DVD[^\[]*]', 'DVDrip x264'),
            (r'\[BluRay 1080p?[^\[]*]', '1080p BluRay x264'),
            (r'\[BluRay Rip 1080p?[^\[]*]', '1080p BluRay x264'),
            (r'\[BluRay Rip 720p?[^\[]*]', '720p BluRay x264'),
            (r'\[BluRay MicroHD[^\[]*]', '1080p BluRay x264'),
            (r'\[MicroHD 1080p?[^\[]*]', '1080p BluRay x264'),
            (r'\[BLuRay[^\[]*]', '720p BluRay x264'),
            (r'\[BRrip[^\[]*]', '720p BluRay x264'),
            (r'\[BDrip[^\[]*]', '720p BluRay x264'),
        )
        for quality_pattern, quality_text in quality_rewrites:
            title = re.sub(quality_pattern, quality_text, title, flags=re.I)

        # detect hdtv/bluray by url
        # hdtv 1080p example url: http://www.newpct.com/descargar-seriehd/foo/capitulo-610/hdtv-1080p-ac3-5-1/
        # hdtv 720p example url: http://www.newpct.com/descargar-seriehd/foo/capitulo-26/hdtv-720p-ac3-5-1/
        # hdtv example url: http://www.newpct.com/descargar-serie/foo/capitulo-214/hdtv/
        # bluray compilation example url: http://www.newpct.com/descargar-seriehd/foo/capitulo-11/bluray-1080p/
        # http://www.tvsinpagar.com/descargar/serie-vo/marvels-agents-of-s-h-i-e-l-d-/temporada-5/capitulo-12/
        # http://www.tvsinpagar.com/descargar/serie-en-hd/the-arrangement/temporada-2/capitulo-01/
        title_hdtv = re.search(r'HDTV', title, flags=re.I)
        title_720p = re.search(r'720p', title, flags=re.I)
        title_1080p = re.search(r'1080p', title, flags=re.I)
        title_x264 = re.search(r'x264', title, flags=re.I)
        title_bluray = re.search(r'bluray', title, flags=re.I)
        title_vo = re.search(r'\[V.O.[^\[]*]', title, flags=re.I)
        title_subt = re.search(r'\[Ingles subtitulado\]', title, flags=re.I)
        url_hdtv = re.search(r'HDTV', url, flags=re.I)
        url_720p = re.search(r'720p', url, flags=re.I)
        url_1080p = re.search(r'1080p', url, flags=re.I)
        url_bluray = re.search(r'bluray', url, flags=re.I)
        url_serie_hd = re.search(r'descargar-seriehd', url, flags=re.I)
        url_serie_hd = url_serie_hd or re.search(r'descargar/serie-en-hd', url, flags=re.I)
        url_serie_vo = re.search(r'descargar-serievo', url, flags=re.I)
        url_serie_subt = re.search(r'descargar/serie-vo', url, flags=re.I)

        # Append what the URL reveals but the title does not already say.
        if not title_hdtv and url_hdtv:
            title += ' HDTV'
            if not title_x264:
                title += ' x264'
        if not title_bluray and url_bluray:
            title += ' BluRay'
            if not title_x264:
                title += ' x264'
        if not title_1080p and url_1080p:
            title += ' 1080p'
            title_1080p = True
        if not title_720p and url_720p:
            title += ' 720p'
            title_720p = True
        # An HD listing with no explicit resolution is tagged as 720p.
        if not (title_720p or title_1080p) and url_serie_hd:
            title += ' 720p'
        if not (title_vo) and (title_subt or url_serie_vo or url_serie_subt):
            title += ' [V.O.]'
            title_vo = True

        # Language
        # title = re.sub(r'\[Spanish[^\[]*]', 'SPANISH AUDIO', title, flags=re.I)
        # title = re.sub(r'\[Castellano[^\[]*]', 'SPANISH AUDIO', title, flags=re.I)
        # title = re.sub(ur'\[Espa\u00f1ol[^\[]*]', 'SPANISH AUDIO', title, flags=re.I)
        # title = re.sub(ur'\[Espa\u00f1ol Castellano[^\[]*]', 'SPANISH AUDIO', title, flags=re.I)
        # title = re.sub(r'\[AC3 5\.1[^\[]*]', 'SPANISH AUDIO', title, flags=re.I)
        # title = re.sub(ur'\[AC3 5\.1 Espa\u00f1ol[^\[]*]', 'SPANISH AUDIO', title, flags=re.I)
        # title = re.sub(ur'\[AC3 5\.1 Espa\u00f1ol Castellano[^\[]*]', 'SPANISH AUDIO', title, flags=re.I)

        if title_vo:
            title += ' -NEWPCTVO'
        else:
            title += ' -SPANISH AUDIO'
            title += ' -NEWPCT'

        # propers handling
        title = re.sub(r'\(?proper\)?', '-PROPER', title, flags=re.I)
        title = re.sub(r'\(?repack\)?', '-REPACK', title, flags=re.I)

        return self._clean_spaces(title)

    def _process_season_episode(self, season_episode):

        match = re.search(r'(\d)(\d{1,2})', season_episode, flags=re.I)
        if not match:
            match = re.search(r'(\d{2})(\d{2})', season_episode, flags=re.I)

        season = match.group(1)
        episode = match.group(2).zfill(2)

        return season, episode

    def _clean_spaces(self, value):

        value = value.strip()
        value = re.sub(r'[ ]+', ' ', value, flags=re.I)
        value = re.sub(r'\[[ ]+', '[', value, flags=re.I)
        value = re.sub(r'[ ]+\]', ']', value, flags=re.I)
        value = re.sub(r'\([ ]+', '(', value, flags=re.I)
        value = re.sub(r'[ ]+\)', ')', value, flags=re.I)

        return value

# Provider instance created when the module is imported.
# NOTE(review): presumably picked up by the application's provider loader under this name — confirm.
provider = newpctProvider()