diff --git a/socid_extractor/schemes.py b/socid_extractor/schemes.py index 9b53247..bb386d1 100644 --- a/socid_extractor/schemes.py +++ b/socid_extractor/schemes.py @@ -31,7 +31,7 @@ } }, 'Twitter GraphQL API': { - 'flags': ['{"data":{"'], + 'flags': ['{"data":{"', 'user":{"id":'], 'regex': r'^{"data":{"user":({.+})}}$', 'extract_json': True, 'url_mutations': [ @@ -56,8 +56,8 @@ } }, 'Facebook user profile': { - 'flags': ['Facebook'], + 'regex': r'({"__bbox":{"complete".+"sequence_number":0}})', 'extract_json': True, 'transforms': [ json.loads, @@ -79,7 +79,7 @@ }, 'GitHub HTML': { 'flags': ['github.githubassets.com'], - 'regex': r'data-scope-id="(?P\d+)" data-scoped-search-url="/users/(?P.+?)/search"' + 'regex': r'data-hydro-click.+?profile_user_id":(?P\d+).+?originating_url":"https:\/\/github\.com\/(?P[^&]+)' }, # https://api.github.com/users/torvalds 'GitHub API': { @@ -105,8 +105,8 @@ } }, 'Gitlab API': { - 'flags': ['"web_url":"https://gitlab.com/'], - 'regex': r'^([{[\S\s]+?}])$', + 'flags': ['avatar_url', 'https://gitlab.com'], + 'regex': r'^\[({[\S\s]+?})\]$', 'extract_json': True, 'url_mutations': [ { @@ -115,13 +115,13 @@ } ], 'fields': { - 'uid': lambda x: x[0].get('id'), - 'fullname': lambda x: x[0].get('name'), - 'username': lambda x: x[0].get('username'), - 'state': lambda x: x[0].get('state'), - 'image': lambda x: x[0].get('avatar_url'), + 'uid': lambda x: x.get('id'), + 'fullname': lambda x: x.get('name'), + 'username': lambda x: x.get('username'), + 'state': lambda x: x.get('state'), + 'image': lambda x: x.get('avatar_url'), } - }, + }, 'Patreon': { 'flags': ['www.patreon.com/api', 'pledge_url'], 'regex': r'Object.assign\(window.patreon.bootstrap, ([\s\S]*)\);[\s\S]*Object.assign\(window.patreon.campaignFeatures, {}\);', @@ -167,12 +167,12 @@ } }, 'Yandex Disk file': { - 'flags': ['@yandexdisk', 'yastatic.net'], + 'flags': ["project:'disk-public',page:'icon'", '@yandexdisk', 'yastatic.net'], 'regex': r'"users":{.*?"uid":"(?P\d+)","displayName":"(?P.+?)"', }, 'Yandex Disk photoalbum': { - 'flags': ['yastatic.net/disk/album', 'isAvailableToAlbum'], - 'regex': r'"display_name":"(?P.*?)","uid":"(?P\d+)","locale":"\w+","login":"(?P.*?)"', + 'flags': ["project:'disk-public',page:'album'"], + 'regex': r'"users":{.*?"uid":"(?P\d+)","displayName":"(?P.+?)"', }, 'Yandex Music AJAX request': { 'flags': ['{"success":true,"verified'], @@ -292,7 +292,7 @@ }, 'Yandex Realty offer': { 'flags': ['realty.yandex.ru/offer'], - 'regex': r'({"routing":{"locationBeforeTransitions.+?});', + 'regex': r'({"routing":{"currentRoute".+?});', 'extract_json': True, 'fields': { 'your_yuid': lambda x: x['user']['yuid'], @@ -302,8 +302,8 @@ 'your_name': lambda x: x['user'].get('displayName'), 'your_username': lambda x: x['user'].get('defaultEmail'), 'your_phone': lambda x: x['user'].get('defaultPhone'), - 'yandex_uid': lambda x: x['cards']['offers']['author']['id'], - 'name': lambda x: decode_ya_str(x['cards']['offers']['author']['agentName']) + 'yandex_uid': lambda x: x['offerCard']['card']['author']['id'], + 'name': lambda x: decode_ya_str(x['offerCard']['card']['author']['profile']['name']) } }, 'Yandex Collections': { @@ -448,7 +448,7 @@ 'regex': r'upics\.yandex\.net\/(?P\d+)[\s\S]+(?P.+?)<\/span>\s+(?P.+?)<\/em>([\s\S]+?class="link">(?P.+?)<\/a>)?([\s\S]+?)?', }, 'Yandex O': { - 'flags': ['"cookiesDomain":".o.yandex.ru"'], + 'flags': [''], # NOT PRESENT 'regex': r'', + 'regex': r'', 'extract_json': True, 'transforms': [ json.loads, @@ -535,7 +536,7 @@ 'bio': lambda x: x.get('biography'), 'business_email': lambda x: x.get('business_email'), 'external_url': lambda x: x.get('external_url'), - 'facebook_uid': lambda x: x.get('fbid'), + 'facebook_uid': lambda x: x.get('fbid'), 'is_business': lambda x: x.get('is_business_account'), 'is_joined_recently': lambda x: x.get('is_joined_recently'), 'is_private': lambda x: x.get('is_private'), @@ -545,7 +546,7 @@ } }, 'Instagram API': { - 'flags': ['{"user":{"username"', 'profile_pic_url'], + 'flags': ['{"user":{"pk"', 'profile_pic_url'], 'regex': r'^(.+?)$', 'extract_json': True, 'fields': { @@ -571,7 +572,7 @@ 'bio': lambda x: x.get('biography'), 'business_email': lambda x: x.get('business_email'), 'external_url': lambda x: x.get('external_url'), - 'facebook_uid': lambda x: x.get('fbid'), + 'facebook_uid': lambda x: x.get('fbid'), 'is_business': lambda x: x.get('is_business_account'), 'is_joined_recently': lambda x: x.get('is_joined_recently'), 'is_private': lambda x: x.get('is_private'), @@ -618,7 +619,7 @@ 'extract_json': True, 'transforms': [ json.loads, - lambda x: [v for k,v in x.items() if k.startswith('User:')][0], + lambda x: [v for k, v in x.items() if k.startswith('User:')][0], json.dumps, ], 'fields': { @@ -829,7 +830,7 @@ }, 'Bitbucket': { 'flags': ['https://api.bitbucket.org'], - 'regex': r'({.+?"section": {"profile.+?"whats_new_feed":.+?}});', + 'regex': r'({.+?"section": {"profile.+?"repositories":.+?}});', 'extract_json': True, 'transforms': [ json.loads, @@ -920,7 +921,7 @@ } }, 'Reddit': { - 'flags': ['(.*)