In [18]:
import asyncio
import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
import requests
import feather
import time
import random

In [9]:
async def _fetch(session, url):
    async with session.get(url, timeout=60 * 60) as response:
        return await response.text()


async def _fetch_all(session, urls, loop):
    results = await asyncio.gather(
        *[_fetch(session, url) for url in urls],
        return_exceptions=True  # so we can deal with exceptions later
    )

    return results

def _get_htmls(urls):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    if len(urls) > 1:
        loop = asyncio.get_event_loop()
        connector = aiohttp.TCPConnector(limit=100)
        with aiohttp.ClientSession(loop=loop, connector=connector) as session:
            htmls = loop.run_until_complete(_fetch_all(session, urls, loop))
        raw_result = dict(zip(urls, htmls))
    else:
        raw_result = {urls[0]: requests.get(urls[0], headers=headers).text}
    return raw_result

In [25]:
# Base listing page; further pages are addressed with an `offset` query parameter.
url = 'https://sofifa.com/players'
# The un-offset base URL followed by offsets 80, 160, ..., 19920 (250 URLs in total).
url_list = [url, *(url + '?offset=' + str(offset) for offset in range(80, 250 * 80, 80))]

In [27]:
# Download every listing page. Despite the name, pl_list is the dict returned
# by _get_htmls, keyed by URL. Because the concurrent fetch gathers with
# return_exceptions=True, a value is an exception object (not HTML text) for
# any request that failed.
pl_list = _get_htmls(url_list)

In [37]:
def player_list_from_html(url_text, columns=None):
    """Parse the HTML of one player-listing page into a DataFrame.

    Parameters
    ----------
    url_text : str
        HTML text of a listing page.
    columns : list of str, optional
        Labels for the 13 extracted fields, in the order ID, Name, FullName,
        Age, Nationality, Club, Image, Position, Overall, Potential, Value,
        Wage, URL. Defaults to exactly those names, so the function does not
        rely on a module-level ``col_names`` existing when it is called.

    Returns
    -------
    pandas.DataFrame
        One row per player found. ``Position`` holds a list of position
        strings; the remaining fields are the raw attribute/text values taken
        from the markup.

    Notes
    -----
    Each field is collected as its own list and the lists are combined with
    ``zip``: rows are paired purely by position, and the result is truncated
    to the shortest list if the page yields unequal counts.
    """
    if columns is None:
        columns = ['ID', 'Name', 'FullName', 'Age', 'Nationality', 'Club', 'Image', 'Position',
                   'Overall', 'Potential', 'Value', 'Wage', 'URL']

    soup = BeautifulSoup(url_text, 'html.parser')

    def _divs(css_class):
        # All <div> elements whose class attribute matches css_class.
        return soup.find_all('div', attrs={'class': css_class})

    player_images = soup.find_all('img', attrs={'class': 'player-check'})
    player_image = [img['data-src'] for img in player_images]
    player_id = [img['id'] for img in player_images]

    # 'col-name' cells are consumed alternately: even positions are treated as
    # the player profile cell, odd positions as the club cell.
    name_cells = _divs('col-name')
    profile_cells = name_cells[0::2]
    club_cells = name_cells[1::2]

    player_country = []
    player_name = []
    player_fname = []
    player_url = []
    player_position = []
    for cell in profile_cells:
        # Look the links up once per row instead of once per extracted field.
        links = cell.find_all('a')
        # First link's title -> nationality; second link -> player name, full
        # name (title) and relative profile URL; any further links -> positions.
        player_country.append(links[0]['title'])
        player_name.append(links[1].text.strip())
        player_fname.append(links[1]['title'])
        player_url.append('https://sofifa.com' + links[1]['href'])
        player_position.append([link.text.strip() for link in links[2:]])

    player_club = [cell.find_all('a')[0].text.strip() for cell in club_cells]

    # Overall and potential are read from a <span> inside the cell; the other
    # numeric columns use the cell's own text.
    player_overall = [cell.find('span').text.strip() for cell in _divs('col-digit col-oa')]
    player_poten = [cell.find('span').text.strip() for cell in _divs('col-digit col-pt')]
    player_age = [cell.text.strip() for cell in _divs('col-digit col-ae')]
    player_value = [cell.text.strip() for cell in _divs('col-digit col-vl')]
    player_wage = [cell.text.strip() for cell in _divs('col-digit col-wg')]

    # Order must match `columns`.
    data_list = [player_id, player_name, player_fname, player_age, player_country, player_club,
                 player_image, player_position, player_overall, player_poten, player_value,
                 player_wage, player_url]

    player_df = pd.DataFrame(list(zip(*data_list)), columns=columns)

    return player_df

In [39]:
col_names = ['ID', 'Name', 'FullName', 'Age', 'Nationality', 'Club', 'Image', 'Position', 'Overall', 'Potential',
             'Value', 'Wage', 'URL']

# Parse each downloaded page into its own frame and combine them once at the
# end: growing a DataFrame with .append() inside a loop copies the whole frame
# on every iteration, and DataFrame.append no longer exists in pandas 2.x.
page_frames = []
failed_pages = 0
for elem in pl_list.values():
    if isinstance(elem, BaseException):
        # The concurrent download gathers with return_exceptions=True, so a
        # failed request appears here as the exception object, not as HTML.
        failed_pages += 1
        continue
    page_frames.append(player_list_from_html(elem))

if failed_pages:
    print('Skipped {} page(s) that failed to download'.format(failed_pages))

if page_frames:
    data = pd.concat(page_frames, ignore_index=True)
else:
    # No page could be parsed: keep the expected (empty) schema.
    data = pd.DataFrame(columns=col_names)

In [45]:
# Flatten each list of positions into one comma-separated string. Values that
# are already strings are left untouched so re-running this cell is harmless;
# joining a string would insert ', ' between every character.
data['Position'] = data['Position'].apply(
    lambda positions: positions if isinstance(positions, str) else ', '.join(positions)
)

In [50]:
# Remove exact duplicate rows, then renumber the rows. reset_index() is called
# without drop=True, so the previous row labels are kept as a new 'index' column.
data = data.drop_duplicates().reset_index()

In [51]:
# (rows, columns) of the de-duplicated table. The column count is one more than
# the 13 scraped fields because reset_index() added an 'index' column.
data.shape

(18005, 14)

In [53]:
# Persist the table in two formats: a Feather file named "player_list" (no
# extension) and a CSV. index=False omits the DataFrame's row labels from the
# CSV; the 'index' column created by reset_index() is an ordinary column and is
# still written.
data.to_feather("player_list")
data.to_csv("player_list.csv", index=False)