In [2]:
import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import random
import urllib.request
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Column headers for the scraped player table. The order matters: the scraper
# assigns this list positionally as the header of every row it builds.
# NOTE(review): 'Short Power' is probably meant to be 'Shot Power'; it is left
# unchanged because it is the header written to the CSV files.
columns = [
    # identity, contract and money
    'Year', 'Name', 'Positions', 'Age', 'Rating', 'Potential', 'Team', 'Contract',
    'ID', 'Height', 'Weight', 'Foot', 'Best Overall', 'Best Position', 'Growth',
    'Joined', 'Loan End Date', 'Value', 'Wage', 'Release Clause',
    # attacking
    'Total Attacking', 'Crossing', 'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
    # skill
    'Total Skill', 'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing', 'Ball Control',
    # movement
    'Total Movement', 'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
    # power
    'Total Power', 'Short Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots',
    # mentality
    'Total Mentality', 'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure',
    # defending
    'Total Defending', 'Marking', 'Standing Tackle', 'Sliding Tackle',
    # goalkeeping
    'Total GK', 'GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes',
    # totals and traits
    'Total Stats', 'Base Stats', 'Weak Foot', 'Skill Moves', 'Attacking Work Rate',
    'Defensive Work Rate', 'International Reputation',
    # summary attributes
    'PAC', 'SHO', 'PAS', 'DRI', 'DEF', 'PHY', 'HITS',
]

In [17]:
def _player_row_values(year, td):
    """Flatten one listing row (`td` = its list of <td> cells) into values ordered like `columns`."""
    name_cell, team_cell = td[1], td[5]
    name = name_cell.findAll('a')[1].text.strip()
    # The first character of the joined string is dropped, as the original code did
    # (presumably a leading comma from an empty first link - TODO confirm).
    positions = ','.join(pos.text.strip() for pos in name_cell.findAll('a', {'rel': 'nofollow'}))[1:]
    team = team_cell.find('a').text.strip()
    contract = team_cell.find('div', {'class': 'sub'}).text.strip()

    def cell_text(index):
        return td[index].text.strip()

    return (
        [year, name, positions]
        + [cell_text(i) for i in (2, 3, 4)]       # Age, Rating, Potential
        + [team, contract]                        # both come from cell 5
        + [cell_text(i) for i in range(6, 73)]    # ID ... HITS, one cell per column
    )


def scrape_players_details(year):
    """
    Scrape the sofifa.com player listing for one year and save it as a CSV file.

    Up to 300 listing pages are requested (offsets 0, 60, ..., 17940). Every
    table row becomes one record whose values follow the module-level
    `columns` list, and the result is written to ``player_data_{year}.csv``
    in the current working directory.

    Parameters:
    year  : the year to scrape, as a 4-character string (e.g. "2019");
            its last two characters are used to build the `r=` URL parameter

    Returns:
    None. The only result is the CSV file.

    Notes:
    If a page contains no <tbody>, scraping stops at that page and the rows
    collected so far are still written, instead of failing with an
    AttributeError and losing the whole year.
    """
    year_split = year[2:]
    base_url = f"https://sofifa.com/players?col=oa&sort=desc&showCol%5B0%5D=pi&showCol%5B1%5D=ae&showCol%5B2%5D=hi&showCol%5B3%5D=wi&showCol%5B4%5D=pf&showCol%5B5%5D=oa&showCol%5B6%5D=pt&showCol%5B7%5D=bo&showCol%5B8%5D=bp&showCol%5B9%5D=gu&showCol%5B10%5D=jt&showCol%5B11%5D=le&showCol%5B12%5D=vl&showCol%5B13%5D=wg&showCol%5B14%5D=rc&showCol%5B15%5D=ta&showCol%5B16%5D=cr&showCol%5B17%5D=fi&showCol%5B18%5D=he&showCol%5B19%5D=sh&showCol%5B20%5D=vo&showCol%5B21%5D=ts&showCol%5B22%5D=dr&showCol%5B23%5D=cu&showCol%5B24%5D=fr&showCol%5B25%5D=lo&showCol%5B26%5D=bl&showCol%5B27%5D=to&showCol%5B28%5D=ac&showCol%5B29%5D=sp&showCol%5B30%5D=ag&showCol%5B31%5D=re&showCol%5B32%5D=ba&showCol%5B33%5D=tp&showCol%5B34%5D=so&showCol%5B35%5D=ju&showCol%5B36%5D=st&showCol%5B37%5D=sr&showCol%5B38%5D=ln&showCol%5B39%5D=te&showCol%5B40%5D=ar&showCol%5B41%5D=in&showCol%5B42%5D=po&showCol%5B43%5D=vi&showCol%5B44%5D=pe&showCol%5B45%5D=cm&showCol%5B46%5D=td&showCol%5B47%5D=ma&showCol%5B48%5D=sa&showCol%5B49%5D=sl&showCol%5B50%5D=tg&showCol%5B51%5D=gd&showCol%5B52%5D=gh&showCol%5B53%5D=gc&showCol%5B54%5D=gp&showCol%5B55%5D=gr&showCol%5B56%5D=tt&showCol%5B57%5D=bs&showCol%5B58%5D=wk&showCol%5B59%5D=sk&showCol%5B60%5D=aw&showCol%5B61%5D=dw&showCol%5B62%5D=ir&showCol%5B63%5D=pac&showCol%5B64%5D=sho&showCol%5B65%5D=pas&showCol%5B66%5D=dri&showCol%5B67%5D=def&showCol%5B68%5D=phy&r={year_split}0002&set=true&offset="

    # Rows are collected in a plain list and turned into a DataFrame once at the
    # end: DataFrame.append copied the whole frame per row and no longer exists
    # in pandas 2.x.
    records = []

    for offset in range(300):
        url = base_url + str(offset*60)
        # A timeout keeps one stalled connection from hanging the whole run.
        source_code = requests.get(url, timeout=30)
        # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
        # consider pinning one once it has been checked against the site.
        soup = BeautifulSoup(source_code.text)
        table_body = soup.find('tbody')
        if table_body is None:
            print(f"No player table at offset {offset*60} (HTTP {source_code.status_code}); stopping early.")
            break

        records.extend(_player_row_values(year, row.findAll('td')) for row in table_body.findAll('tr'))

        # Progress report after every 20th page (prints 20, 40, ..., 300).
        pages_done = offset + 1
        if pages_done % 20 == 0:
            print(pages_done)

    df = pd.DataFrame(records, columns=columns)
    df.to_csv(f'player_data_{year}.csv', encoding='utf-8-sig')

In [8]:
# Scrape one CSV per year, 2008 through 2020 inclusive (range's upper bound is exclusive).
for yr in range(2008, 2020 + 1):
    scrape_players_details(f"{yr}")

2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


In [None]:
# Concatenate the per-year player CSV files into one file.

import glob
import os

path = r'C:\Shubhanshu\NEU\SML\Project' # use your path
# Match only the per-year files written by scrape_players_details
# (player_data_<year>.csv). A bare "*.csv" would also read back
# player_data_complete.csv when this cell is re-run from that folder,
# duplicating every row, and would pull in any unrelated CSV stored there.
# sorted() makes the row order independent of the filesystem's listing order.
all_files = sorted(glob.glob(os.path.join(path, "player_data_[0-9]*.csv")))

li = []

for filename in all_files:
    # index_col=None keeps the index column saved by to_csv as an ordinary
    # (unnamed) column, exactly as before.
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# Fail with a readable message instead of pandas' "No objects to concatenate".
if not li:
    raise FileNotFoundError(f"No per-year player CSV files found in {path!r}")

frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('player_data_complete.csv', encoding='utf-8-sig')

### Scraping players' goal and card data to predict their value for the next season

In [1]:
# Column headers for the per-season goal/card records, in the order the
# season scraper fills them.
cols = [
    # which player, which year
    'Year', 'ID', 'Name',
    # which season row
    'Season', 'Team', 'Competition',
    # playing time
    'MinutesPlayed', 'Appearances', 'Lineups',
    'SubstituteIn', 'SubstituteOut', 'SubstituteBench',
    # goals and discipline
    'Goal', 'YellowCard', 'RedCard',
]

In [3]:
def _season_stat_rows(year, pid, name, stats_body):
    """Build one record (ordered like `cols`) per row of `stats_body` whose season label starts with `year`."""
    matching = []
    for stat_row in stats_body.findAll('tr'):
        tdI = stat_row.findAll('td')
        season = tdI[0].text.strip()
        # Only the part of the season label before the first '/' is compared with `year`.
        if season.split('/')[0] != year:
            continue
        matching.append([
            year, pid, name, season,
            tdI[1].find('a').text.strip(),                  # Team
            tdI[2].find('a').text.strip(),                  # Competition
            *(tdI[i].text.strip() for i in range(3, 11)),   # MinutesPlayed ... YellowCard
            # NOTE(review): cell 11 is skipped and RedCard is read from cell 12,
            # as in the original code - confirm against the live table layout.
            tdI[12].text.strip(),
        ])
    return matching


def scrape_player_season_details(year):
    """
    Scrape per-season playing statistics for the players listed on sofifa.com
    for one year and save them as a CSV file.

    Up to 300 listing pages are requested (offsets 0, 60, ..., 17940). For each
    listed player the "/live" page derived from the player's link is fetched,
    and every row of its table whose season label starts with `year` becomes
    one record ordered like the module-level `cols` list. A player whose
    "/live" page has no <tbody> gets a single record with NaN in every
    statistic column. A player whose table has no matching season gets no
    record. The result is written to ``player_goaldata_{year}.csv`` in the
    current working directory.

    Parameters:
    year  : the year to scrape, as a 4-character string (e.g. "2019");
            its last two characters are used to build the `r=` URL parameter

    Returns:
    None. The only result is the CSV file.

    Notes:
    If a listing page contains no <tbody>, scraping stops at that page and the
    records collected so far are still written.
    """
    year_split = year[2:]
    # The offset must be joined with '&'. The previous '?offset=' made it part of
    # the value of `set`, so no offset was ever sent and every iteration asked
    # for the same listing page.
    base_url = f"https://sofifa.com/?r={year_split}0075&set=true&offset="

    # Records are collected in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append copied the whole frame per row and no longer
    # exists in pandas 2.x.
    records = []
    # Season, Team, Competition and the nine statistic columns.
    missing_stats = [np.nan] * (len(cols) - 3)

    for offset in range(300):
        url = base_url + str(offset*60)
        # A timeout keeps one stalled connection from hanging the whole run.
        listing = requests.get(url, timeout=30)
        # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
        # consider pinning one once it has been checked against the site.
        table_body = BeautifulSoup(listing.text).find('tbody')
        if table_body is None:
            print(f"No player table at offset {offset*60} (HTTP {listing.status_code}); stopping early.")
            break

        for row in table_body.findAll('tr'):
            td = row.findAll('td')
            player_anchor = td[1].findAll('a')[0]
            name = player_anchor.text.strip()
            # NOTE(review): this listing URL requests no extra columns (no showCol
            # parameters) - confirm that cell 6 really holds the player ID here.
            pid = td[6].text.strip()

            # Drop the last two '/'-separated parts of the player link and append "/live".
            link = '/'.join(player_anchor.get('href').split('/')[:-2]) + "/live"
            html = requests.get("https://sofifa.com" + link, timeout=30)
            tab_body = BeautifulSoup(html.text).find('tbody')

            if tab_body is None:
                records.append([year, pid, name] + missing_stats)
                continue

            records.extend(_season_stat_rows(year, pid, name, tab_body))

        # Progress report after every 20th listing page (prints 20, 40, ..., 300).
        pages_done = offset + 1
        if pages_done % 20 == 0:
            print(pages_done)

    df = pd.DataFrame(records, columns=cols)
    df.to_csv(f'player_goaldata_{year}.csv', encoding='utf-8-sig')

In [None]:
# Scrape season statistics for each year, 2008 through 2020 inclusive
# (range's upper bound is exclusive).
for yr in range(2008, 2020 + 1):
    scrape_player_season_details(f"{yr}")

In [None]:
# Concatenate the per-year player goal/card CSV files into one file.

import glob
import os

path = r'C:\Shubhanshu\NEU\SML\Project\Goal' # use your path
# Match only the per-year files written by scrape_player_season_details
# (player_goaldata_<year>.csv). A bare "*.csv" would also read back
# player_goaldata_complete.csv when this cell is re-run from that folder,
# duplicating every row, and would pull in any unrelated CSV stored there.
# sorted() makes the row order independent of the filesystem's listing order.
all_files = sorted(glob.glob(os.path.join(path, "player_goaldata_[0-9]*.csv")))

li = []

for filename in all_files:
    # index_col=None keeps the index column saved by to_csv as an ordinary
    # (unnamed) column, exactly as before.
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# Fail with a readable message instead of pandas' "No objects to concatenate".
if not li:
    raise FileNotFoundError(f"No per-year player goal CSV files found in {path!r}")

frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('player_goaldata_complete.csv', encoding='utf-8-sig')