# <center> SoFIFA Player Stats</center>
## <center>Using Scrapy and Jupyter Notebook to Download Player Stats from SoFIFA.com</center>

For this project I followed the Scrapy tutorial from the official docs (1) and the JJ's World blog post on using Scrapy in a Jupyter notebook (2).
<p>(1) <a href=https://doc.scrapy.org/en/latest/intro/tutorial.html>https://doc.scrapy.org/en/latest/intro/tutorial.html</a></p>
<p>(2) <a href=https://www.jitsejan.com/using-scrapy-in-jupyter-notebook.html>https://www.jitsejan.com/using-scrapy-in-jupyter-notebook.html</a></p>

Please make sure to run the URL Spider first! This notebook reads `sofi_urls.jl` from the current directory and will fail if that file is missing.

In [1]:
import scrapy 
from scrapy.crawler import CrawlerProcess
import pandas as pd
import json

### Pipeline Setup

In [2]:
class JsonStatsWriterPipeline(object):
    """Item pipeline that stores every scraped item in ``sofi_stats.jl``.

    The output is a JSON Lines file: one JSON object per line, written in the
    order the items reach the pipeline.
    """

    def open_spider(self, spider):
        """Open ``sofi_stats.jl`` for writing, truncating any existing file."""
        self.file = open('sofi_stats.jl', 'w')

    def close_spider(self, spider):
        """Close the output file opened by ``open_spider``."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as one JSON line and return it unchanged."""
        record = dict(item)
        self.file.write(f"{json.dumps(record)}\n")
        return item

### Stats Spider Setup

In [3]:
import re
import logging

class Sofi_Stats_Spider(scrapy.Spider):
    """Spider that scrapes one dict of player stats per URL listed in ``sofi_urls.jl``.

    The URL file is read while the class body executes, so ``sofi_urls.jl``
    must exist in the current working directory before this class is defined.
    Every non-blank line of that file must be a JSON object with a ``url`` key.

    Scraped items are routed to ``__main__.JsonStatsWriterPipeline`` through
    ``custom_settings``.

    ``parse`` relies on fixed CSS class names and on fixed positions inside the
    selected text, so it only works for pages with the layout it was written
    against; a page that deviates raises (e.g. ``IndexError``/``ValueError``)
    for the mandatory fields.
    """

    name = 'stats_sofi'

    # Load one JSON record per line from the URL file. This runs as part of the
    # class body, i.e. as soon as the class statement is executed.
    data = []
    with open("sofi_urls.jl", "r") as read_file:
        for line in read_file:
            if line.strip():  # a blank line is not valid JSON; skip it
                data.append(json.loads(line))

    start_urls = [x['url'] for x in data]  # use our links in the json file

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonStatsWriterPipeline': 1},
        #'FEED_FORMAT':'json',
        #'FEED_URI':'sofi_stats.json',  # Uncomment these if you want json rather than jl files
    }

    @staticmethod
    def _without_change_markers(values, prefixes):
        """Return a new list of ``values`` without entries starting with any of ``prefixes``.

        A new list is built on purpose: removing entries from a list while
        iterating over it skips the element that follows each removal, which
        would leave consecutive markers in place and shift every positional
        index used afterwards. ``str.startswith`` is also safe for empty strings.

        Args:
            values: list of strings.
            prefixes: tuple of prefix strings, e.g. ``('+', '-')``.
        """
        return [value for value in values if not value.startswith(prefixes)]

    @staticmethod
    def _item_or_none(values, index):
        """Return ``values[index]``, or ``None`` when ``index`` is out of range."""
        try:
            return values[index]
        except IndexError:
            return None

    def parse(self, response):
        """Extract the stats of a single player page.

        Args:
            response: the Scrapy response for one player URL.

        Yields:
            dict: one flat dict with the keys ``name``, ``id``, ``full_name``,
            ``country``, ``positions``, ``age``, ``dob``, ``height``,
            ``weight``, ``overall``, ``potential``, ``value``, ``wage``,
            ``preferred_foot``, ``int_rep``, ``weak_foot``, ``skill_moves``,
            ``work_rate``, ``body_type``, ``real_face``, ``release_clause``,
            the club and national-team fields, one lower-cased key per skill
            attribute, and ``traits``. Optional fields (release clause, club
            and national-team data) are ``None`` when the page lacks them.
        """
        sd = {}  # Create the dictionary

        # Name and SoFIFA ID: the heading text is split at "(" into the name
        # part and the ID part, then trimmed by fixed offsets.
        name, ID = response.css('.info h1::text').getall()[0].split('(')
        sd['name'] = name[:-1]
        sd['id'] = ID[4:-2]

        # Full Name, Country, Position(s)
        sd['full_name'] = response.css('.bp3-text-overflow-ellipsis::text').get()[:-1]
        sd['country'] = response.css('.bp3-text-overflow-ellipsis a::attr(title)').get()
        sd['positions'] = response.css('.meta.bp3-text-overflow-ellipsis span').re('>(.*)<')

        # Age, DOB, Height, Weight: whitespace-separated tokens of the last
        # text node of the meta line, addressed by position.
        helper = response.css('.meta.bp3-text-overflow-ellipsis::text').getall()[-1].split()

        sd['age'] = int(helper[1])
        dob = helper[2] + ' ' + helper[3] + ' ' + helper[4]
        sd['dob'] = dob[1:-1]  # drop the first and last character (presumably parentheses)

        # Height: split on the character at index 1 (so a one-character first
        # part is assumed), strip the double quote, and store
        # first*12 + second (presumably feet/inches -> total inches).
        ht = helper[5]
        ht = ht.split(ht[1])
        ht[1] = ht[1].replace('"', '')
        sd['height'] = int(ht[0]) * 12 + int(ht[1])

        # NOTE(review): takes exactly the first three characters, so this
        # assumes a three-digit weight -- confirm against the page format.
        sd['weight'] = int(helper[6][:3])

        # Overall Rating, Potential, Value, Wage
        # "+N" change markers are dropped first so the positions below line up.
        helper = self._without_change_markers(
            response.css('.column.col-4.text-center span::text').getall(), ('+',))

        sd['overall'] = int(helper[0])
        sd['potential'] = int(helper[1])
        sd['value'], sd['wage'] = helper[2:4]

        # Preferred Foot, International Reputation, Weak Foot, Skill Moves
        # (every second text node, at positions 1, 3, 5 and 7)
        profile_css = '.column.col-6 ul.bp3-text-overflow-ellipsis.pl li'
        sd['preferred_foot'], int_rep, weak_foot, skill_moves = \
            response.css(profile_css + '::text').getall()[1:8:2]
        sd['int_rep'] = int(int_rep)
        sd['weak_foot'] = int(weak_foot)
        sd['skill_moves'] = int(skill_moves)

        # Work Rate, Body Type, Real Face, Release Clause
        helper = response.css(profile_css + ' span::text').getall()

        sd['work_rate'] = helper[0]
        sd['body_type'] = helper[1]
        sd['real_face'] = helper[2]
        sd['release_clause'] = self._item_or_none(helper, 3)  # optional on the page

        # Club and national-team data share the same three selections, so each
        # one is evaluated once and then indexed by position.
        team_css = '.bp3-text-overflow-ellipsis.pl.text-right li'
        team_names = response.css(team_css + ' h6').re('\">(.*)</a>')
        team_spans = response.css(team_css + ' span::text').getall()
        team_texts = response.css(team_css + '::text').getall()

        # Club, Club Rating, Club Position, Jersey Number
        sd['club'] = self._item_or_none(team_names, 0)

        # These three are all-or-nothing: if any one is missing, all are None.
        if len(team_spans) > 1 and len(team_texts) > 1:
            sd['club_rating'] = team_spans[0]
            sd['club_position'] = team_spans[1]
            sd['jersey_number'] = team_texts[1]
        else:
            sd['club_rating'] = None
            sd['club_position'] = None
            sd['jersey_number'] = None

        # Country, country_rating, country_position, country_jersey
        # NOT ALL PLAYERS PLAY FOR THE NATIONAL TEAM
        sd['national_team'] = self._item_or_none(team_names, 1)
        sd['nt_rating'] = self._item_or_none(team_spans, 2)
        sd['nt_position'] = self._item_or_none(team_spans, 3)
        sd['nt_jersey'] = self._item_or_none(team_texts, 5)

        # All Skill Attributes
        # We need to remove any notices that the attribute has increased or
        # decreased ("+N" / "-N") before relying on positions.
        helper = self._without_change_markers(
            response.css('ul li span::text').getall(), ('+', '-'))

        # The list alternates value, label; start at the value just before
        # the 'Crossing' label.
        att_list = helper[(helper.index('Crossing') - 1):]

        # The selection apparently lacks the 'Composure' and 'GK ...' labels,
        # so they are inserted at the fixed positions where they belong.
        fixed_list = att_list[:51]
        fixed_list.append('Composure')
        fixed_list.extend(att_list[51:58])
        for offset, gk_label in enumerate(
                ('GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning')):
            fixed_list.append(gk_label)
            fixed_list.append(att_list[58 + offset])
        fixed_list.append('GK Reflexes')
        fixed_list.extend(att_list[62:])

        # The first 68 entries are 34 (value, label) pairs; the rest are traits.
        attribute_entries = fixed_list[:68]
        for value, label in zip(attribute_entries[0::2], attribute_entries[1::2]):
            sd[label.lower()] = int(value)

        sd['traits'] = fixed_list[68:]

        yield sd
        

### Start the Crawler for the Stats

In [4]:
# Run the stats spider inside this kernel with a desktop-browser User-Agent.
# (The stray ")" that trailed the original User-Agent string has been removed.)
# NOTE(review): process.start() is expected to block until the crawl finishes,
# and Twisted's reactor cannot be started twice, so re-running this cell
# presumably requires a kernel restart -- confirm against the Scrapy docs.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
})
process.crawl(Sofi_Stats_Spider)
process.start()

2019-11-19 22:44:57 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2019-11-19 22:44:57 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.4 (default, Aug 13 2019, 15:17:50) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1c  28 May 2019), cryptography 2.7, Platform Darwin-19.0.0-x86_64-i386-64bit
2019-11-19 22:44:57 [scrapy.crawler] INFO: Overridden settings: {'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)'}


### Move the file to the data folder

In [1]:
!mv sofi_stats.jl data