# USWNT Player Data Project

## Rachel Bellavia

## Step 1:

Scrape the player names from the stats page and use them to build the player bio URLs.

In [119]:
# Download the stats page as html and save it to <year>/statspage.html

year = "2009"  # input the year for the stats you want

import os
import requests
import time

url = "https://www.ussoccer.com/womens-national-team/stats/" + year + "-statistics"

# The output lives in a per-year folder; create it if this is the first run for that year
os.makedirs(year, exist_ok=True)

result = requests.get(url)

# Written as UTF-8 (rather than the platform default) so any character in the
# page can be saved, and inside a with-block so the file is always closed
with open(year + "/statspage.html", "w", encoding="utf-8") as outfile:
    print(result.text, file=outfile)

time.sleep(2)  # wait two seconds after the request

In [126]:
# Create URLs from player names

year = "2015"  # input the year for the stats you want

from lxml import html
import string

def clean_line(line):
    """Return a copy of ``line`` with every character in ``string.punctuation`` removed."""
    # A translation table that maps each punctuation character to "delete"
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return line.translate(strip_punctuation)

# Read the saved stats page as raw bytes and parse it into an element tree
with open(year + "/statspage.html", "rb") as infile:
    statshtml = infile.read()

tree = html.fromstring(statshtml)

# Rows of each table body, skipping the first row and the last two rows
# (the last two are headers for totals)
player_rows = '//tbody/tr[(position()>1) and (position() < last()-1)]'

# Only the first column (player names) is pulled. For 2015 the name text is
# read from a <span> inside the cell; for other years it is read from the cell itself.
name_text = '/td[1]/span/text()' if year == "2015" else '/td[1]/text()'
names = tree.xpath(player_rows + name_text)

# Goalies appear twice on the page, so collapse duplicate names
uniquenames = set(names)

# Labels that show up in the name column but are not players
non_player_labels = {'own goal', 'own goals received', 'owngoal'}

# Build a list of [lastname, firstname] pairs from the scraped names.
# Iterating in sorted order makes the result (and the URL file built from it)
# come out the same on every run; plain set iteration order is arbitrary.
playernames = []
for name in sorted(uniquenames):
    label = name.strip().lower()
    # Skip whitespace-only text nodes of any length, and the own-goal rows
    if not label or label in non_player_labels:
        continue
    # "Last, First" -> ['last', 'first'] (lowercase, internal spaces removed)
    playernames.append(label.replace(" ", "").split(","))
        
# Players whose bio URL cannot be built from the name on the stats page.
# Each rule is (name part to test, value to match, bio URL). Rules are tried
# top to bottom and the first match wins, so the order is significant.
# NOTE(review): the 'kreiger' key and the 'noguiera' URL are reproduced exactly
# as given; they presumably mirror spellings used by the site - verify before "correcting".
url_exceptions = [
    ('last', 'ertz', "https://www.ussoccer.com/players/j/julie-johnston"),  # Julie Ertz, whose url is her maiden name
    ('last', 'vanhollebeke', "https://www.ussoccer.com/players/b/rachel-buehler"),
    ('last', 'loyden', "https://www.ussoccer.com/players/l/jillian-loyden"),
    ('first', 'leighann', "https://www.ussoccer.com/players/r/lee-ann-robinson"),
    ('last', 'cheney', "https://www.ussoccer.com/players/h/lauren-holiday"),
    ('last', 'kreiger', "https://www.ussoccer.com/players/k/ali-krieger"),
    ('last', 'nogueira', "https://www.ussoccer.com/players/n/casey-noguiera"),
]

# Write one bio URL per line; the with-block closes the file even if a name
# in playernames is malformed and the loop raises part-way through
with open(year + '/urls.txt', 'w') as outfile:
    for lastname, firstname in playernames:
        name_parts = {'last': lastname, 'first': firstname}
        for part, value, exception_url in url_exceptions:
            if name_parts[part] == value:
                playerurl = exception_url
                break
        else:
            # Default pattern: /players/<first letter of last name>/<first>-<last>, punctuation removed
            playerurl = "https://www.ussoccer.com/players/" + lastname[0] + "/" + clean_line(firstname) + "-" + clean_line(lastname)
        print(playerurl, file=outfile)

## Step 2:

Request each player bio URL and save the returned HTML to a file.

In [None]:
# Loops through urls and saves html for each player bio page

year = "2015"  # input the year for the stats you want

import requests
import time

# Read the whole URL list up front so the file is closed before any requests are made
with open(year + '/urls.txt', "r") as infile:
    list_of_urls = infile.read().split()

for url in list_of_urls:
    last_slash = url.rindex("/")
    filename = year + "/" + url[last_slash + 1:] + ".html" # Last slash pulls only the final part of the url, i.e. the player name, for the file name

    # Fetch first, then open the output file, so a failed request does not
    # leave an empty .html file behind
    result = requests.get(url)

    # Written as UTF-8 (rather than the platform default) so any character in
    # the page can be saved; the with-block closes each file before the next request
    with open(filename, "w", encoding="utf-8") as outfile:
        print(result.text, file=outfile)

    time.sleep(2)  # wait two seconds before the next request

## Step 3:

Read in the html files and organize the desired information into a database.

In [123]:
year = "2015"  # input the year for the stats you want

from lxml import html

def checkFor1Result(xpathresult, missing_value):
    """Reduce a list of xpath matches to a single value.

    Catches any bio information field that might be blank.

    Returns the only item of ``xpathresult`` when it holds exactly one item,
    or ``missing_value`` when it is empty.

    Raises ValueError when ``xpathresult`` holds more than one item.
    """
    count = len(xpathresult)
    if count == 0:
        return missing_value
    if count == 1:
        return xpathresult[0]
    raise ValueError("Your list had " + str(count) + " items instead of 1. Shutting down the program.")

# Load the whitespace-separated list of bio URLs written earlier
with open(year + "/urls.txt", "r") as infile:
    list_of_urls = infile.read().split()

# Create the file names again, this time for reading instead of writing:
# <year>/<text after the last slash of the url>.html
filenames = [
    year + "/" + url[url.rindex("/") + 1:] + ".html"
    for url in list_of_urls
]

# Shared prefix for the fields read from inside the "player-details" div
details = '//div[starts-with(@class, "player-details")]'

allplayers = []  # one row (list) per player file
for playerfile in filenames:
    # Parse the saved bio page from its raw bytes
    with open(playerfile, 'rb') as infile:
        playerhtml = infile.read()
    tree = html.fromstring(playerhtml)

    # Pull each field as a list of xpath matches
    firstname = tree.xpath('//span[@itemprop = "givenName"]/text()')
    lastname = tree.xpath('//span[@itemprop = "familyName"]/text()')
    position = tree.xpath('//span[@class = "bio-position"]/text()')
    bdate = tree.xpath(details + '//p//@datetime')
    height = tree.xpath(details + '//p/span/text()')
    hometown = tree.xpath(details + '//p[@itemprop = "address"]/text()')
    club = tree.xpath(details + '//p[@itemprop = "name"]/text()')

    # Collapse each match list to one value (or its placeholder when the
    # field is blank), with the source file name as the first entry
    allinfo = [
        playerfile,
        checkFor1Result(firstname, "NoFirstName"),
        checkFor1Result(lastname, "NoLastName"),
        checkFor1Result(position, "NoPosition"),
        checkFor1Result(bdate, "NoBirthdate"),
        checkFor1Result(height, "NoHeight"),
        checkFor1Result(hometown, "NoHometown"),
        checkFor1Result(club, "NoClub"),
    ]

    # Add this player's row to the list of lists
    allplayers.append(allinfo)

# Write out all the player info to a csv file
import csv

# Column names, in the same order the values are stored in each player's row
player_headers = ['Filename','Firstname', 'Surname', 'Position','Birthdate', 'Height', 'Hometown', 'Club']

# newline='' lets the csv writer control line endings itself; without it,
# platforms that translate "\n" on write end up with a blank line after every row.
# The with-block closes the file even if writing fails part-way through.
with open(year + '/playerdata.csv', 'w', newline='') as outfile:
    csv_out = csv.writer(outfile)
    csv_out.writerow(player_headers)
    csv_out.writerows(allplayers)

## Future Work

Go back through previous years and pull the player info for those years into their own datasets.

Then: do some stuff with it! 