# Gathering player Wikipedia data

## Part 1: Player Wikipedia pages

In [4]:
import csv
import requests
import time

# Read the roster CSV: the first row holds the column names, every following
# row describes one player.
with open("USWNT/playerdata.csv", "r", newline="") as infile:
    csvin = csv.reader(infile)
    headers = next(csvin)
    playerinfo = list(csvin)

# Column positions are looked up by header name rather than hard-coded.
filename = headers.index("Player Filename")
firstname = headers.index("Firstname")
surname = headers.index("Surname")

# Players with these surnames get the "_(soccer)" suffix on their article URL
# (presumably because the plain "First_Last" title is ambiguous -- verify when
# adding players).
soccer_suffix_surnames = {"Campbell", "Smith", "Williams", "Fox"}

wikidata = []

for singleplayer in playerinfo:
    playerfilename = singleplayer[filename]
    playerfirstname = singleplayer[firstname]
    playersurname = singleplayer[surname]

    # Make Players' Wikipedia URLs
    wikiplayerurl = "https://en.wikipedia.org/wiki/" + playerfirstname + "_" + playersurname
    if playersurname in soccer_suffix_surnames:
        wikiplayerurl += "_(soccer)"

    # Make filenames to write the HTML from the Wikipedia pages:
    # "PlayerWiki" + "/<article title>" + ".html"
    last_slash = wikiplayerurl.rindex("/")
    playerwikifilename = "PlayerWiki" + wikiplayerurl[last_slash:] + ".html"

    # Request HTML from Wikipedia. The request is made before the file is
    # opened so a failed request does not leave an empty file behind, and the
    # context manager closes the file even if the write fails.
    # NOTE: the platform default encoding is kept on purpose, because the cell
    # that parses these pages re-opens them in text mode the same way.
    result = requests.get(wikiplayerurl)
    with open(playerwikifilename, "w") as outfile:
        print(result.text, file=outfile)
    time.sleep(2)  # Pause between requests

    # Collect player names and filenames to keep track of what's where
    wikidata.append([playerfilename, playerfirstname, playersurname, playerwikifilename])

# Write it out to a csv for preservation and accountability
wikidataheaders = ["Player Filename", "Firstname", "Surname", "PlayerWiki Filename"]

# newline="" lets the csv module control line endings; without it, platforms
# that translate "\n" end up with blank rows between records.
with open('PlayerWiki/playerwikidata.csv', 'w', newline="") as outfile:
    csv_out = csv.writer(outfile)
    csv_out.writerow(wikidataheaders)
    csv_out.writerows(wikidata)

## Part 2: Finding player college teams and the Wikipedia page for the teams

In [5]:
from lxml import html

# Reload the index written by the download step: one row per player, including
# the path of the saved Wikipedia page.
with open('PlayerWiki/playerwikidata.csv', "r", newline="") as infile:
    csvin = csv.reader(infile)
    wikidataheaders = next(csvin)
    wikidata = list(csvin)

playerwikihtml = wikidataheaders.index("PlayerWiki Filename")

for playerwiki in wikidata:
    playerwikihtmlfile = playerwiki[playerwikihtml]
    with open(playerwikihtmlfile, "r") as infile:
        page = infile.read()
    tree = html.fromstring(page)

    # Identify every header, made distinct by the background color.
    headers = tree.xpath('//th[contains(@style,"background-color: #b0c4de")]')

    # Reset for every player: a header found on a previous page must never be
    # reused for a page that has no "College career" section of its own.
    collegelocation = None
    nextheader = None
    for counter, element in enumerate(headers):
        if element.text == "College career":  # Find which header has the college career
            collegelocation = element
            # The section ends at whatever header follows; there may be none
            # when "College career" is the last header on the page.
            if counter + 1 < len(headers):
                nextheader = headers[counter + 1]
            else:
                nextheader = None

    collegelinks = []
    if collegelocation is not None and nextheader is not None:
        try:
            # Only look at the raw HTML between the two header texts
            collegehtml = page[page.index(collegelocation.text):page.index(nextheader.text)]
            littletree = html.fromstring(collegehtml)
            for link in littletree.xpath('//a/@href'):
                # Weed out cite notes ("#...") and hrefs that have no "/" to
                # cut the article name from.
                if link.startswith("#") or "/" not in link:
                    continue
                collegelinks.append(link[link.rindex('/'):])
        except Exception:
            # Best effort: a header text that cannot be located, or a slice
            # lxml cannot parse, is recorded as "no college" instead of
            # aborting the run. Unlike a bare except, this still lets
            # KeyboardInterrupt stop the loop.
            collegelinks = []

    if collegelinks:
        playerwiki.extend(collegelinks)
    else:
        # Always add a cell so the row has a value in the first college column.
        playerwiki.append("NoCollege")


wikidataheaders.append("First College Team Link")
wikidataheaders.append("Second College Team Link")
# newline="" lets the csv module control line endings (no blank rows).
with open('Collegeteam/collegeteamlinks.csv', 'w', newline="") as outfile:
    csv_out = csv.writer(outfile)
    csv_out.writerow(wikidataheaders)
    csv_out.writerows(wikidata)

In [12]:
import requests
import time

# Reload the per-player college team links.
with open('Collegeteam/collegeteamlinks.csv', 'r', newline="") as infile:
    csvin = csv.reader(infile)
    wikidataheaders = next(csvin)
    wikidata = list(csvin)

# Index first and second college teams, since some players attended two schools
firstcollegeteam = wikidataheaders.index("First College Team Link")
secondcollegeteam = wikidataheaders.index("Second College Team Link")

# Team pages already saved during this run; many players share a team, so each
# page only needs to be requested once.
downloaded = set()

for playerwiki in wikidata:
    # A row that stops before the second-college column has no second college,
    # so add in a null value.
    if len(playerwiki) <= secondcollegeteam:
        playerwiki.append("N/A")

    firstcollege = playerwiki[firstcollegeteam]
    secondcollege = playerwiki[secondcollegeteam]

    # Each column has its own "nothing to download" marker. No extra cell is
    # appended for players without a college: the row already has exactly one
    # cell per header, and appending here would make it longer on every run
    # (this cell rewrites the file it reads).
    for teamlink, skipmarker in ((firstcollege, "NoCollege"), (secondcollege, "N/A")):
        if teamlink == skipmarker or teamlink in downloaded:
            continue

        # teamlink starts with "/", so it completes both the URL and the path.
        # Request first, then open the file, so a failed request leaves no
        # empty file behind.
        result = requests.get("https://en.wikipedia.org/wiki" + teamlink)
        # UTF-8 is written explicitly because the saved page is later parsed
        # from raw bytes.
        with open('Collegeteam' + teamlink + '.html', 'w', encoding="utf-8") as outfile:
            print(result.text, file=outfile)
        downloaded.add(teamlink)
        time.sleep(2)  # Pause between requests

# Write the (padded) rows back; newline="" lets the csv module control line
# endings so no blank rows appear.
with open('Collegeteam/collegeteamlinks.csv', 'w', newline="") as outfile:
    csv_out = csv.writer(outfile)
    csv_out.writerow(wikidataheaders)
    csv_out.writerows(wikidata)

## Part 3: Finding the Wiki page for the actual university

In [13]:
from lxml import html

# Reload the per-player college team links.
with open('Collegeteam/collegeteamlinks.csv', 'r', newline="") as infile:
    csvin = csv.reader(infile)
    wikidataheaders = next(csvin)
    wikidata = list(csvin)

firstcollegeteam = wikidataheaders.index("First College Team Link")
secondcollegeteam = wikidataheaders.index("Second College Team Link")

wikidataheaders.append("First College")
wikidataheaders.append("Second College")


def _university_link(teamlink):
    """Return the 'Collegewiki/<article>' path of the university behind a team page.

    teamlink is the "/<article>" value stored in a college team link column;
    the saved team page is read from 'Collegeteam' + teamlink + '.html'.
    When the page has no link in a "University" table row, the team link itself
    is reused as the university article.
    """
    with open('Collegeteam' + teamlink + ".html", 'rb') as infile:
        tree = html.fromstring(infile.read())
    # Identify the link to the university's page
    college = tree.xpath('//tr[contains(th,"University")]/td/a/@href')
    if not college:
        return 'Collegewiki' + teamlink
    # Only the first match is used: there is a single column per college, and
    # one cell per matching link would shift every later column.
    link = college[0]
    return 'Collegewiki' + link[link.rindex('/'):]  # Removes the /wiki from the beginning of the url


for playerwiki in wikidata:
    firstcollege = playerwiki[firstcollegeteam]
    secondcollege = playerwiki[secondcollegeteam]

    # Exactly one cell is appended per college column, whatever the page holds.
    if firstcollege == "NoCollege":
        playerwiki.append("N/A")
    else:
        playerwiki.append(_university_link(firstcollege))

    if secondcollege == "N/A":
        playerwiki.append("N/A")
    else:
        playerwiki.append(_university_link(secondcollege))


# newline="" lets the csv module control line endings (no blank rows).
with open('Collegewiki/collegelinks.csv', 'w', newline="") as outfile:
    csv_out = csv.writer(outfile)
    csv_out.writerow(wikidataheaders)
    csv_out.writerows(wikidata)


In [14]:
# Reload the per-player university links.
with open("Collegewiki/collegelinks.csv", "r", newline="") as infile:
    csvin = csv.reader(infile)
    wikidataheaders = next(csvin)
    wikidata = list(csvin)

firstcollege = wikidataheaders.index("First College")
secondcollege = wikidataheaders.index("Second College")

# University pages already saved during this run; many players attended the
# same school, so each page only needs to be requested once.
fetched = set()

for playerwiki in wikidata:
    # Read in the first and the second college's page, if applicable
    for collegelink in (playerwiki[firstcollege], playerwiki[secondcollege]):
        if collegelink == "N/A" or collegelink in fetched:
            continue

        # The stored value is "Collegewiki/<article>": the part from the last
        # "/" completes the URL, and the whole value is the output path.
        collegeurl = 'https://wikipedia.org/wiki' + collegelink[collegelink.rindex('/'):]

        # Request first, then open the file, so a failed request leaves no
        # empty file behind; the context manager closes the file even if the
        # write fails.
        result = requests.get(collegeurl)
        # UTF-8 is written explicitly because the saved page is later parsed
        # from raw bytes.
        with open(collegelink + '.html', 'w', encoding="utf-8") as outfile:
            print(result.text, file=outfile)
        fetched.add(collegelink)
        time.sleep(2)  # Pause between requests


In [15]:
# Reload the per-player university links.
with open("Collegewiki/collegelinks.csv", "r", newline="") as infile:
    csvin = csv.reader(infile)
    wikidataheaders = next(csvin)
    wikidata = list(csvin)

firstcollege = wikidataheaders.index("First College")
secondcollege = wikidataheaders.index("Second College")

# Adding headers to the csv for all the data I'm going to collect: the same six
# fields for the first and for the second college.
collegefields = ["Name", "Public", "Private", "Community", "Location", "Enrollment"]
for prefix in ("First College", "Second College"):
    for field in collegefields:
        wikidataheaders.append(prefix + " " + field)


def _type_flag(tree, word):
    """Return 'y' if a link in the page's "Type" row contains word, else 'n'."""
    matches = tree.xpath('//tr[contains(th,"Type")]/td/a[contains(text(),"' + word + '")]/text()')
    return 'y' if matches else 'n'


def _college_values(collegelink):
    """Return the six cells (in collegefields order) for one college.

    collegelink is either "N/A" or the path, without ".html", of a saved
    university page.
    """
    if collegelink == "N/A":
        # Add null values to fill the space that won't have data: one per
        # declared column, so the row stays aligned with the header.
        return ["N/A"] * len(collegefields)

    with open(collegelink + '.html', 'rb') as infile:
        tree = html.fromstring(infile.read())

    schoolname = tree.xpath('//h1/text()')  # Identify the name of the school
    students = tree.xpath('//tr[contains(th,"Students")]/td/text()')  # Identify the number of students
    location = tree.xpath('//tr[contains(th,"Location")]/td/div/a/text()')  # Identify the location

    return [
        ",".join(schoolname),
        _type_flag(tree, "Public"),     # Is it a public university?
        _type_flag(tree, "Private"),    # Is it a private university?
        _type_flag(tree, "Community"),  # Is it a community college?
        ",".join(location),
        ",".join(students),
    ]


for playerwiki in wikidata:
    # Look both links up before the row is extended.
    firstcollegelink = playerwiki[firstcollege]
    secondcollegelink = playerwiki[secondcollege]

    playerwiki.extend(_college_values(firstcollegelink))
    playerwiki.extend(_college_values(secondcollegelink))


# newline="" lets the csv module control line endings (no blank rows).
with open('Collegewiki/collegedata.csv', 'w', newline="") as outfile:
    csv_out = csv.writer(outfile)
    csv_out.writerow(wikidataheaders)
    csv_out.writerows(wikidata)

## Documentation for this dataset can be found at: