# Final Code

## Dictionaries to associate state, division, and region codes with states, according to the abbreviations used by US Soccer.

Below is a manually created dictionary to turn US Soccer state abbreviations into the proper name for the state.

In [7]:
# Maps the state abbreviation that follows the comma in a US Soccer hometown
# string (e.g. "Calif.") to the full state name, which is the key format used
# when looking up census codes by state name.
stateabbreviations = {
    'Ala.': 'Alabama',
    'Alaska': 'Alaska',
    'Ariz.': 'Arizona',
    'Ark.': 'Arkansas',
    'Calif.': 'California',
    'Colo.': 'Colorado',
    'Conn.': 'Connecticut',
    'Del.': 'Delaware',
    'D.C.': 'District of Columbia',
    'Fla.': 'Florida',
    'Ga.': 'Georgia',
    'Hawaii': 'Hawaii',
    'Idaho': 'Idaho',
    'Ill.': 'Illinois',
    'Ind.': 'Indiana',
    'Iowa': 'Iowa',
    'Kan.': 'Kansas',
    'Ky.': 'Kentucky',
    'La.': 'Louisiana',
    'Maine': 'Maine',
    'Md.': 'Maryland',
    'Mass.': 'Massachusetts',
    'Mich.': 'Michigan',
    'Minn.': 'Minnesota',
    'Miss.': 'Mississippi',
    'Mo.': 'Missouri',
    'Mont.': 'Montana',
    'Neb.': 'Nebraska',
    'Nev.': 'Nevada',
    'N.H.': 'New Hampshire',
    'N.J.': 'New Jersey',
    'N.M.': 'New Mexico',
    'N.Y.': 'New York',
    'N.C.': 'North Carolina',
    'N.D.': 'North Dakota',
    'Ohio': 'Ohio',
    'Okla.': 'Oklahoma',
    'Ore.': 'Oregon',
    'Pa.': 'Pennsylvania',
    'R.I.': 'Rhode Island',
    'S.C.': 'South Carolina',
    'S.D.': 'South Dakota',
    'Tenn.': 'Tennessee',
    'Texas': 'Texas',
    'Utah': 'Utah',
    'Vt.': 'Vermont',
    'Va.': 'Virginia',
    'Wash.': 'Washington',
    'W.V.': 'West Virginia',
    # The other keys follow AP style, whose West Virginia form is "W.Va.";
    # accept it as well so such a hometown does not raise KeyError.
    'W.Va.': 'West Virginia',
    'Wis.': 'Wisconsin',
    'Wyo.': 'Wyoming',
}

Next, I used a CSV file of census codes (geocodes.csv) to build a dictionary that maps each state name to its state, division, and region codes.

Census codes file downloaded from https://www2.census.gov/programs-surveys/popest/geographies/2016/state-geocodes-v2016.xls

In [9]:
import csv

# Read the census geocodes table: the first CSV row is the header, every
# following row is kept as a list of strings.
# "utf-8-sig" drops a leading byte-order mark when one is present, so the first
# header is read as "Region" whether or not the file was saved with a BOM and
# regardless of the platform's default encoding.
with open("geocodes.csv", "r", newline="", encoding="utf-8-sig") as infile:
    csvin = csv.reader(infile)
    geoheaders = next(csvin)
    geocodes = list(csvin)

# Column positions (integers) of the fields needed from each geocodes row.
name = geoheaders.index('Name')
state = geoheaders.index('State (FIPS)')
division = geoheaders.index('Division')
region = geoheaders.index('Region')

# codedict[<Name column value>] -> {'State': ..., 'Division': ..., 'Region': ...}
# with each code kept as the string read from the file. If a name occurs on
# more than one row, the last row wins.
codedict = {
    statecode[name]: {
        'State': statecode[state],
        'Division': statecode[division],
        'Region': statecode[region],
    }
    for statecode in geocodes
}
    


## Read in player data and locate census codes and population for the hometowns of each player.

Documentation for playerdata.csv is located at: **TODO: add link**

Documentation for censusplace.csv is located at: **TODO: add link**

In [23]:
import csv

# Load the player table: the first CSV row is the header, the rest are player rows.
with open("2018Data/USWNT/playerdata.csv", "r", newline="") as player_file:
    player_reader = csv.reader(player_file)
    playerheaders = next(player_reader)
    playerinfo = list(player_reader)

# Position of the column holding each player's hometown string.
hometown = playerheaders.index("Hometown")

# Load the census place table the same way; this file is decoded as latin-1.
with open("2018Data/Census/censusplace.csv", "r", newline="", encoding='latin-1') as census_file:
    census_reader = csv.reader(census_file)
    censusheaders = next(census_reader)
    census = list(census_reader)

# Positions of the census columns used for matching: state code, place name, population.
state_code, place_name, population = (
    censusheaders.index(column)
    for column in ("Geo_STATE", "Geo_NAME", "SE_T001_001")
)


# Hometowns that the census lists under a different name than the player data.
town_name_overrides = {
    "St. Simons Island": "St. Simons",
    "Ventura": "San Buenaventura",
}

# Census places that share a prefix with a player's hometown but are a different place.
excluded_places = {"Mesa del Caballo CDP"}

# Bucket the census rows by state code once, so each player only scans the
# places in their own state instead of the whole census table.
census_by_state = {}
for place in census:
    census_by_state.setdefault(place[state_code], []).append(place)

for player in playerinfo:
    # Hometown is "<town>, <state abbreviation>"; split it into its two parts.
    splithometown = player[hometown].split(",")
    townname = splithometown[0].strip()
    # Named state_abbrev (not `state`) so the geocodes column index called
    # `state` is not overwritten by this loop.
    state_abbrev = splithometown[1].strip()

    # Look up the state, division, and region codes for the player's state.
    player_codes = codedict[stateabbreviations[state_abbrev]]
    playerdata_state = player_codes['State']
    playerdata_division = player_codes['Division']
    playerdata_region = player_codes['Region']

    # Add the census codes for the player's state to the row of data.
    player.append(playerdata_state)
    player.append(playerdata_division)
    player.append(playerdata_region)

    townname = town_name_overrides.get(townname, townname)

    # Within the player's state, collect the population of every place whose
    # name starts with the hometown; startswith is used because census names
    # carry extra words at the end of the name.
    matches = [
        place[population]
        for place in census_by_state.get(playerdata_state, [])
        if place[place_name].startswith(townname)
        and place[place_name] not in excluded_places
    ]

    # Always append exactly one population value so every row keeps the same
    # number of columns as the header: the first match in census file order,
    # or an empty string when nothing matched. Report anything other than a
    # single match so an override or exclusion can be added above.
    if len(matches) != 1:
        print("Warning: {} census matches for hometown {!r}".format(len(matches), player[hometown]))
    player.append(matches[0] if matches else "")
        
# Header names for the values appended to each player row, in the order they
# were appended.
playerheaders.extend([
    "State Code",
    "Division Code",
    "Region Code",
    "Hometown Population",
])

# newline='' leaves line endings to the csv writer (otherwise blank rows can
# appear between records on Windows); the with-block closes the file even if a
# write fails.
with open('interimfile.csv', 'w', newline='') as outfile:
    csv_out = csv.writer(outfile)
    csv_out.writerow(playerheaders)
    csv_out.writerows(playerinfo)

## Read in college data and merge it into the playerdata set to create a final dataset.

Note: this code is yet to come, since I have to do some editing of the process on the back end to make it possible. 